## Table 3

In [1]:
import re
import pandas as pd
import numpy as np

###############################################################################
# 0)  CONFIG
###############################################################################
# Everything a reader might tune lives here; both names are used further down
# in this cell.
S_FOCAL = 8                     # focal time s ("s = 8" in the paper); rows with r >= S_FOCAL form the focal group below
REDUCED_FILE = "proper_OR.csv"    # path to the proper-OR CSV file; its third column is parsed below

###############################################################################
# 1)  READ  &  PARSE  (r , z)
###############################################################################
# Load the raw file; the (r, z) pair is split out of its third column right
# after parse_pair is defined.
df = pd.read_csv(REDUCED_FILE)

# Cells of the third column are strings such as "(7, 12)".
def parse_pair(txt):
    """Turn a textual pair like "(7, 12)" into a Series with integer r and z.

    Every run of digits in ``txt`` is collected; exactly two runs must be
    present, otherwise the tuple unpacking raises ``ValueError``.
    """
    digit_runs = re.findall(r"\d+", txt)
    first, second = (int(run) for run in digit_runs)
    return pd.Series({"r": first, "z": second})

# Split the third column into two integer columns, r and z.
df[["r", "z"]] = df.iloc[:, 2].apply(parse_pair)

###############################################################################
# 2)  BUILD THE TABLE
###############################################################################
# Membership in the focal group does not depend on t_v, so compute it once:
# still in state 1 at s  (⇔  r_i ≥ S_FOCAL).
in_focal_group = df["r"] >= S_FOCAL

records = []
for t_v in sorted(df["z"].unique()):          # one row per distinct 2→3 time
    # Number of 2→3 events at t_v
    event_at_tv = df["z"] == t_v
    dN = event_at_tv.sum()

    # … among those in the focal group
    dN_s8 = (event_at_tv & in_focal_group).sum()

    # Risk set just before t_v:  r_i < t_v  &  z_i ≥ t_v
    at_risk = (df["r"] < t_v) & (df["z"] >= t_v)
    Y = at_risk.sum()

    # … and who are in the focal group
    Y_s8 = (at_risk & in_focal_group).sum()

    records.append((t_v, dN, dN_s8, Y, Y_s8))

# Column labels are plain text (no invisible zero-width characters) so that
# exact-name lookups work, and the threshold shown is taken from S_FOCAL
# instead of being typed in twice.
table3 = pd.DataFrame(
    records,
    columns=[
        "t_v",
        "∑ dN_i(t_v)",
        f"∑ I({S_FOCAL} ≤ r_i) dN_i(t_v)",
        "∑ Y_i(t_v)",
        f"∑ I({S_FOCAL} ≤ r_i) Y_i(t_v)",
    ],
)

###############################################################################
# 3)  SAVE  &  DISPLAY
###############################################################################
# The Table 6a cell reads "table3.csv", so persist the table here; otherwise a
# fresh Restart & Run All has nothing to load.
table3.to_csv("table3.csv", index=False)
print(table3.to_string(index=False))

 t_v  ∑ dN_i(t_v)  ∑ ​I(8 ≤ r_i) dN_i(t_v)  ∑ Y_i(t_v)  ∑ ​I(8 ≤ r_i) Y_i(t_v)
  12            2                        0          16                      10
  13            2                        0          17                      13
  15            3                        3          22                      20
  16            4                        3          20                      18
  17            2                        1          16                      15
  18            6                        6          14                      14
  19            1                        1           8                       8
  20            2                        2           7                       7
  21            2                        2           5                       5
  22            1                        1           3                       3
  23            2                        2           2                       2


## Table 5

In [2]:
import pandas as pd
from collections import defaultdict

# Load CSV files
improper_or_df = pd.read_csv("improper_OR.csv")
improper_mi_df = pd.read_csv("improper_MI.csv")

# Extract r from the "Reduced (r,23)" column (strings such as "(10, 23)").
# astype(int) raises if a row does not match the pattern, which surfaces
# malformed input instead of silently dropping it.
reduced_rows = improper_mi_df["Reduced (r,23)"].str.extract(r"\((\d+),\s*23\)").astype(int)
improper_mi_df["r"] = reduced_rows[0]

# Count multiplicities from improper_OR: rows of the form (L, L+1] x (23, ∞),
# i.e. W == 23 and R == L + 1.  The counts are keyed by R, which is the r of
# the reduced point (r, 23).  A boolean mask + value_counts replaces the
# row-by-row iterrows loop.
is_unit_improper = (improper_or_df["W"] == 23) & (
    improper_or_df["R"] == improper_or_df["L"] + 1
)
counts = improper_or_df.loc[is_unit_improper, "R"].value_counts()

# Filter MI rows with matching r and create table format
table5_df = improper_mi_df[improper_mi_df["r"].isin(counts.index)].copy()
table5_df["multiplicity"] = table5_df["r"].map(counts)
table5_df["Improper OR=Improper MI"] = table5_df["r"].apply(lambda r: f"({r-1},{r}) × (23,∞)")
table5_df["reduced"] = table5_df["r"].apply(lambda r: f"({r}, 23)")

# Final table with desired columns
table5 = table5_df[["Improper OR=Improper MI", "multiplicity", "reduced", "p"]].copy()

# Optional: add total row
total_multiplicity = table5["multiplicity"].sum()
table5.loc["total"] = ["total", total_multiplicity, "", ""]

# The Table 6a cell reads "table5.csv"; write it here so the notebook runs
# from a fresh kernel.  The total row has an empty "reduced" entry, which the
# Table 6a cell discards when it parses r.
table5.to_csv("table5.csv", index=False)

# Print the table
print(table5.to_string(index=False))

Improper OR=Improper MI  multiplicity  reduced       p
        (9,10) × (23,∞)             2 (10, 23)  0.0639
       (10,11) × (23,∞)             7 (11, 23)  0.1649
       (11,12) × (23,∞)             1 (12, 23)  0.0379
       (12,13) × (23,∞)             7 (13, 23)  0.1682
       (14,15) × (23,∞)             8 (15, 23)  0.1548
       (15,16) × (23,∞)             5 (16, 23)  0.0619
                  total            30                 


## Table 6

In [3]:
import pandas as pd

# --- Load ---
or_df = pd.read_csv("improper_OR.csv")      # columns: L, R, W, Z
mi_df = pd.read_csv("improper_MI.csv")      # columns: Improper MI, p, Reduced (r,23)

# --- Parse MI endpoints from strings like "(5,7] × (23, ∞)" ---
uv = mi_df["Improper MI"].str.extract(r"\((\d+)\s*,\s*(\d+)\]")
mi_df["u"] = uv[0].astype(int)
mi_df["v"] = uv[1].astype(int)
mi_keep = mi_df[["u", "v", "Improper MI", "Reduced (r,23)"]].copy()

# --- Candidate improper ORs: W==23 (improper); adjacent vs non-adjacent is split later ---
# The W == 23 subset is needed twice (candidates and multiplicities), so take it once.
improper_or = or_df[or_df["W"] == 23]
cand_or = improper_or[["L", "R"]].drop_duplicates()

# --- Containment: an OR (L, R) contains an MI (u, v] when L <= u and v <= R ---
cross = cand_or.merge(mi_keep, how="cross")
contained = cross[(cross["L"] <= cross["u"]) & (cross["v"] <= cross["R"])]

# Count # of improper MIs each OR contains
mi_counts = contained.groupby(["L", "R"]).size().reset_index(name="num_improper_MI")

# Keep only Type-1 ORs (exactly one improper MI)
type1_or = mi_counts[mi_counts["num_improper_MI"] == 1][["L", "R"]]

# --- Table 6 subset: NOT of the form (L, L+1) ---
table6_or = type1_or[type1_or["R"] != type1_or["L"] + 1]

# Attach the unique MI row for each selected OR
one_mi_detail = table6_or.merge(contained, on=["L", "R"], how="left")

# Multiplicity of each OR in raw data (how many times it appears with W=23)
mult = improper_or.groupby(["L", "R"]).size().reset_index(name="multiplicity")

# Build Table 6.  The label is assembled with vectorised string concatenation
# rather than a row-wise apply, so an empty selection still yields a
# (zero-row) column instead of breaking the assign.
final = (one_mi_detail.merge(mult, on=["L", "R"], how="left")
         .drop_duplicates(subset=["L", "R"])            # exactly one MI per OR
         .sort_values(by=["R", "L"])
         .assign(
             **{
                 "Improper OR":    lambda d: "(" + d["L"].astype(str) + "," + d["R"].astype(str) + ") × (23,∞)",
                 "reduced (r,23)": lambda d: d["Reduced (r,23)"],
             }
         )[["Improper OR", "Improper MI", "multiplicity", "reduced (r,23)"]]
         .reset_index(drop=True))

# Add total row — multiplicities should sum to 9
table6 = pd.concat(
    [final, pd.DataFrame([["total", "", final["multiplicity"].sum(), ""]], columns=final.columns)],
    ignore_index=True
)

# The Table 6a cell reads "table6.csv"; write it here so the notebook runs
# from a fresh kernel.  The total row has an empty "reduced (r,23)" entry,
# which the Table 6a cell discards when it parses r.
table6.to_csv("table6.csv", index=False)

print(table6.to_string(index=False))

     Improper OR       Improper MI  multiplicity reduced (r,23)
  (1,7) × (23,∞)   (5,7] × (23, ∞)             1         (7,23)
  (5,7) × (23,∞)   (5,7] × (23, ∞)             2         (7,23)
  (7,9) × (23,∞)   (8,9] × (23, ∞)             1         (9,23)
(12,14) × (23,∞) (12,13] × (23, ∞)             2        (13,23)
(13,15) × (23,∞) (14,15] × (23, ∞)             3        (15,23)
           total                               9               


## Table 6a

In [4]:
import pandas as pd
import re
from pathlib import Path

# -------------------------
# 1) Load inputs
# -------------------------
# The three CSVs are expected to hold Tables 3, 5 and 6 with the same column
# headers the earlier cells print; columns are located by substring below, so
# only the header wording matters, not the exact spelling.
# Note: this rebinds the names table3 / table5 / table6 that earlier cells
# also define, replacing them with the versions read back from disk.
table3 = pd.read_csv("table3.csv")
table5 = pd.read_csv("table5.csv")
table6 = pd.read_csv("table6.csv")

# -------------------------
# 2) Helpers
# -------------------------
def find_col(df, substrings):
    """Locate a column by case-insensitive substring matching.

    Columns are scanned in order and the first one whose lower-cased name
    contains every entry of ``substrings`` (also lower-cased) is returned.
    Raises ``KeyError`` when no column qualifies.
    """
    needles = [piece.lower() for piece in substrings]
    not_found = object()

    def has_every_needle(label):
        lowered = label.lower()
        return all(needle in lowered for needle in needles)

    hit = next((label for label in df.columns if has_every_needle(label)), not_found)
    if hit is not_found:
        raise KeyError(f"Column with substrings {substrings} not found in {list(df.columns)}")
    return hit

def extract_r(val):
    """Return the integer r found in a '(r,23)' pattern, or None.

    Whitespace around the comma is tolerated.  Non-string inputs (for example
    the NaN of an empty CSV cell) and strings without the pattern give None.
    """
    if isinstance(val, str):
        found = re.search(r"\((\d+)\s*,\s*23\)", val)
        if found:
            return int(found.group(1))
    return None

# -------------------------
# 3) Parse reduced points & multiplicities from Tables 5 & 6
# -------------------------
# For each table: find the multiplicity and reduced-point columns, parse r
# from the reduced point, and discard rows where no r could be parsed.

col_mult_5 = find_col(table5, ["multiplicity"])
col_red_5  = find_col(table5, ["reduced"])            # e.g., "reduced"
r_mult_5 = pd.DataFrame(
    {
        "r": table5[col_red_5].apply(extract_r),
        "multiplicity": table5[col_mult_5],
    }
).dropna(subset=["r"])

col_mult_6 = find_col(table6, ["multiplicity"])
col_red_6  = find_col(table6, ["reduced"])            # e.g., "reduced (r,23)"
r_mult_6 = pd.DataFrame(
    {
        "r": table6[col_red_6].apply(extract_r),
        "multiplicity": table6[col_mult_6],
    }
).dropna(subset=["r"])

# Stack both sources and total the multiplicity for every distinct r,
# ordered by increasing r.
stacked_5_6 = pd.concat([r_mult_5, r_mult_6], ignore_index=True)
r_mult = (
    stacked_5_6.groupby("r", as_index=False)["multiplicity"].sum()
               .sort_values("r")
               .reset_index(drop=True)
)

# Running totals over increasing r, used by cum_less_than to answer
# "how much multiplicity lies at r < t_v": once over all r, and once
# counting only the rows with r >= 8.
r_mult = r_mult.assign(
    cum_all=lambda d: d["multiplicity"].cumsum(),
    ge8=lambda d: (d["r"] >= 8).astype(int) * d["multiplicity"],
    cum_ge8=lambda d: d["ge8"].cumsum(),
)

def cum_less_than(tv, colname):
    """Running total stored in ``colname`` at the last row of r_mult with r < tv.

    Reads the module-level ``r_mult`` frame.  Returns 0 when no row has
    r < tv; otherwise the value is returned as a plain int.
    """
    below = r_mult[r_mult["r"] < tv]
    if below.empty:
        return 0
    return int(below[colname].iloc[-1])

# -------------------------
# 4) Read needed columns from Table 3 and build Table 6a
# -------------------------
col_tv      = find_col(table3, ["t_v"])
col_dN      = find_col(table3, ["∑", "dn"])              # "∑ dN_i(t_v)"
col_I8dN    = find_col(table3, ["∑", "i(8", "dn"])       # "∑ I(8 ≤ r_i) dN_i(t_v)"
col_Y       = find_col(table3, ["∑", "y_i"])             # "∑ Y_i(t_v)"
col_I8Y     = find_col(table3, ["∑", "i(8", "y_i"])      # "∑ I(8 ≤ r_i) Y_i(t_v)"

# find_col returns the FIRST matching column, and the general patterns above
# ("t_v", "∑"+"dn", "∑"+"y_i") also match the indicator columns.  The lookups
# are therefore only right when the plain columns come first; if the CSV is
# ordered differently two lookups collapse onto the same column.  Fail loudly
# in that case instead of producing a silently wrong table.
resolved_cols = [col_tv, col_dN, col_I8dN, col_Y, col_I8Y]
if len(set(resolved_cols)) != len(resolved_cols):
    raise ValueError(
        f"Ambiguous Table 3 column lookup: {resolved_cols} (from {list(table3.columns)})"
    )

rows = []
for _, row in table3.iterrows():
    tv    = int(row[col_tv])
    dN    = int(row[col_dN])
    I8dN  = int(row[col_I8dN])
    Y0    = int(row[col_Y])
    I8Y0  = int(row[col_I8Y])

    # Extra at-risk counts contributed by the reduced points (r, 23) of
    # Tables 5 & 6 that lie before this time.
    add_Y   = cum_less_than(tv, "cum_all")   # all r<tv
    add_I8Y = cum_less_than(tv, "cum_ge8")   # only r>=8, r<tv

    # The two Y columns are stored as "a + b = total" strings; the test
    # statistic cell recovers the total as the last number in the string.
    rows.append({
        "t_v": tv,
        "∑ dN_i(t_v)": dN,
        "∑ I(8 ≤ r_i) dN_i(t_v)": I8dN,
        "∑ Y_i(t_v)": f"{Y0} + {add_Y} = {Y0 + add_Y}",
        "∑ I(8 ≤ r_i) Y_i(t_v)": f"{I8Y0} + {add_I8Y} = {I8Y0 + add_I8Y}",
    })

table6a = pd.DataFrame(rows)

# -------------------------
# 5) Save result
# -------------------------
table6a.to_csv("table6a.csv", index=False)
print(table6a.to_string(index=False))

 t_v  ∑ dN_i(t_v)  ∑ I(8 ≤ r_i) dN_i(t_v)   ∑ Y_i(t_v) ∑ I(8 ≤ r_i) Y_i(t_v)
  12            2                       0 16 + 13 = 29          10 + 10 = 20
  13            2                       0 17 + 14 = 31          13 + 11 = 24
  15            3                       3 22 + 23 = 45          20 + 20 = 40
  16            4                       3 20 + 34 = 54          18 + 31 = 49
  17            2                       1 16 + 39 = 55          15 + 36 = 51
  18            6                       6 14 + 39 = 53          14 + 36 = 50
  19            1                       1  8 + 39 = 47           8 + 36 = 44
  20            2                       2  7 + 39 = 46           7 + 36 = 43
  21            2                       2  5 + 39 = 44           5 + 36 = 41
  22            1                       1  3 + 39 = 42           3 + 36 = 39
  23            2                       2  2 + 39 = 41           2 + 36 = 38


## Test Statistic

In [11]:
import pandas as pd
import re

# --- Load & normalize columns ---
# Map the symbolic Table 6a headers to short identifier-style names.
short_names = {
    't_v': 't_v',
    '∑ dN_i(t_v)': 'sum_dN',
    '∑ I(8 ≤ r_i) dN_i(t_v)': 'sum_I_dN',
    '∑ Y_i(t_v)': 'sum_Y',
    '∑ I(8 ≤ r_i) Y_i(t_v)': 'sum_I_Y'
}
df = pd.read_csv("table6a.csv")
df = df.rename(columns=short_names)

def to_number(x):
    """Extract the numeric total from entries like '16 + 13 = 29'.

    Parameters
    ----------
    x : scalar
        A number, a string containing one or more numbers, or a missing value.

    Returns
    -------
    int or float
        ``x`` itself when it is already an int/float; otherwise the LAST
        number found in ``str(x)`` as a float (for "a + b = total" that is
        the total).  Missing input, or text without any number, gives float
        NaN.  NaN is used rather than ``pd.NA`` because the caller converts
        the column with ``.astype(float)``, which raises on ``pd.NA``.
    """
    if pd.isna(x):
        return float("nan")
    if isinstance(x, (int, float)):
        return x
    nums = re.findall(r'(-?\d+(?:\.\d+)?)', str(x))
    return float(nums[-1]) if nums else float("nan")

# Reduce every count column to a plain float; the Y columns arrive as
# "a + b = total" strings and to_number keeps only the total.
for c in ['sum_dN', 'sum_I_dN', 'sum_Y', 'sum_I_Y']:
    df[c] = df[c].apply(to_number).astype(float)

# --- Compute U(t_v) ---
# U(t_v) = sum_I_dN - sum_I_Y * (sum_dN / sum_Y)
df['U_t'] = df['sum_I_dN'] - df['sum_I_Y'] * (df['sum_dN'] / df['sum_Y'])

# The Table 6b cell reads "table6a_stats.csv" and expects the t_v, sum_Y and
# sum_I_Y columns built here; persist the frame so the notebook runs from a
# fresh kernel.
df.to_csv("table6a_stats.csv", index=False)

print(df)

    t_v  sum_dN  sum_I_dN  sum_Y  sum_I_Y       U_t
0    12     2.0       0.0   29.0     20.0 -1.379310
1    13     2.0       0.0   31.0     24.0 -1.548387
2    15     3.0       3.0   45.0     40.0  0.333333
3    16     4.0       3.0   54.0     49.0 -0.629630
4    17     2.0       1.0   55.0     51.0 -0.854545
5    18     6.0       6.0   53.0     50.0  0.339623
6    19     1.0       1.0   47.0     44.0  0.063830
7    20     2.0       2.0   46.0     43.0  0.130435
8    21     2.0       2.0   44.0     41.0  0.136364
9    22     1.0       1.0   42.0     39.0  0.071429
10   23     2.0       2.0   41.0     38.0  0.146341


In [12]:
# --- Compute total U ---
# U is the sum of the per-time contributions in the 'U_t' column of df.
# The name U is reused later when the statistic is standardized, so this cell
# must run before that one.
U = df['U_t'].sum()
# Printed rounded to 4 decimals; U itself keeps full precision.
print(f"Test Statistics: {round(U, 4)}")

Test Statistics: -3.1905


## Table 6b

In [15]:
import pandas as pd
import numpy as np

# === Load the input table (Table 6a stats) ===
# Needs the columns t_v, sum_Y and sum_I_Y.
df = pd.read_csv("table6a_stats.csv")

# Pull the three columns out as integer Series
tv = df["t_v"].astype(int)         # time values
nV = df["sum_Y"].astype(int)       # n_V = sum of Y_i(t_v)
nv1 = df["sum_I_Y"].astype(int)    # n_{v1} = sum of I(8 <= r_i) Y_i(t_v)

# Per-time term: n_{v1}(n_V - n_{v1}) / n_V^2
value = (nv1 * (nV - nv1)).div(nV.pow(2))

# Assemble Table 6b column by column (term rounded to 6 decimals)
table6b = pd.concat(
    [
        tv.rename("t_v"),
        nV.rename("n_V"),
        nv1.rename("n_{v1}"),
        value.round(6).rename("n_{v1}(n_V - n_{v1})/n_V^2"),
    ],
    axis=1,
)

# Save to CSV
table6b.to_csv("table6b.csv", index=False)

print("Table 6b saved to table6b.csv")
print(table6b)

Table 6b saved to table6b.csv
    t_v  n_V  n_{v1}  n_{v1}(n_V - n_{v1})/n_V^2
0    12   29      20                    0.214031
1    13   31      24                    0.174818
2    15   45      40                    0.098765
3    16   54      49                    0.084019
4    17   55      51                    0.067438
5    18   53      50                    0.053400
6    19   47      44                    0.059756
7    20   46      43                    0.060964
8    21   44      41                    0.063533
9    22   42      39                    0.066327
10   23   41      38                    0.067817


In [16]:
# --- Compute standardized test statistic ---
# Uses df (must contain sum_Y and sum_I_Y) and the total U computed earlier.
# Per-time variance term n_{v1}(n_V - n_{v1}) / n_V^2, set to 0 wherever
# sum_Y is not positive.
# NOTE(review): this term has no factor for the number of events at t_v
# (sum_dN), although several t_v carry more than one event — confirm against
# the variance formula in the paper.
n_all = df["sum_Y"]
n_focal = df["sum_I_Y"]
df["var_term"] = ((n_focal * (n_all - n_focal)) / (n_all ** 2)).where(n_all > 0, 0)
V_U = df["var_term"].sum()

# Standardize only when the variance is strictly positive.
if V_U > 0:
    U_std = U / np.sqrt(V_U)
else:
    U_std = np.nan

print(f"Standardized Test Statistics: {round(U_std, 4)}")

Standardized Test Statistics: -3.1733
