In [2]:
import os
import sqlite3
import pandas as pd
import numpy as np
import math
from typing import Dict, List, Union, Any, Optional

In [3]:
# ===== CONFIG =====
# Location of the MAGI SQLite database (opened read-only by the main loop).
MAGI_DB_PATH = "/projects/klybarge/pcori_ad/magi/magi_db/magi.db"

# Every CSV artefact (subgraphs, coefficients) is written below this folder.
OUT_DIR = "./Test"
os.makedirs(OUT_DIR, exist_ok=True)

# Edge table name. NOTE(review): the fetch helpers in this notebook spell the
# same table name directly inside their SQL and do not read this constant.
EDGE_TABLE = "magi_counts_top500"

# Outcome codes to process, and the cap on predictors selected per outcome.
TARGETS = ["aa_meas_citalopram_rem"]
TOP_K = 500

# Total-effect thresholds: HI marks the "risk" side, LO (= 1/HI) the "protective" side.
HI = 1.5
LO = 1.0 / HI

In [None]:
############### MAGI FUNCTION ###############
def analyze_causal_sequence_py(
    data: Union[str, pd.DataFrame],
    name_map: Optional[Dict[str, str]] = None,
    events: Optional[List[str]] = None,
    force_outcome=None,
) -> Dict[str, Any]:
    """
    Order a set of events, derive each antecedent's effect on the final
    (outcome) event, and expose the result as a logistic model.

    Pipeline (numbers match the section comments in the body):
      0. Load `data` (CSV path or DataFrame; a DataFrame is copied, not mutated).
      1. Optionally recode both code columns through `name_map`.
      2. Validate/coerce the count columns and keep only rows whose two codes
         are both in `events`.
      3. Score every event from the before/after counts and sort descending.
      4. Put the outcome last: `force_outcome` when it is one of the scored
         events, otherwise the top-scoring event.
      5. For each antecedent k compute T_k (adjusted odds ratio against the
         outcome) and λ_{k,j} for every j placed between k and the outcome.
      6. Walk the antecedents from last to first computing
         D_k = (T_k - Σ_j λ_{k,j}·D_j) / (1 - Σ_j λ_{k,j}); when that ratio is
         not finite D_k falls back to T_k.
      7. β_k = log(D_k), replaced by 0.0 when D_k is missing or not positive;
         β_0 is the logit of n_target / (n_target + n_no_target) read from the
         first row whose target is the outcome (0.5 is used for the
         probability when that total is missing or not positive).
      8. Build `predict_proba`, a closure applying the logistic link.

    Parameters
    ----------
    data : str or DataFrame
        Edge table. Required columns: target_concept_code, concept_code,
        n_code_target, n_code_no_target, n_target, n_no_target,
        n_target_before_code, n_code_before_target. The columns
        `total_effect`, `n_target_no_code` and `no_code_no_target` are used
        when present.
        NOTE(review): the subgraph CSV printed later in this notebook carries
        `total_effects` and `n_no_code_no_target`; those spellings are not read
        here - confirm whether that is intended.
    name_map : dict, optional
        Mapping applied with `Series.replace` to both code columns. Falsy
        values (None, {}) skip recoding.
    events : list of str, optional
        Events to analyse. When None, the events that occur both as a target
        and as a code are used (or, if there are none, every event seen).
    force_outcome : optional
        Event to place last in the order, if it is among the scored events.

    Returns
    -------
    dict with keys: sorted_scores, temporal_order, order_used, T_val, D_val,
    coef_df, lambda_l, trace_df, invalid_predictors, beta_0, beta,
    logit_predictors, predict_proba.

    Raises
    ------
    ValueError
        On missing required columns, fewer than two events, or when no row has
        the outcome as its target.
    """

    # -- 0) Ingest
    if isinstance(data, str):
        df = pd.read_csv(data)
    else:
        df = data.copy()

    # -- 1) Recode event names
    for col in ["target_concept_code", "concept_code"]:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")
    if name_map:
        df["target_concept_code"] = df["target_concept_code"].replace(name_map)
        df["concept_code"]        = df["concept_code"].replace(name_map)

    # -- 2) Filter & coerce numerics
    need = [
        "n_code_target", "n_code_no_target", "n_target", "n_no_target",
        "n_target_before_code", "n_code_before_target",
    ]
    missing = [c for c in need if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {', '.join(missing)}")

    has_total = "total_effect" in df.columns
    if has_total:
        df["total_effect"] = pd.to_numeric(df["total_effect"], errors="coerce")

    # -- Auto-detect events if not provided
    if events is None:
        ev_targets  = df["target_concept_code"].dropna().astype(str).unique().tolist()
        ev_children = df["concept_code"].dropna().astype(str).unique().tolist()
        events = sorted(set(ev_targets).intersection(ev_children))
        if len(events) == 0:
            events = sorted(set(ev_targets) | set(ev_children))
    if len(events) < 2:
        raise ValueError("Need at least two events after auto-detection.")

    df = df[df["target_concept_code"].isin(events) & df["concept_code"].isin(events)].copy()
    for c in need:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # Derive the "neither" cell when the table does not ship it.
    if "no_code_no_target" not in df.columns:
        df["no_code_no_target"] = (df["n_no_target"] - df["n_code_no_target"]).clip(lower=0)
    else:
        df["no_code_no_target"] = pd.to_numeric(df["no_code_no_target"], errors="coerce").clip(lower=0)

    # Helper: total count for a parent (largest n_target over rows targeting `ev`)
    def C_of(ev: str) -> float:
        sub = df[df["target_concept_code"] == ev]
        if sub.empty:
            return float("nan")
        C = pd.to_numeric(sub["n_target"], errors="coerce").max()
        return float(C) if pd.notna(C) and np.isfinite(C) else float("nan")

    # ---- SAFE log for trace (treat invalid/≤0 as log(1)=0) ----
    def safe_log(x: float) -> float:
        try:
            xv = float(x)
        except (TypeError, ValueError):
            return 0.0
        if not np.isfinite(xv) or xv <= 0.0:
            return 0.0
        return math.log(xv)

    # -- 3) Temporal order from before/after counts
    scores = {}
    for zk in events:
        s = 0.0
        for zj in [x for x in events if x != zk]:
            rowr = df[(df["target_concept_code"] == zj) & (df["concept_code"] == zk)]
            if not rowr.empty:
                s += float(rowr["n_code_before_target"].sum(skipna=True) -
                           rowr["n_target_before_code"].sum(skipna=True))
        scores[zk] = s

    sorted_scores = pd.Series(scores).sort_values(ascending=False)
    # --- enforce outcome if requested; otherwise the top-scoring event is the outcome ---
    if force_outcome and (force_outcome in sorted_scores.index):
        outcome_event = force_outcome
    else:
        outcome_event = sorted_scores.index[0]
    temporal_order = [ev for ev in sorted_scores.index if ev != outcome_event] + [outcome_event]

    # -- 4) Propagation order
    events_order = temporal_order
    outcome = events_order[-1]
    antecedents = events_order[:-1]

    # -- 5) T-values and λ’s
    T_val = pd.Series(0.0, index=antecedents, dtype=float)
    D_val = pd.Series(np.nan, index=antecedents, dtype=float)
    lambda_l: Dict[str, pd.Series] = {}

    for k in antecedents:
        # Row(s) with target = k and code = outcome; the four cells below are
        # named after the count columns they are read from.
        row_ko = df[(df["target_concept_code"] == k) & (df["concept_code"] == outcome)]
        a = float(row_ko["n_code_target"].sum(skipna=True))            # code & target
        b = float(row_ko["n_code_no_target"].sum(skipna=True))         # code & no target

        if "n_target_no_code" in row_ko.columns:
            c = float(row_ko["n_target_no_code"].sum(skipna=True))     # target & no code
        else:
            c = float(pd.to_numeric(row_ko["n_target"], errors="coerce").max() - a)

        if "no_code_no_target" in row_ko.columns:
            d = float(row_ko["no_code_no_target"].sum(skipna=True))    # no code & no target
        else:
            d = float(pd.to_numeric(row_ko["n_no_target"], errors="coerce").max() - b)

        N1, N0 = a + b, c + d

        # adjusted odds (add-1 when a/b/c/d is 0)
        if a == 0:
            odds_pos_adj = 1.0 / (N1 + 1.0)
        elif b == 0:
            odds_pos_adj = (N1 + 1.0) / 1.0
        else:
            odds_pos_adj = a / b

        if c == 0:
            odds_neg_adj = 1.0 / (N0 + 1.0)
        elif d == 0:
            odds_neg_adj = (N0 + 1.0) / 1.0
        else:
            odds_neg_adj = c / d

        T_val.loc[k] = float(odds_pos_adj / odds_neg_adj)

        # λ_{k,j} = L(X_j | X_k) with j after k and before outcome
        pos_k = events_order.index(k)
        js = events_order[pos_k + 1 : -1] if pos_k < len(events_order) - 1 else []

        lam_vals = []
        for j in js:
            row_kj = df[(df["target_concept_code"] == k) & (df["concept_code"] == j)]
            if row_kj.empty:
                lam_vals.append((j, 0.0)); continue

            # Prefer precomputed total_effect for lambda if available
            te = float(pd.to_numeric(row_kj["total_effect"], errors="coerce").max()) if has_total else float("nan")
            if np.isfinite(te):
                lam_vals.append((j, te))
                continue

            # else compute via piecewise rule
            C11 = float(row_kj["n_code_target"].sum(skipna=True))      # C(j∩k)
            if "n_code_no_target" in row_kj.columns:                   # C(j∩¬k)
                Cj_not_k = float(row_kj["n_code_no_target"].sum(skipna=True))
            else:
                Cj = C_of(j)
                Cj_not_k = 0.0 if (not np.isfinite(Cj)) else max(Cj - C11, 0.0)
            Ck = C_of(k)

            if Cj_not_k == 0:
                L = 1.0 + C11
            elif C11 == 0:
                L = 1.0 / (1.0 + Cj_not_k)
            elif np.isfinite(Ck) and Ck > 0:
                L = C11 / Ck
            else:
                L = 0.0

            lam_vals.append((j, float(L)))

        lambda_l[k] = pd.Series({j: v for j, v in lam_vals}, dtype=float)

    # -- 6) Excel-style recursion trace
    trace_rows = []
    last_anc = antecedents[-1] if antecedents else None
    if last_anc is not None:
        # The antecedent adjacent to the outcome has nothing between it and the
        # outcome, so its direct effect is simply its T value.
        D_val.loc[last_anc] = T_val.loc[last_anc]
        trace_rows.append({
            "stage": "Last 2 Nodes",
            "nodes": f"{last_anc} - {outcome}",
            "k": last_anc,
            "T_kY": T_val.loc[last_anc],
            "lambda_terms": None,
            "sum_lambda": 0.0,
            "D_kY": D_val.loc[last_anc],
            "log_D": safe_log(D_val.loc[last_anc]),
        })

    if len(antecedents) > 1:
        for k in list(reversed(antecedents))[1:]:
            lam = lambda_l.get(k, pd.Series(dtype=float))
            children = list(lam.index)
            num = T_val.loc[k] - float(np.nansum(lam.reindex(children).values * D_val.reindex(children).values))
            den = 1.0 - float(np.nansum(lam.values))
            # A zero denominator has no finite ratio; skip the division so no
            # runtime warning is raised and use the T_k fallback directly.
            ratio = (num / den) if den != 0.0 else float("nan")
            D_val.loc[k] = ratio if np.isfinite(ratio) else T_val.loc[k]

            # Number of nodes from k through the outcome, inclusive. This is
            # consistent with the "Last 2 Nodes" row emitted for `last_anc`.
            span = len(events_order) - events_order.index(k)
            lam_str = ", ".join(
                f"λ_{events_order.index(k)+1}{events_order.index(ch)+1}={lam[ch]:.6f}"
                for ch in children
            ) if len(lam) else None

            trace_rows.append({
                "stage": f"Last {span} Nodes",
                "nodes": " - ".join([k] + events_order[events_order.index(k)+1:]),
                "k": k,
                "T_kY": T_val.loc[k],
                "lambda_terms": lam_str,
                "sum_lambda": float(np.nansum(lam.values)),
                "D_kY": D_val.loc[k],
                "log_D": safe_log(D_val.loc[k]),
            })

    trace_df = pd.DataFrame(trace_rows)

    # -- 7) Coefficients
    resp_rows = df[df["target_concept_code"] == outcome]
    if resp_rows.empty:
        raise ValueError(f"No rows for outcome '{outcome}'.")

    n_t = resp_rows["n_target"].dropna().iloc[0] if resp_rows["n_target"].dropna().size else np.nan
    n_n = resp_rows["n_no_target"].dropna().iloc[0] if resp_rows["n_no_target"].dropna().size else np.nan
    denom = n_t + n_n
    p_y = 0.5 if (not np.isfinite(denom) or denom <= 0) else (n_t / denom)
    beta_0 = float(np.log(p_y / (1 - p_y)))

    D_clean = pd.to_numeric(D_val, errors="coerce").astype(float)
    # only keep positive values for log; others become NaN
    D_pos = D_clean.where(D_clean > 0)

    with np.errstate(divide="ignore", invalid="ignore"):
        beta_vals = np.log(D_pos.to_numpy())  # no warnings thrown
    beta_k_raw = pd.Series(beta_vals, index=D_val.index)
    # Predictors whose D could not be logged are reported and zeroed out.
    invalid_predictors = list(beta_k_raw[~np.isfinite(beta_k_raw)].index)
    beta_k = beta_k_raw.copy()
    beta_k[~np.isfinite(beta_k)] = 0.0

    coef_df = pd.DataFrame({
        "predictor": list(beta_k.index) + ["(intercept)"],
        "beta": list(beta_k.astype(float).values) + [beta_0],
    })


    # -- 8) Logistic link: P(Y=1|Z) = 1 / (1 + exp(-(β0 + Σ β_i Z_i)))
    predictors = list(beta_k.index)
    beta_vec = beta_k.astype(float).values

    def predict_proba(z: Union[Dict[str, Any], pd.Series, np.ndarray, List[float], pd.DataFrame]) -> Union[float, np.ndarray, pd.Series]:
        """
        Compute probability using the logistic link.

        Accepts:
          - dict/Series mapping predictor name -> 0/1
          - 1D/2D numpy/list with columns ordered as `predictors`
          - DataFrame with columns containing any/all of `predictors` (others ignored)

        Returns:
          - float for 1D inputs; np.ndarray or pd.Series for vectorized inputs
        """
        if isinstance(z, pd.DataFrame):
            Z = z.reindex(columns=predictors, fill_value=0).astype(float).to_numpy()
            eta = beta_0 + Z @ beta_vec
            # stable sigmoid
            return 1.0 / (1.0 + np.exp(-np.clip(eta, -700, 700)))

        if isinstance(z, (dict, pd.Series)):
            v = np.array([float(z.get(p, 0.0)) for p in predictors], dtype=float)
            eta = beta_0 + float(v @ beta_vec)
            return float(1.0 / (1.0 + np.exp(-np.clip(eta, -700, 700))))

        arr = np.asarray(z, dtype=float)
        if arr.ndim == 1:
            if arr.size != len(predictors):
                raise ValueError(f"Expected {len(predictors)} features in order: {predictors}")
            eta = beta_0 + float(arr @ beta_vec)
            return float(1.0 / (1.0 + np.exp(-np.clip(eta, -700, 700))))
        else:
            if arr.shape[1] != len(predictors):
                raise ValueError(f"Expected shape (*, {len(predictors)}), got {arr.shape}")
            eta = beta_0 + arr @ beta_vec
            return 1.0 / (1.0 + np.exp(-np.clip(eta, -700, 700)))

    return {
        "sorted_scores": sorted_scores,
        "temporal_order": temporal_order,
        "order_used": events_order,
        "T_val": T_val,
        "D_val": D_val,
        "coef_df": coef_df,
        "lambda_l": lambda_l,
        "trace_df": trace_df,
        "invalid_predictors": invalid_predictors,
        # Logistic link outputs:
        "beta_0": beta_0,
        "beta": pd.Series(beta_vec, index=predictors, dtype=float),
        "logit_predictors": predictors,
        "predict_proba": predict_proba,
    }

def _ensure_derived_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Fill no_code_no_target & total_effect in RAM if missing."""
    df = df.copy()
    if "no_code_no_target" not in df.columns:
        df["n_no_target"] = pd.to_numeric(df["n_no_target"], errors="coerce")
        df["n_code_no_target"] = pd.to_numeric(df["n_code_no_target"], errors="coerce")
        df["no_code_no_target"] = (df["n_no_target"] - df["n_code_no_target"]).clip(lower=0)

    if "total_effect" not in df.columns:
        for c in ["n_code_target","n_code_no_target","n_target","n_no_target"]:
            df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)
        a = df["n_code_target"].astype(float)
        b = df["n_code_no_target"].astype(float)
        c_ = (df["n_target"].astype(float) - a).clip(lower=0)
        d = (df["n_no_target"].astype(float) - b).clip(lower=0)
        N1, N0 = (a + b), (c_ + d)
        odds_pos = np.where((a > 0) & (b > 0), a / b,
                            np.where(b == 0, N1 + 1.0, 1.0 / (N1 + 1.0)))
        odds_neg = np.where((c_ > 0) & (d > 0), c_ / d,
                            np.where(d == 0, N0 + 1.0, 1.0 / (N0 + 1.0)))
        df["total_effect"] = odds_pos / odds_neg
    return df

def _select_k_balanced_no_fallback(k_to_T: pd.DataFrame, top_k: int = 500,
                                   hi: float = 1.5) -> pd.DataFrame:
    """
    Balanced selection without fallback:
      - Take up to top_k//2 strongest risk (TE >= hi)
      - Take up to top_k - top_k//2 strongest protective (TE <= 1/hi)
      - If either side is short, do NOT fill from elsewhere; total may be < top_k
      - Return one row per k (target_concept_code), ranked by extremeness
    """
    df = k_to_T.copy()

    # Ensure total_effect numeric
    df["total_effect"] = pd.to_numeric(df["total_effect"], errors="coerce")
    df = df.dropna(subset=["total_effect"]).copy()
    if df.empty:
        print("[SELECT] no rows with total_effect; returning empty selection.")
        return df

    # Extremeness symmetrical around 1
    df["effect_strength"] = np.where(df["total_effect"] >= 1.0,
                                     df["total_effect"],
                                     1.0 / df["total_effect"])

    # One row per k (keep most extreme)
    best_per_k = (df.sort_values("effect_strength", ascending=False)
                    .drop_duplicates(subset=["target_concept_code"], keep="first"))

    HI = hi
    LO = 1.0 / hi

    risk_pool = best_per_k[best_per_k["total_effect"] >= HI].copy()
    prot_pool = best_per_k[best_per_k["total_effect"] <= LO].copy()

    # Target split
    want_risk = top_k // 2
    want_prot = top_k - want_risk

    # Take strongest from each side, capped by availability
    take_risk = min(len(risk_pool), want_risk)
    take_prot = min(len(prot_pool), want_prot)

    sel_risk = risk_pool.nlargest(take_risk, "effect_strength")
    sel_prot = prot_pool.nlargest(take_prot, "effect_strength")

    selected = pd.concat([sel_risk, sel_prot], ignore_index=True)

    print(f"[SELECT] total unique k={len(best_per_k):,}  "
          f"risk={len(risk_pool):,}  prot={len(prot_pool):,}  "
          f"selected(total)={selected['target_concept_code'].nunique()}  "
          f"with: risk={take_risk}, prot={take_prot}")

    return selected.reset_index(drop=True)

def _fetch_k_to_T(conn, target_code: str) -> pd.DataFrame:
    """Fetch only k→T rows (concept_code == target)."""
    q = """
      SELECT m.*,
             tcn.concept_code AS target_concept_code,
             ccn.concept_code AS concept_code
      FROM magi_counts_top500 m
      JOIN concept_names tcn ON m.target_concept_code_int = tcn.concept_code_int
      JOIN concept_names ccn ON m.concept_code_int        = ccn.concept_code_int
      WHERE ccn.concept_code = ?
    """
    return pd.read_sql_query(q, conn, params=[target_code])

def _fetch_subgraph_by_targets(conn, events_list):
    """Fetch edges with target_concept_code IN events_set (single IN to avoid 999 param issues)."""
    ph = ",".join(["?"] * len(events_list))
    q = f"""
      SELECT m.*,
             tcn.concept_code AS target_concept_code,
             ccn.concept_code AS concept_code
      FROM magi_counts_top500 m
      JOIN concept_names tcn ON m.target_concept_code_int = tcn.concept_code_int
      JOIN concept_names ccn ON m.concept_code_int        = ccn.concept_code_int
      WHERE tcn.concept_code IN ({ph})
    """
    return pd.read_sql_query(q, conn, params=list(events_list))
# ===== Main loop over targets =====
# For every configured target: fetch its k→T edges, pick a balanced set of
# predictors, pull the subgraph among them, run the MAGI analysis with the
# target pinned as outcome, and write the subgraph and coefficients to OUT_DIR.
uri = f"file:{MAGI_DB_PATH}?mode=ro"
# Using a sqlite3 connection as a context manager only scopes a transaction and
# leaves the connection open afterwards, so it is closed explicitly in `finally`.
conn = sqlite3.connect(uri, uri=True)
try:
    for T in TARGETS:
        print("\n" + "="*100)
        print(f"[RUN] Target = {T}")

        # 1) k→T only
        k_to_T = _fetch_k_to_T(conn, T)
        if k_to_T.empty:
            print(f"[WARN] No k→T rows for {T}; skipping.")
            continue

        # 2) derive totals/effects
        k_to_T = _ensure_derived_cols(k_to_T)

        # 3) select k (balanced, no fallback) using the configured HI threshold
        sel_rows = _select_k_balanced_no_fallback(k_to_T, top_k=TOP_K, hi=HI)
        selected_k = set(sel_rows["target_concept_code"].astype(str))
        if len(selected_k) == 0:
            print(f"[WARN] No predictors selected for {T}; skipping.")
            continue

        # 4) build subgraph events and fetch edges
        events_set = selected_k | {T}
        df_trim = _fetch_subgraph_by_targets(conn, sorted(events_set))
        # Keep only edges fully inside the set (both ends)
        df_trim = df_trim[
            df_trim["target_concept_code"].isin(events_set) &
            df_trim["concept_code"].isin(events_set)
        ].copy()

        print(f"[TRIM] rows={len(df_trim):,}  events={len(events_set)}  "
              f"k→T rows={int((df_trim['concept_code']==T).sum())}  "
              f"T→j rows={int((df_trim['target_concept_code']==T).sum())}")

        # 5) save subgraph for audit
        sub_csv = os.path.join(OUT_DIR, f"magi_subgraph_{T}.csv")
        df_trim.to_csv(sub_csv, index=False)
        print(f"[SAVED] Subgraph → {sub_csv}")

        # 6) run MAGI with the outcome pinned to T. No `except TypeError`
        #    fallback: analyze_causal_sequence_py accepts force_outcome, and a
        #    blanket TypeError handler would hide genuine errors raised inside it.
        res = analyze_causal_sequence_py(df_trim, events=None, name_map=None, force_outcome=T)
        outcome_used = res["order_used"][-1]
        if outcome_used != T:
            # force_outcome is only honoured when T is among the detected events.
            print(f"[NOTE] outcome auto-inferred as {outcome_used}, not {T}")

        # 7) save coefficients
        coef_df = res["coef_df"]
        coef_csv = os.path.join(OUT_DIR, f"magi_coef_{outcome_used}.csv")
        coef_df.to_csv(coef_csv, index=False)
        # NOTE(review): 'used_total_effect' is not among the keys returned by
        # analyze_causal_sequence_py in this notebook, so this prints False.
        print(f"[SAVED] Coefficients → {coef_csv}  | antecedents={len(res['order_used'])-1}  "
              f"used_total_effect={res.get('used_total_effect', False)}")
finally:
    conn.close()

In [14]:
# Reload the audited subgraph that the main loop wrote for the citalopram target.
df = pd.read_csv(os.path.join(OUT_DIR, "magi_subgraph_aa_meas_citalopram_rem.csv"))

In [23]:
# Quick look at the size and column layout of the loaded subgraph.
print(f"df shape: {df.shape}")
print(f"df columns: {list(df.columns)}")

df shape: (4687, 19)
df columns: ['target_concept_code_int', 'concept_code_int', 'n_code_target', 'n_code_no_target', 'n_target', 'n_no_target', 'n_code', 'n_target_before_code', 'n_code_before_target', 'n_target_no_code', 'n_no_code_no_target', 'lr', 'norm_lr', 'lr_rank', 'total_effects', 'total_effects_norm', 'total_effects_rank', 'target_concept_code', 'concept_code']


In [15]:
# --- 1) prune events to what's present in df ---
def prune_events_to_data(df: pd.DataFrame, name_map: dict, events: list, outcome: str = None):
    """
    Split `events` into those that occur in `df` and those that do not.

    A copy of `df` is recoded through `name_map` (skipped when the map is
    empty/None); an event counts as present when it shows up in either the
    `target_concept_code` or the `concept_code` column of that copy.

    When `outcome` is given it must itself occur in the data and must be the
    target of at least one row whose code is a present event.

    Returns
    -------
    (present_events, missing_events) - present keeps the order of `events`,
    missing is sorted.

    Raises
    ------
    KeyError   if either code column is absent.
    ValueError if the outcome checks fail or fewer than two events are present.
    """
    recoded = df.copy()
    if not {"target_concept_code", "concept_code"}.issubset(recoded.columns):
        raise KeyError("df must contain 'target_concept_code' and 'concept_code'")

    if name_map:
        for col in ("target_concept_code", "concept_code"):
            recoded[col] = recoded[col].replace(name_map)

    # Every symbol that appears on either end of an edge.
    observed = set(recoded["target_concept_code"]) | set(recoded["concept_code"])
    present = [ev for ev in events if ev in observed]
    missing = sorted(set(events).difference(observed))

    if outcome is not None:
        if outcome not in observed:
            raise ValueError(f"Outcome '{outcome}' not in data after recode. Missing: {missing}")
        targets_outcome = recoded["target_concept_code"] == outcome
        code_is_present = recoded["concept_code"].isin(present)
        if not (targets_outcome & code_is_present).any():
            raise ValueError(f"No rows for outcome '{outcome}' with remaining events {present}.")

    if len(present) < 2:
        raise ValueError(f"Need at least 2 events present to run the analysis. Present={present}, Missing={missing}")

    return present, missing


# --- 2) name_map: raw code -> friendly label. Left empty so raw codes are used as-is ---
name_map = dict()

In [51]:
# --- 3) events to analyze (raw codes, because name_map is empty) ---
events = ["aa_meas_citalopram_rem"]  # intended outcome
events += [
    "rx_RxNorm_1236136",
    "px_CPT4_33226",
    "dx_SNOMED_33339001",
]

In [52]:
# --- 4) drop events absent from the data, then run MAGI with the outcome pinned ---
desired_outcome = "aa_meas_citalopram_rem"
present, missing = prune_events_to_data(df, name_map, events, outcome=desired_outcome)
if missing:
    print("Dropping missing events:", missing)
print("events (present):", present)

res = analyze_causal_sequence_py(df, name_map, present, force_outcome=desired_outcome)

# The last entry of the order actually used is the outcome; it must be the one asked for.
assert res["order_used"][-1] == desired_outcome, (
    f"Outcome mismatch: expected {desired_outcome}, got {res['order_used'][-1]}"
)


Dropping missing events: ['dx_SNOMED_33339001']
events (present): ['aa_meas_citalopram_rem', 'rx_RxNorm_1236136', 'px_CPT4_33226']


In [53]:
# --- 5) predict probabilities  ---

# The outcome is whatever the analysis placed last in its order.
model_outcome = res["order_used"][-1]
print("Outcome (model):", model_outcome)
print("Temporal order:", " -> ".join(res["order_used"]))

# Logit predictors, with the outcome filtered out defensively.
preds = [name for name in res["logit_predictors"] if name != model_outcome]
print("Predictors used by the logit:", preds)

# Pruned events (minus the outcome) that did not make it into the model.
desired_source = [ev for ev in present if ev != model_outcome]
ignored = sorted(set(desired_source).difference(preds))
if ignored:
    print("Ignored (present but not in model):", ignored)

# Coefficient rows for those predictors plus the intercept.
coef = res["coef_df"]
print("\nCoefficients (predictors + intercept):")
print(coef.loc[coef["predictor"].isin([*preds, "(intercept)"])].reset_index(drop=True))

# Baseline: every predictor switched off.
z0 = dict.fromkeys(preds, 0)
p0 = res["predict_proba"](z0)
print("Baseline probability:", p0)

# Scenario: the desired predictors switched on, everything else off.
desired_in_model = [ev for ev in desired_source if ev in preds]
z = {name: (1 if name in desired_in_model else 0) for name in preds}
p = res["predict_proba"](z)
print("Predicted probability (desired on, others 0):", p)

Outcome (model): aa_meas_citalopram_rem
Temporal order: px_CPT4_33226 -> rx_RxNorm_1236136 -> aa_meas_citalopram_rem
Predictors used by the logit: ['px_CPT4_33226', 'rx_RxNorm_1236136']

Coefficients (predictors + intercept):
           predictor      beta
0      px_CPT4_33226  1.540802
1  rx_RxNorm_1236136 -1.262856
2        (intercept) -3.843496
Baseline probability: 0.020969454388289125
Predicted probability (desired on, others 0): 0.02750359341046971


In [59]:


# ---------- 6. helper: infer binary labels (Female/Male) ----------
def infer_binary_labels(code: str):
    """
    Choose display labels for the 1 and 0 levels of a binary flag.

    Matching is done on the lower-cased code by suffix/substring, female
    patterns first: ("Female", "Male") for codes ending in "_f" or containing
    "female", "gender_f" or "sex_f"; ("Male", "Female") for the corresponding
    "_m" / "male" / "gender_m" / "sex_m" patterns. Anything else gets the
    generic pair ("<code>=1", "<code>=0").
    """
    lowered = (code or "").lower()

    def _hits(suffix, fragments):
        return lowered.endswith(suffix) or any(frag in lowered for frag in fragments)

    # Female must be tested first: "female" also contains "male".
    if _hits("_f", ("female", "gender_f", "sex_f")):
        return ("Female", "Male")
    if _hits("_m", ("male", "gender_m", "sex_m")):
        return ("Male", "Female")

    # Not recognised as a sex/gender flag.
    return (f"{code}=1", f"{code}=0")


# ---------- A) predictor vs outcome 2×2 (with Female/Male row labels if k is sex) ----------
def generate_contingency_tables(df: pd.DataFrame, name_map: dict, res: dict):
    """
    For each predictor k in res['logit_predictors'] vs outcome Y, produce a 2×2:
      rows:  Z_k=1, Z_k=0  (labeled Female/Male if k is a sex flag)
      cols:  Y=1,  Y=0
    a = Y∩k  (n_code_target)
    b = Y∩¬k (n_code_no_target)
    c = ¬Y∩k (derived: n_target - a)
    d = ¬Y∩¬k (derived: n_no_target - b)

    The counts are read from the row(s) with target_concept_code == k and
    concept_code == Y, where Y is the last entry of res['order_used']. A
    predictor with no such row gets an all-zero table. c and d are floored
    at 0.

    Parameters
    ----------
    df : edge table with the two code columns and the four count columns
        named above (copied, never mutated).
    name_map : mapping applied to both code columns before lookup; None or an
        empty mapping leaves the codes untouched.
    res : result dict providing 'order_used' and 'logit_predictors'.

    Returns
    -------
    dict mapping predictor -> 3×3 DataFrame (the 2×2 cells plus "Sum" margins).

    Raises
    ------
    ValueError if a required count column is missing.
    """
    df2 = df.copy()
    # Skip recoding for a falsy map, mirroring the `if name_map:` guard used by
    # the other helpers in this notebook, so name_map=None is handled uniformly.
    if name_map:
        df2["target_concept_code"] = df2["target_concept_code"].replace(name_map)
        df2["concept_code"]        = df2["concept_code"].replace(name_map)

    # ensure numeric
    need = ["n_code_target", "n_code_no_target", "n_target", "n_no_target"]
    for col in need:
        if col not in df2.columns:
            raise ValueError(f"Missing required column: {col}")
        df2[col] = pd.to_numeric(df2[col], errors="coerce")

    Y = res["order_used"][-1]
    preds = list(res["logit_predictors"])
    out = {}

    for k in preds:
        row = df2[(df2["target_concept_code"] == k) & (df2["concept_code"] == Y)]

        a = float(row["n_code_target"].sum(skipna=True)) if not row.empty else 0.0
        b = float(row["n_code_no_target"].sum(skipna=True)) if not row.empty else 0.0

        # Totals with / without k; the off-diagonal cells are what remains
        # after removing the Y counts.
        nt  = float(pd.to_numeric(row["n_target"], errors="coerce").max()) if not row.empty else 0.0
        nnt = float(pd.to_numeric(row["n_no_target"], errors="coerce").max()) if not row.empty else 0.0
        c = max(nt  - a, 0.0)
        d = max(nnt - b, 0.0)

        # Margins
        z1, z0 = a + c, b + d
        y1, y0 = a + b, c + d
        N      = y1 + y0

        # Label rows: Female/Male if k is a sex flag
        row1, row0 = infer_binary_labels(k)

        tbl = pd.DataFrame(
            [[a, c, z1],
             [b, d, z0],
             [y1, y0, N]],
            index=[row1, row0, "Sum"],
            columns=[f"{Y}=1", f"{Y}=0", "Sum"]
        )
        out[k] = tbl

    return out


# ---------- B) pairwise variable–variable 2×2 (labels Female/Male on any sex axis) ----------
def generate_pairwise_contingency_tables(df: pd.DataFrame, name_map: dict,
                                         items=None, use_predictors=True, res=None):
    """
    For each unordered pair (k, j), build a 2×2:
      rows = k=1/0 (Female/Male if k is a sex flag)
      cols = j=1/0 (Female/Male if j is a sex flag)

    a = k∩j            (max from either orientation’s n_code_target)
    b = k∩¬j           (prefer row target=j, concept=k: n_code_no_target; else C(k)-a)
    c = j∩¬k           (prefer row target=k, concept=j: n_code_no_target; else C(j)-a)
    d = N - (a+b+c)    (N from totals if available; else observed sum)

    Parameters
    ----------
    df : edge table with the two code columns and n_code_target,
        n_code_no_target, n_target, n_no_target (copied, never mutated).
    name_map : mapping applied to both code columns before lookup; None or an
        empty mapping leaves the codes untouched.
    items : events to pair up. Optional when use_predictors is True (defaults
        to res['logit_predictors']); required otherwise. Items that do not
        occur in the data are dropped, and pairs are formed over the sorted
        unique remainder.
    use_predictors : when True, `res` must be supplied.
    res : result dict providing 'logit_predictors'.

    Returns
    -------
    dict mapping (k, j) -> 3×3 DataFrame (the 2×2 cells plus "Sum" margins).

    Raises
    ------
    ValueError if `res`/`items` are missing as described above, or a required
    count column is absent.
    """
    from itertools import combinations
    if use_predictors:
        if res is None:
            raise ValueError("res is required when use_predictors=True.")
        items = list(res["logit_predictors"]) if items is None else items
    elif items is None:
        raise ValueError("Provide `items` when use_predictors=False.")

    df2 = df.copy()
    # Skip recoding for a falsy map, mirroring the `if name_map:` guard used by
    # the other helpers in this notebook, so name_map=None is handled uniformly.
    if name_map:
        df2["target_concept_code"] = df2["target_concept_code"].replace(name_map)
        df2["concept_code"]        = df2["concept_code"].replace(name_map)

    for col in ["n_code_target","n_code_no_target","n_target","n_no_target"]:
        if col not in df2.columns:
            raise ValueError(f"Missing required column: {col}")
        df2[col] = pd.to_numeric(df2[col], errors="coerce")

    # Totals helper: largest n_target over rows targeting `ev` (NaN if none)
    def C_of(ev: str) -> float:
        sub = df2[df2["target_concept_code"] == ev]
        if sub.empty: return float("nan")
        C = pd.to_numeric(sub["n_target"], errors="coerce").max()
        return float(C) if pd.notna(C) and np.isfinite(C) else float("nan")

    # N helper: n_target + n_no_target, preferring the rows that link the pair
    # itself and falling back to any row targeting k, then j.
    def N_for_pair(k: str, j: str) -> float:
        cands = []
        for (t, cpt) in [(k, j), (j, k)]:
            row = df2[(df2["target_concept_code"] == t) & (df2["concept_code"] == cpt)]
            if not row.empty:
                nt = pd.to_numeric(row["n_target"], errors="coerce")
                nn = pd.to_numeric(row["n_no_target"], errors="coerce")
                val = (nt + nn).max()
                if pd.notna(val) and np.isfinite(val): cands.append(float(val))
        if cands: return max(cands)
        for t in (k, j):
            sub = df2[df2["target_concept_code"] == t]
            if not sub.empty:
                nt = pd.to_numeric(sub["n_target"], errors="coerce").max()
                nn = pd.to_numeric(sub["n_no_target"], errors="coerce").max()
                if pd.notna(nt) and pd.notna(nn): return float(nt + nn)
        return float("nan")

    # Only pair up items that actually occur in the (recoded) data.
    syms = set(df2["target_concept_code"]).union(df2["concept_code"])
    items = [x for x in items if x in syms]

    out = {}
    for k, j in combinations(sorted(set(items)), 2):
        row_kj = df2[(df2["target_concept_code"] == k) & (df2["concept_code"] == j)]
        row_jk = df2[(df2["target_concept_code"] == j) & (df2["concept_code"] == k)]

        # Joint count: take the larger of the two orientations, 0 if neither exists.
        a1 = float(pd.to_numeric(row_kj["n_code_target"], errors="coerce").sum(skipna=True)) if not row_kj.empty else np.nan
        a2 = float(pd.to_numeric(row_jk["n_code_target"], errors="coerce").sum(skipna=True)) if not row_jk.empty else np.nan
        a = np.nanmax([a1, a2]) if np.any(np.isfinite([a1, a2])) else 0.0

        if not row_jk.empty:
            b = float(pd.to_numeric(row_jk["n_code_no_target"], errors="coerce").sum(skipna=True))
        else:
            Ck = C_of(k); b = 0.0 if not np.isfinite(Ck) else max(Ck - a, 0.0)

        if not row_kj.empty:
            c = float(pd.to_numeric(row_kj["n_code_no_target"], errors="coerce").sum(skipna=True))
        else:
            Cj = C_of(j); c = 0.0 if not np.isfinite(Cj) else max(Cj - a, 0.0)

        N = N_for_pair(k, j)
        if not np.isfinite(N):
            # No usable total: the table covers only what was observed.
            d = 0.0; N = a + b + c
        else:
            d = max(N - (a + b + c), 0.0)

        # Human-friendly labels
        row1, row0 = infer_binary_labels(k)
        col1, col0 = infer_binary_labels(j)

        # Build table with sex-aware labels
        k1, k0 = a + b, c + d
        j1, j0 = a + c, b + d
        tbl = pd.DataFrame(
            [[a, b, k1],
             [c, d, k0],
             [j1, j0, N]],
            index=[row1, row0, "Sum"],
            columns=[col1, col0, "Sum"]
        )
        out[(k, j)] = tbl

    return out


In [60]:
# --- 7) print the contingency tables ---
# One table per predictor against the outcome ...
pred_vs_outcome = generate_contingency_tables(df, name_map, res)
for k, t in pred_vs_outcome.items():
    print(f"\n=== {k} vs {res['order_used'][-1]} ===", t, sep="\n")

# ... then one table per pair of predictors.
pairwise = generate_pairwise_contingency_tables(df, name_map, use_predictors=True, res=res)
for (k, j), t in pairwise.items():
    print(f"\n=== Pairwise: {k} vs {j} ===", t, sep="\n")


=== px_CPT4_33226 vs aa_meas_citalopram_rem ===
                 aa_meas_citalopram_rem=1  aa_meas_citalopram_rem=0       Sum
px_CPT4_33226=1                       0.0                      10.0      10.0
px_CPT4_33226=0                   11159.0                  520986.0  532145.0
Sum                               11159.0                  520996.0  532155.0

=== rx_RxNorm_1236136 vs aa_meas_citalopram_rem ===
                     aa_meas_citalopram_rem=1  aa_meas_citalopram_rem=0  \
rx_RxNorm_1236136=1                       0.0                     165.0   
rx_RxNorm_1236136=0                   11159.0                  520831.0   
Sum                                   11159.0                  520996.0   

                          Sum  
rx_RxNorm_1236136=1     165.0  
rx_RxNorm_1236136=0  531990.0  
Sum                  532155.0  

=== Pairwise: px_CPT4_33226 vs rx_RxNorm_1236136 ===
                 rx_RxNorm_1236136=1  rx_RxNorm_1236136=0       Sum
px_CPT4_33226=1                  0.

In [None]:
######### All of Us Case-Control Organization ################ 
# ---------- UTILS ----------
def banner(txt):
    """Print `txt` framed above and below by a rule of '=' characters.

    The rule is four characters longer than the text, with a minimum width
    of 12. A blank line is printed before the top rule.
    """
    rule = "=" * max(12, len(txt) + 4)
    print("", rule, txt, rule, sep="\n")

def subhead(txt):
    """Print a blank line, then `txt` wrapped as '--- txt ---'."""
    print("", f"--- {txt} ---", sep="\n")

def safe_name(s: str) -> str:
    """Return `s` with every character that is not alphanumeric, '-' or '_'
    replaced by '_'.

    The result has the same length as the input; alphanumeric is decided by
    `str.isalnum`, so non-ASCII letters and digits are kept.
    """
    kept_punctuation = ("-", "_")
    pieces = []
    for char in s:
        if char.isalnum() or char in kept_punctuation:
            pieces.append(char)
        else:
            pieces.append("_")
    return "".join(pieces)

def qtiles(x):
    """Return seven quantiles of `x` as a NumPy array.

    The quantile levels are, in order: 0 (min), 0.01, 0.25, 0.5 (median),
    0.75, 0.99 and 1.0 (max). `x` is converted with `np.asarray` first.
    """
    levels = [0, 0.01, 0.25, 0.5, 0.75, 0.99, 1.0]
    values = np.asarray(x)
    return np.quantile(values, levels)

def preview_active_codes(X_csr, feature_codes, row_indices, k=8):
    """Print, for each requested row, how many stored entries it has and the
    codes of up to `k` of them.

    Parameters
    ----------
    X_csr : object exposing CSR-style `indptr` and `indices` arrays.
    feature_codes : sequence indexed by column position, giving each column's code.
    row_indices : iterable of row positions to preview.
    k : maximum number of codes listed per row.
    """
    for row in row_indices:
        # Stored column indices of `row` live in indices[indptr[row]:indptr[row + 1]].
        span = slice(X_csr.indptr[row], X_csr.indptr[row + 1])
        active_cols = X_csr.indices[span]
        sample = [feature_codes[col] for col in active_cols[:k]]
        print(f"   row {row}: n_active={len(active_cols)}  sample_active={sample}")

def load_magi_betas(coef_csv):
    """Read a coefficient CSV and split it into an intercept and a code→beta map.

    The code column is the first present of: concept_code,
    standard_concept_code, predictor, feature, term, name. The coefficient
    column is the first present of: coef, coefficient, beta, estimate, b, value.
    Codes are stripped of surrounding whitespace. Rows whose lower-cased code is
    one of "(intercept)", "intercept", "const", "(const)", "bias" are treated as
    intercept rows; the first such row supplies the intercept, and 0.0 is used
    when there is none.

    Returns
    -------
    (intercept, coef_map) : float and dict mapping code string → float beta
        for all non-intercept rows.

    Raises
    ------
    KeyError
        If none of the candidate names for a required column is present.
    """
    table = pd.read_csv(coef_csv)

    def first_present(frame, candidates):
        # Return the first candidate column name that exists in `frame`.
        match = next((name for name in candidates if name in frame.columns), None)
        if match is None:
            raise KeyError(f"Missing any of {candidates} in {coef_csv}. Found: {list(frame.columns)}")
        return match

    code_col = first_present(table, ["concept_code","standard_concept_code","predictor","feature","term","name"])
    beta_col = first_present(table, ["coef","coefficient","beta","estimate","b","value"])

    table[code_col] = table[code_col].astype(str).str.strip()
    intercept_labels = ["(intercept)","intercept","const","(const)","bias"]
    intercept_rows = table[code_col].str.lower().isin(intercept_labels)

    if intercept_rows.any():
        intercept = float(table.loc[intercept_rows, beta_col].iloc[0])
    else:
        intercept = 0.0

    predictor_codes = table.loc[~intercept_rows, code_col]
    predictor_betas = table.loc[~intercept_rows, beta_col].astype(float)
    coef_map = dict(zip(predictor_codes, predictor_betas))
    return intercept, coef_map

def sample_all_pos_kx_neg(y, k=4, seed=42):
    """Return shuffled row indices holding every positive plus up to k× as many negatives.

    Positives are positions where `y == 1`, negatives where `y == 0`. Negatives
    are drawn without replacement; the number drawn is `k * n_positives`, capped
    at the number of negatives available. The combined index array is shuffled
    in place with a generator seeded by `seed`, so a given (y, k, seed) is
    reproducible.

    Raises
    ------
    ValueError
        If `y` contains no positives.
    """
    generator = np.random.default_rng(seed)
    positives = np.nonzero(y == 1)[0]
    negatives = np.nonzero(y == 0)[0]
    if not positives.size:
        raise ValueError("No positives for this target.")

    n_draw = min(k * positives.size, negatives.size)
    drawn_negatives = generator.choice(negatives, size=n_draw, replace=False)

    chosen = np.concatenate([positives, drawn_negatives])
    generator.shuffle(chosen)
    return chosen

def score_from_betas(X_sub, feature_codes, betas_map, intercept):
    """Score rows of `X_sub` with a fixed logistic model.

    Only columns whose code appears in `betas_map` contribute; every other
    column is ignored.

    Parameters
    ----------
    X_sub : 2-D matrix supporting `X_sub[:, idx].dot(vector)`.
    feature_codes : codes labelling the columns of `X_sub`, in column order.
    betas_map : dict mapping code → coefficient.
    intercept : scalar added to every row's linear predictor.

    Returns
    -------
    (lp, p, idx, betas) :
        lp    - flattened linear predictor, one value per row
        p     - expit(lp)
        idx   - positions of the columns that have a coefficient
        betas - coefficients aligned with `idx`

    Raises
    ------
    ValueError
        If no feature code has a coefficient.
    """
    names = np.array(feature_codes, dtype=str)
    has_beta = np.isin(names, list(betas_map.keys()))
    matched = np.nonzero(has_beta)[0]
    if matched.size == 0:
        raise ValueError("No overlap between features and MAGI coefficients.")

    weights = np.array([betas_map[code] for code in names[matched]], dtype=float)
    raw = intercept + X_sub[:, matched].dot(weights)      # (n,)
    linear = np.asarray(raw).ravel()
    prob = expit(np.asarray(raw).ravel())
    return linear, prob, matched, weights

def plot_roc(y_true, p_hat, title, out_png, out_svg):
    """Draw a ROC curve, save it as PNG and SVG, and return the ROC AUC.

    Parameters
    ----------
    y_true : true labels, passed to `roc_curve` / `roc_auc_score`.
    p_hat : predicted scores for the same rows.
    title : figure title.
    out_png : output path for the PNG (saved at 300 dpi).
    out_svg : output path for the SVG.

    Returns
    -------
    The value of `roc_auc_score(y_true, p_hat)`, also shown in the legend.

    The figure is drawn on an explicit Figure/Axes pair and is always closed,
    even when saving fails, so repeated calls in a loop do not accumulate
    open figures.
    """
    fpr, tpr, _ = roc_curve(y_true, p_hat)
    auc = roc_auc_score(y_true, p_hat)

    fig, ax = plt.subplots()
    try:
        ax.plot(fpr, tpr, label=f"AUC={auc:.3f}")
        # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
        ax.plot([0, 1], [0, 1], linestyle="--", linewidth=1)
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate")
        ax.set_title(title)
        ax.legend(loc="lower right")
        fig.tight_layout()
        fig.savefig(out_png, dpi=300, bbox_inches="tight")
        fig.savefig(out_svg, bbox_inches="tight")
    finally:
        plt.close(fig)
    return auc

# ---------- LOAD DESIGN ONCE ----------
banner("LOAD DESIGN")

# Design matrix, converted to CSR layout with float32 values.
X_full = load_npz(f"{BASE}/Lasso_X.npz").tocsr().astype(np.float32)

# Row labels (person ids) and column labels (concept codes), both as strings.
persons = pd.read_csv(f"{BASE}/person_index.csv")["person_id"].astype(str).to_numpy()
codes = pd.read_csv(f"{BASE}/code_index.csv")["concept_code"].astype(str).to_numpy()

print(f"[INFO] Matrix: persons={X_full.shape[0]:,}  codes={X_full.shape[1]:,}")

# Each index file must supply exactly one label per matrix row / column.
if (len(persons), len(codes)) != tuple(X_full.shape):
    raise ValueError("[ERROR] person/code indices do not match matrix shape.")

# ---------- RUN PER TARGET ----------
# For each target code: build the label vector from the design matrix, drop the
# target column from the predictors, sample all positives plus NEG_MULT×
# negatives, score the sample with that target's MAGI coefficients, report
# AUC / PR-AUC, and save per-person predictions and ROC plots. A target that
# cannot be processed is reported with a [SKIP] line and the loop moves on.
summary = []
for tcode in TARGETS:
    pretty = TARGET_NAME.get(tcode, tcode)
    banner(f"TARGET {tcode} — {pretty}")

    # SECTION A: labels
    subhead("A) Label vector from full design")
    idx_y = np.where(codes == tcode)[0]
    if idx_y.size == 0:
        print(f"[SKIP] Target not found in code_index.csv → {tcode}")
        continue
    # The first matching column is used as the outcome.
    # NOTE(review): assumes this column is coded 0/1 — the sampler only selects
    # rows equal to 0 or 1; confirm against how Lasso_X.npz was built.
    y_full = X_full[:, idx_y[0]].toarray().ravel().astype(np.int8)
    print(f"[INFO] y_full: n={y_full.size:,}  pos={int(y_full.sum()):,}  "
          f"prev={y_full.mean():.4f}")

    # SECTION B: predictors (keep all except DV)
    subhead("B) Predictor matrix (keep all columns except DV)")
    mask_pred = (codes != tcode)
    X = X_full[:, mask_pred]
    feature_codes = codes[mask_pred]
    print(f"[INFO] Predictors: persons={X.shape[0]:,}  features={X.shape[1]:,}")
    print(f"[CHECK] DV in features? {tcode in feature_codes} (should be False)")

    # SECTION C: sampling (all pos + NEG_MULT× neg)
    # The heading reports the configured multiplier rather than a fixed "4".
    subhead(f"C) Sampling (keep ALL positives + {NEG_MULT}× negatives)")
    try:
        sel = sample_all_pos_kx_neg(y_full, k=NEG_MULT, seed=RNG_SEED)
    except ValueError as e:
        # Raised when the target has no positives; skip it like the other
        # per-target failures instead of aborting the whole run.
        print(f"[SKIP] Sampling failed: {e}")
        continue
    X_sub       = X[sel, :]
    y_sub       = y_full[sel].astype(np.int8)
    persons_sub = persons[sel]
    n_rows      = X_sub.shape[0]
    n_pos_sub   = int(y_sub.sum())
    n_neg_sub   = n_rows - n_pos_sub
    print(f"[INFO] subset: n={n_rows:,}  pos={n_pos_sub:,}  neg={n_neg_sub:,}  "
          f"ratio≈{(n_neg_sub/max(n_pos_sub,1)):.2f}:1  PR-baseline={y_sub.mean():.4f}")
    # a tiny peek at first 3 rows' active codes (best-effort; failure only warns)
    try:
        preview_active_codes(X_sub, feature_codes, row_indices=range(min(3, n_rows)), k=8)
    except Exception as e:
        print(f"[WARN] preview_active_codes failed: {e}")

    # SECTION D: coefficients
    subhead("D) Load MAGI coefficients")
    coef_csv = COEF_PATTERN.format(target=tcode)
    if not os.path.exists(coef_csv):
        print(f"[SKIP] Coef file missing: {coef_csv}")
        continue
    intercept, coef_map = load_magi_betas(coef_csv)
    print(f"[INFO] Coefs: intercept={intercept:.6f}  n_features={len(coef_map):,}")
    # show a few coef samples
    for cc, bb in list(coef_map.items())[:5]:
        print(f"   beta[{cc}] = {bb:.6f}")
    # Look for an intercept label in the raw file. The comparison is
    # case-insensitive (so "(Intercept)" counts), matching how load_magi_betas
    # recognises intercept rows, and the file handle is closed deterministically.
    with open(coef_csv, 'r', encoding="utf-8", errors="ignore") as fh:
        has_intercept_text = "(intercept)" in fh.read().lower()
    if not has_intercept_text:
        print("[NOTE] No explicit '(intercept)' row in CSV; using 0.0 if not found.")

    # SECTION E: alignment & scoring
    subhead("E) Align & score")
    lp, p_hat, idx_cols, betas_vec = None, None, None, None
    try:
        lp, p_hat, idx_cols, betas_vec = score_from_betas(X_sub, feature_codes, coef_map, intercept)
    except Exception as e:
        print(f"[SKIP] Scoring failed (no overlap or other issue): {e}")
        continue

    n_overlap = idx_cols.size
    print(f"[INFO] overlap with predictors = {n_overlap:,} columns")
    print(f"[INFO] first 5 aligned columns: {[feature_codes[i] for i in idx_cols[:5]]}")
    print(f"[INFO] first 5 aligned betas:   {[float(b) for b in betas_vec[:5]]}")

    # SECTION F: metrics & distributions
    subhead("F) Metrics & probability distribution")
    auc    = roc_auc_score(y_sub, p_hat)
    pr_auc = average_precision_score(y_sub, p_hat)
    q = qtiles(p_hat)
    print(f"[RESULT] AUC={auc:.4f}  |  PR-AUC={pr_auc:.4f}  (baseline={y_sub.mean():.4f})")
    print(f"[DIST] prob quantiles: min={q[0]:.4g}, p1={q[1]:.4g}, p25={q[2]:.4g}, "
          f"median={q[3]:.4g}, p75={q[4]:.4g}, p99={q[5]:.4g}, max={q[6]:.4g}")
    print(f"[COUNT] prob>=0.999: {(p_hat>=0.999).sum()}  |  prob<=0.001: {(p_hat<=0.001).sum()}")

    # SECTION G: save predictions
    subhead("G) Save per-person predictions")
    safe = safe_name(pretty)
    pred_csv = os.path.join(CSV_DIR, f"pred_{safe}.csv")
    pd.DataFrame({
        "person_id": persons_sub,
        "y_true": y_sub.astype(int),
        "score_logit": lp,
        "prob": p_hat
    }).to_csv(pred_csv, index=False)
    print(f"[SAVE] predictions → {pred_csv}")
    # Re-read the file just written so the preview shows what is on disk.
    print(pd.read_csv(pred_csv).head(10))

    # SECTION H: plots
    subhead("H) ROC plots (PNG/SVG)")
    png_path = os.path.join(PNG_DIR, f"ROC_{safe}.png")
    svg_path = os.path.join(PNG_DIR, f"ROC_{safe}.svg")
    _auc = plot_roc(y_sub, p_hat, pretty, png_path, svg_path)
    print(f"[SAVE] ROC → {png_path}")
    print(f"[SAVE] ROC → {svg_path}")

    # accumulate summary
    # Note: "n_cases" holds the total sampled row count (positives + negatives).
    summary.append({
        "target_code": tcode,
        "target_name": pretty,
        "n_cases": n_rows,
        "n_pos": n_pos_sub,
        "n_neg": n_neg_sub,
        "feature_overlap": n_overlap,
        "AUC": auc,
        "PR_AUC": pr_auc,
        "PR_baseline": y_sub.mean(),
        "coef_csv": coef_csv,
        "pred_csv": pred_csv,
        "roc_png": png_path
    })

