In [None]:
# ============================================
# MAGI batch: keep ALL positives + 4x negatives
# Titles/filenames use REAL NAMES (not codes)
# ============================================
import os, math, numpy as np, pandas as pd
from scipy.sparse import load_npz
from scipy.special import expit
from sklearn.metrics import roc_auc_score, roc_curve, average_precision_score
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import precision_recall_curve
import hashlib

In [None]:
# ------------- CONFIG -------------
# Root folder; the design matrix, index CSVs and project subfolders are
# all resolved relative to it.
BASE      = "./testing_YL"

# Mapping: target code -> pretty name (used for titles and output filenames).
# Entries that are commented out are excluded from the run.
TARGET_NAME = {
#    "dx_SNOMED_109378008": "Mesothelioma (malignant, clinical disorder)",
    "dx_SNOMED_254645002": "Malignant mesothelioma of pleura",
#    "dx_SNOMED_109853004": "Mesothelioma of peritoneum",
#    "dx_SNOMED_109372009": "Benign neoplasm of mesothelial tissue of pleura",
}

# Target codes to process, in the dict's insertion order.
TARGETS = list(TARGET_NAME.keys())

# Coefficient CSVs live here, one per target.
# Resolved filename: <BASE>/Mesothelioma/magi_coef/magi_coef_<target>.csv
# e.g. ./testing_YL/Mesothelioma/magi_coef/magi_coef_dx_SNOMED_254645002.csv
COEF_PATTERN = os.path.join(BASE, "Mesothelioma/magi_coef", "magi_coef_{target}.csv")

# Output folders (created at import/cell-run time if they do not exist).
OUT_DIR      = os.path.join(BASE, "Mesothelioma/magi_out")
PNG_DIR      = os.path.join(OUT_DIR, "png")
CSV_DIR      = os.path.join(OUT_DIR, "preds")
os.makedirs(PNG_DIR, exist_ok=True)
os.makedirs(CSV_DIR, exist_ok=True)

# Sampling plan
# NOTE(review): NEG_MULT is not referenced anywhere in the visible code —
# confirm whether it is meant to be wired into the sampling call.
NEG_MULT   = 4            # intended negatives-per-positive ratio
RNG_SEED   = 42           # seed shared by the sampler and the LASSO fit

# collage settings
# NOTE(review): neither setting is used in the visible part of this file.
COLS_GRID  = 2            # ROC images per row in collage
IMG_SCALE  = 0.75


In [None]:
# ---------- UTILS ----------
def banner(txt):
    """Print ``txt`` framed above and below by a rule of ``=`` characters.

    The rule is four characters wider than ``txt`` and never narrower than
    twelve; a blank line is printed before the frame.
    """
    rule = "=" * max(len(txt) + 4, 12)
    print("\n{0}\n{1}\n{0}".format(rule, txt))

def subhead(txt):
    """Print ``txt`` as a sub-heading: a blank line, then ``--- txt ---``."""
    print("\n--- {} ---".format(txt))

def safe_name(s: str) -> str:
    """Return ``s`` with every character that is not alphanumeric, ``-`` or
    ``_`` replaced by ``_`` (one replacement per character, length preserved).
    """
    def _sanitize(ch):
        # str.isalnum() is Unicode-aware, so accented letters are kept as-is.
        if ch.isalnum() or ch == "-" or ch == "_":
            return ch
        return "_"

    return "".join(map(_sanitize, s))

def qtiles(x):
    """Return seven summary quantiles of ``x`` as a NumPy array.

    The order is: minimum, 1st, 25th, 50th, 75th and 99th percentiles,
    maximum.
    """
    probabilities = [0, 0.01, 0.25, 0.5, 0.75, 0.99, 1.0]
    values = np.asarray(x)
    return np.quantile(values, probabilities)

def preview_active_codes(X_csr, feature_codes, row_indices, k=8):
    """Print, for each requested row, how many columns are stored in that row
    and the codes of the first ``k`` of them.

    ``X_csr`` must expose CSR ``indptr``/``indices`` arrays; ``feature_codes``
    is indexed by column position.
    """
    for row in row_indices:
        # indptr[row]:indptr[row + 1] delimits this row's stored column ids.
        span = slice(X_csr.indptr[row], X_csr.indptr[row + 1])
        active = X_csr.indices[span]
        shown = [feature_codes[col] for col in active[:k]]
        print(f"   row {row}: n_active={len(active)}  sample_active={shown}")

def load_magi_betas(coef_csv):
    """Read a coefficient CSV and split it into an intercept and a beta map.

    The code column and the coefficient column are each located by trying a
    fixed list of header names in order. Codes are stringified and stripped.
    A row whose lower-cased code is one of the recognised intercept labels
    supplies the intercept (first such row wins; 0.0 when none exists), and
    all intercept rows are excluded from the map.

    Returns:
        (intercept, coef_map) where ``coef_map`` maps code -> float beta.

    Raises:
        KeyError: if none of the accepted header names is present for either
            column.
    """
    code_headers = ["concept_code","standard_concept_code","predictor","feature","term","name"]
    beta_headers = ["coef","coefficient","beta","estimate","b","value"]
    intercept_labels = ["(intercept)","intercept","const","(const)","bias"]

    table = pd.read_csv(coef_csv)

    def first_present(candidates):
        for header in candidates:
            if header in table.columns:
                return header
        raise KeyError(f"Missing any of {candidates} in {coef_csv}. Found: {list(table.columns)}")

    code_col = first_present(code_headers)
    beta_col = first_present(beta_headers)

    table[code_col] = table[code_col].astype(str).str.strip()
    on_intercept = table[code_col].str.lower().isin(intercept_labels)

    if on_intercept.any():
        intercept = float(table.loc[on_intercept, beta_col].iloc[0])
    else:
        intercept = 0.0

    codes_only = table.loc[~on_intercept, code_col]
    betas_only = table.loc[~on_intercept, beta_col].astype(float)
    coef_map = dict(zip(codes_only, betas_only))
    return intercept, coef_map

def sample_fixed_pos_neg(y, n_pos=1000, n_neg=4000, seed=42, neg_mult=4):
    """Draw a shuffled index sample with a fixed negative:positive ratio.

    Up to ``n_pos`` positives (``y == 1``) are drawn without replacement; if
    fewer exist, all of them are used. The number of negatives (``y == 0``)
    is ``neg_mult`` times the positives actually drawn, capped by how many
    negatives exist.

    Args:
        y: 1-D array-like of 0/1 labels. Converted with ``np.asarray`` so a
            plain list behaves the same as an array.
        n_pos: Maximum number of positives to draw.
        n_neg: Ignored for sizing; kept only for backward compatibility.
        seed: Seed for ``np.random.default_rng``; the same seed and inputs
            give the same sample.
        neg_mult: Negatives drawn per positive. The default of 4 reproduces
            the previously hard-coded ratio.

    Returns:
        1-D integer array of selected positions in ``y``, in shuffled order.

    Raises:
        ValueError: if ``y`` contains no positives.
    """
    # Without this, a list input makes `y == 1` a scalar False and every
    # call would report "no positives".
    y = np.asarray(y)

    rng = np.random.default_rng(seed)
    pos_idx = np.where(y == 1)[0]
    neg_idx = np.where(y == 0)[0]
    if pos_idx.size == 0:
        raise ValueError("No positives for this target.")

    take_pos = min(n_pos, pos_idx.size)
    take_neg = min(neg_mult * take_pos, neg_idx.size)

    # Draw order (positives, negatives, shuffle) is part of the reproducible
    # random stream; do not reorder these calls.
    sel_pos = rng.choice(pos_idx, size=take_pos, replace=False)
    sel_neg = rng.choice(neg_idx, size=take_neg, replace=False)
    sel = np.concatenate([sel_pos, sel_neg])
    rng.shuffle(sel)
    return sel


def score_from_betas(X_sub, feature_codes, betas_map, intercept):
    """Score rows of ``X_sub`` with a fixed set of logistic coefficients.

    Only columns whose code appears in ``betas_map`` contribute; codes in the
    map that have no matching column are ignored.

    Returns:
        (lp, p, idx, betas): the flattened linear predictor, its logistic
        transform, the positions of the matched columns, and the betas
        aligned to those positions.

    Raises:
        ValueError: if no feature code appears in ``betas_map``.
    """
    names = np.array(feature_codes, dtype=str)
    idx = np.flatnonzero(np.isin(names, list(betas_map)))
    if idx.size == 0:
        raise ValueError("No overlap between features and MAGI coefficients.")

    betas = np.array([betas_map[name] for name in names[idx]], dtype=float)

    # Flatten once so both returned vectors are plain 1-D arrays of length n.
    lp = np.asarray(intercept + X_sub[:, idx].dot(betas)).ravel()
    return lp, expit(lp), idx, betas

# NOTE(review): a second `plot_roc` with the same signature is defined further
# down in this file. That later definition rebinds the name, so this titled
# variant is not the one the per-target loop ends up calling.
def plot_roc(y_true, p_hat, title, out_png, out_svg):
    """Plot a ROC curve with a chance diagonal and save it as PNG and SVG.

    Args:
        y_true: True binary labels, passed to ``roc_curve``/``roc_auc_score``.
        p_hat: Predicted scores or probabilities for the positive class.
        title: Text drawn as the figure title.
        out_png: Destination path for the 300-dpi PNG.
        out_svg: Destination path for the SVG.

    Returns:
        The ROC AUC, which is also shown in the legend.
    """
    fpr, tpr, _ = roc_curve(y_true, p_hat)
    auc = roc_auc_score(y_true, p_hat)
    plt.figure()
    plt.plot(fpr, tpr, label=f"AUC={auc:.3f}")
    # Dashed diagonal = performance of a random classifier.
    plt.plot([0,1], [0,1], linestyle="--", linewidth=1)
    plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
    plt.title(title); plt.legend(loc="lower right"); plt.tight_layout()
    plt.savefig(out_png, dpi=300, bbox_inches="tight")
    plt.savefig(out_svg, bbox_inches="tight")
    # Close the current figure so figures do not accumulate across targets.
    plt.close()
    return auc

def drop_constants_and_duplicates_for_sample(X_csr, feature_codes):
    """Drop all-zero and exactly duplicated columns of a sampled matrix.

    Two columns are duplicates when they hold the same non-zero values in
    the same rows. Of a group of duplicates, the left-most column is kept.
    The comparison is done on a canonicalised copy (repeated entries summed,
    explicit zeros removed, row indices sorted), so the way a column happens
    to be stored cannot hide a duplicate. ``X_csr`` itself is not modified.

    Args:
        X_csr: Sparse matrix (rows = selected patients) supporting boolean
            column indexing and ``tocsc()``.
        feature_codes: Array-like of column labels, one per column of
            ``X_csr``. Converted with ``np.asarray`` so a list works too.

    Returns:
        (X_red, feats_red, keep_mask): the reduced matrix, the labels of the
        kept columns, and a boolean mask over the ORIGINAL columns marking
        which were kept.
    """
    feature_codes = np.asarray(feature_codes)
    n_cols = X_csr.shape[1]

    # Columns without any non-zero entry carry no information.
    nnz = np.asarray((X_csr != 0).sum(axis=0)).ravel()
    keep_mask = (nnz > 0)

    # Slicing and tocsc() each return a new matrix, so the in-place
    # canonicalisation below never touches the caller's data.
    X_csc = X_csr[:, keep_mask].tocsc()
    X_csc.sum_duplicates()     # merges repeated entries and sorts indices
    X_csc.eliminate_zeros()    # a stored 0 must not distinguish two columns

    indptr, indices, data = X_csc.indptr, X_csc.indices, X_csc.data
    sub_keep = np.ones(X_csc.shape[1], dtype=bool)
    seen = set()
    for j in range(X_csc.shape[1]):
        s, e = indptr[j], indptr[j + 1]
        # Raw bytes of (row positions, values) identify a column exactly.
        key = (indices[s:e].tobytes(), data[s:e].tobytes())
        if key in seen:
            sub_keep[j] = False
        else:
            seen.add(key)

    # Map the surviving positions back to the original feature space.
    keep_idx = np.where(keep_mask)[0]
    keep_mask_final = np.zeros(n_cols, dtype=bool)
    keep_mask_final[keep_idx[sub_keep]] = True

    X_red = X_csr[:, keep_mask_final]
    feats_red = feature_codes[keep_mask_final]
    dropped = n_cols - int(keep_mask_final.sum())
    print(f"[INFO] LASSO collinearity cleanup: kept {X_red.shape[1]:,}/{n_cols:,} features "
          f"(dropped {dropped:,} all-zero/duplicate).")
    return X_red, feats_red, keep_mask_final

# large x and y, no title
def plot_roc(y_true, p_hat, title, out_png, out_svg):
    """Plot an untitled ROC curve with large bold axes; save as PNG and SVG.

    Args:
        y_true: True binary labels, passed to ``roc_curve``/``roc_auc_score``.
        p_hat: Predicted scores or probabilities for the positive class.
        title: Accepted for signature compatibility with callers; this
            variant deliberately draws no title.
        out_png: Destination path for the 300-dpi PNG.
        out_svg: Destination path for the SVG.

    Returns:
        The ROC AUC, which is also shown in the legend.
    """
    from sklearn.metrics import roc_curve, roc_auc_score
    import matplotlib.pyplot as plt

    fpr, tpr, _ = roc_curve(y_true, p_hat)
    auc = roc_auc_score(y_true, p_hat)

    fig = plt.figure()
    try:
        plt.plot(fpr, tpr, label=f"AUC={auc:.3f}")
        # Dashed diagonal = performance of a random classifier.
        plt.plot([0,1], [0,1], linestyle="--", linewidth=1)

        # Larger + bold axis labels and ticks; no title
        plt.xlabel("False Positive Rate", fontsize=16, fontweight="bold")
        plt.ylabel("True Positive Rate", fontsize=16, fontweight="bold")
        plt.xticks(fontsize=14, fontweight="bold")
        plt.yticks(fontsize=14, fontweight="bold")

        # Thicker frame to match the heavier text.
        ax = plt.gca()
        for spine in ax.spines.values():
            spine.set_linewidth(1.4)

        plt.legend(loc="lower right", fontsize=12)
        plt.tight_layout()
        plt.savefig(out_png, dpi=300, bbox_inches="tight")
        plt.savefig(out_svg, bbox_inches="tight")
    finally:
        # Always release the figure, even if drawing or saving raised;
        # otherwise it stays registered with pyplot for the whole run.
        plt.close(fig)
    return auc

# ---------- LOAD DESIGN ONCE ----------
# Loads the sparse design matrix plus its row/column index files from BASE.
# Everything below reuses X_full / persons / codes, so this runs only once.
banner("LOAD DESIGN")
# Stored as CSR float32.
X_full  = load_npz(f"{BASE}/Lasso_X.npz").tocsr().astype(np.float32)
# Row labels and column labels, read as strings.
# presumably persons[i] labels row i and codes[j] labels column j of X_full —
# only the lengths are verified below; confirm the ordering upstream.
persons = pd.read_csv(f"{BASE}/person_index.csv")["person_id"].astype(str).to_numpy()
codes   = pd.read_csv(f"{BASE}/code_index.csv")["concept_code"].astype(str).to_numpy()
print(f"[INFO] Matrix: persons={X_full.shape[0]:,}  codes={X_full.shape[1]:,}")
# Fail fast when the index files and the matrix disagree in size.
if len(persons) != X_full.shape[0] or len(codes) != X_full.shape[1]:
    raise ValueError("[ERROR] person/code indices do not match matrix shape.")

# ---------- RUN PER TARGET ----------
# For every target code:
#   A-C  build the label vector, the predictor matrix and a seeded sample
#   D-H  score the sample with the pre-computed MAGI coefficients, report
#        AUC / PR-AUC, save predictions and the ROC figure
#   I    fit an L1 logistic regression on the SAME sample for comparison
# One dict per completed target is appended to `summary`. A target is skipped
# (no summary row) when its code, coefficient file or feature overlap is missing.

summary = []
for tcode in TARGETS:
    pretty = TARGET_NAME.get(tcode, tcode)
    banner(f"TARGET {tcode} — {pretty}")

    # SECTION A: labels
    subhead("A) Label vector from full design")
    idx_y = np.where(codes == tcode)[0]
    if idx_y.size == 0:
        print(f"[SKIP] Target not found in code_index.csv → {tcode}")
        continue
    # The target's own column of the design matrix is the outcome.
    y_full = X_full[:, idx_y[0]].toarray().ravel().astype(np.int8)
    print(f"[INFO] y_full: n={y_full.size:,}  pos={int(y_full.sum()):,}  "
          f"prev={y_full.mean():.4f}")

    # SECTION B: predictors (keep all except DV)
    subhead("B) Predictor matrix (keep all columns except DV)")
    mask_pred = (codes != tcode)
    X = X_full[:, mask_pred]
    feature_codes = codes[mask_pred]
    print(f"[INFO] Predictors: persons={X.shape[0]:,}  features={X.shape[1]:,}")
    print(f"[CHECK] DV in features? {tcode in feature_codes} (should be False)")

    # SECTION C: sampling (up to 1k pos + 4x neg, random; SAME SAMPLE for MAGI & LASSO)
    subhead("C) Sampling (1,000 positives + 4,000 negatives)")
    sel = sample_fixed_pos_neg(y_full, n_pos=1000, n_neg=4000, seed=RNG_SEED)
    X_sub       = X[sel, :]
    y_sub       = y_full[sel].astype(np.int8)
    persons_sub = persons[sel]
    n_rows      = X_sub.shape[0]
    n_pos_sub   = int(y_sub.sum())
    n_neg_sub   = n_rows - n_pos_sub
    print(f"[INFO] subset: n={n_rows:,}  pos={n_pos_sub:,}  neg={n_neg_sub:,}  "
          f"ratio≈{(n_neg_sub/max(n_pos_sub,1)):.2f}:1  PR-baseline={y_sub.mean():.4f}")
    # The preview is purely diagnostic; never let it abort the target.
    try:
        preview_active_codes(X_sub, feature_codes, row_indices=range(min(3, n_rows)), k=8)
    except Exception as e:
        print(f"[WARN] preview_active_codes failed: {e}")

    # SECTION D: coefficients (MAGI)
    subhead("D) Load MAGI coefficients")
    coef_csv = COEF_PATTERN.format(target=tcode)
    if not os.path.exists(coef_csv):
        print(f"[SKIP] Coef file missing: {coef_csv}")
        continue
    intercept, coef_map = load_magi_betas(coef_csv)
    print(f"[INFO] Coefs: intercept={intercept:.6f}  n_features={len(coef_map):,}")
    for cc, bb in list(coef_map.items())[:5]:
        print(f"   beta[{cc}] = {bb:.6f}")
    # Heads-up when the file has no "(intercept)" label at all. Compared
    # case-insensitively (as load_magi_betas does) so an R-style
    # "(Intercept)" row does not trigger a false note; the context manager
    # closes the handle.
    with open(coef_csv, "r", encoding="utf-8", errors="ignore") as fh:
        has_intercept_label = "(intercept)" in fh.read().lower()
    if not has_intercept_label:
        print("[NOTE] No explicit '(intercept)' row in CSV; using 0.0 if not found.")

    # SECTION E: alignment & scoring (MAGI)
    subhead("E) Align & score (MAGI)")
    try:
        lp, p_hat, idx_cols, betas_vec = score_from_betas(X_sub, feature_codes, coef_map, intercept)
    except Exception as e:
        print(f"[SKIP] MAGI scoring failed (no overlap or other issue): {e}")
        continue
    n_overlap = idx_cols.size
    print(f"[INFO] overlap with predictors = {n_overlap:,} columns")
    print(f"[INFO] first 5 aligned columns: {[feature_codes[i] for i in idx_cols[:5]]}")
    print(f"[INFO] first 5 aligned betas:   {[float(b) for b in betas_vec[:5]]}")

    # SECTION F: metrics & distributions (MAGI)
    subhead("F) Metrics & probability distribution (MAGI)")
    auc    = roc_auc_score(y_sub, p_hat)
    pr_auc = average_precision_score(y_sub, p_hat)
    q = qtiles(p_hat)
    print(f"[RESULT] MAGI AUC={auc:.4f}  |  PR-AUC={pr_auc:.4f}  (baseline={y_sub.mean():.4f})")
    print(f"[DIST] prob quantiles: min={q[0]:.4g}, p1={q[1]:.4g}, p25={q[2]:.4g}, "
          f"median={q[3]:.4g}, p75={q[4]:.4g}, p99={q[5]:.4g}, max={q[6]:.4g}")
    print(f"[COUNT] prob>=0.999: {(p_hat>=0.999).sum()}  |  prob<=0.001: {(p_hat<=0.001).sum()}")

    # SECTION G: save predictions (MAGI)
    subhead("G) Save per-person predictions (MAGI)")
    safe = safe_name(pretty)   # filename-safe form of the pretty name, reused below
    pred_csv = os.path.join(CSV_DIR, f"pred_{safe}.csv")
    pd.DataFrame({
        "person_id": persons_sub,
        "y_true": y_sub.astype(int),
        "score_logit": lp,
        "prob": p_hat
    }).to_csv(pred_csv, index=False)
    print(f"[SAVE] predictions → {pred_csv}")
    # Read the file back so the preview shows what actually landed on disk.
    print(pd.read_csv(pred_csv).head(10))

    # SECTION H: plots (MAGI)
    subhead("H) ROC plots (MAGI) PNG/SVG")
    png_path = os.path.abspath(os.path.join(PNG_DIR, f"ROC_{safe}.png"))
    svg_path = os.path.abspath(os.path.join(PNG_DIR, f"ROC_{safe}.svg"))
    _auc = plot_roc(y_sub, p_hat, pretty, png_path, svg_path)
    print(f"[SAVE] ROC → {png_path}")
    print(f"[SAVE] ROC → {svg_path}")

    # SECTION I: LASSO (ALL FEATURES, no SAFE) on the SAME SAMPLE
    subhead("I) LASSO: all features (drop constants/duplicates), 5-fold CV")
    # Defaults reported in the summary when LASSO is skipped or fails.
    # Set on every iteration so values from a previous target never leak in.
    lasso_cv_auc = np.nan; lasso_auc = np.nan; lasso_pr = np.nan
    lasso_pred_csv = ""; lasso_coef_csv = ""; lasso_roc_png = ""; lasso_pr_png = ""
    try:
        Xs = X_sub.tocsr().astype(np.float32)

        # Remove constants & perfectly duplicate columns (on the sample only)
        X_lasso, feat_lasso, keep_mask = drop_constants_and_duplicates_for_sample(Xs, feature_codes)

        if X_lasso.shape[1] == 0:
            print("[SKIP] No usable features after cleanup; skipping LASSO.")
        else:
            # Fit Logistic LASSO with 5-fold CV on all remaining features
            clf = LogisticRegressionCV(
                Cs=np.logspace(-3, 3, 12),
                cv=5,
                penalty="l1",
                solver="saga",
                scoring="roc_auc",
                max_iter=2000,
                n_jobs=-1,
                random_state=RNG_SEED,
                refit=True,
                fit_intercept=True,
            ).fit(X_lasso, y_sub)

            # CV summary
            scores_mat = clf.scores_[1]                 # (folds × Cs)
            mean_auc_per_C = scores_mat.mean(axis=0)
            best_idx = int(np.argmax(mean_auc_per_C))
            lasso_cv_auc = float(mean_auc_per_C[best_idx])
            best_C = float(np.atleast_1d(clf.C_)[0])
            print(f"[MODEL] LASSO best_C={best_C:.6g}  CV-AUC={lasso_cv_auc:.4f}")

            # Refit predictions & metrics (on the sampled data).
            # These are in-sample numbers: the model is scored on the rows it
            # was fitted on, so compare them with CV-AUC, not instead of it.
            p_hat_lasso = clf.predict_proba(X_lasso)[:, 1]
            lasso_auc   = roc_auc_score(y_sub, p_hat_lasso)
            lasso_pr    = average_precision_score(y_sub, p_hat_lasso)
            print(f"[RESULT] LASSO Refit AUC={lasso_auc:.4f} | PR-AUC={lasso_pr:.4f} (baseline={y_sub.mean():.4f})")

            # Save LASSO predictions
            lasso_pred_csv = os.path.join(CSV_DIR, f"pred_{safe}_LASSO.csv")
            pd.DataFrame({
                "person_id": persons_sub,
                "y_true": y_sub.astype(int),
                "prob_lasso": p_hat_lasso
            }).to_csv(lasso_pred_csv, index=False)
            print(f"[SAVE] LASSO predictions → {lasso_pred_csv}")

            # Save LASSO coefficients (non-zero only)
            coef = clf.coef_.ravel(); intercept_l = float(clf.intercept_.ravel()[0])
            nz = np.where(coef != 0)[0]
            coef_df = pd.DataFrame({"feature": feat_lasso[nz], "coef": coef[nz]}) \
                        .sort_values("coef", key=np.abs, ascending=False)
            # Append the intercept as a final row; the index is not written out.
            coef_df.loc[-1] = {"feature": "(intercept)", "coef": intercept_l}
            coef_df.index = coef_df.index + 1
            lasso_coef_csv = os.path.join(OUT_DIR, f"lasso_coef_{safe}.csv")
            coef_df.to_csv(lasso_coef_csv, index=False)
            print(f"[SAVE] LASSO coefficients → {lasso_coef_csv} (nonzero={len(nz)} of {len(coef)})")

            # LASSO ROC & PR plots.
            # Plotting has its own guard so a figure problem cannot wipe the
            # metrics and CSV paths computed above. `plot_pr` is not defined
            # in the visible part of this file; if it is missing at run time
            # the NameError is reported here as a warning.
            roc_png_l = os.path.abspath(os.path.join(PNG_DIR, f"ROC_{safe}_LASSO.png"))
            roc_svg_l = os.path.abspath(os.path.join(PNG_DIR, f"ROC_{safe}_LASSO.svg"))
            pr_png_l  = os.path.abspath(os.path.join(PNG_DIR, f"PR_{safe}_LASSO.png"))
            pr_svg_l  = os.path.abspath(os.path.join(PNG_DIR, f"PR_{safe}_LASSO.svg"))
            try:
                _ = plot_roc(y_sub, p_hat_lasso, f"{pretty} (LASSO, CV=5)", roc_png_l, roc_svg_l)
                lasso_roc_png = roc_png_l
                print(f"[SAVE] LASSO ROC → {lasso_roc_png}")
                _ = plot_pr(y_sub, p_hat_lasso, f"{pretty}", pr_png_l, pr_svg_l)
                lasso_pr_png = pr_png_l
                print(f"[SAVE] LASSO PR  → {lasso_pr_png}")
            except Exception as e:
                print(f"[WARN] LASSO plotting failed (metrics and CSVs kept): {e}")

    except Exception as e:
        # Best-effort comparison model: report the failure, blank the LASSO
        # fields, and still record the MAGI results for this target.
        print(f"[SKIP] LASSO failed: {e}")
        lasso_cv_auc = np.nan; lasso_auc = np.nan; lasso_pr = np.nan
        lasso_pred_csv = ""; lasso_coef_csv = ""; lasso_roc_png = ""; lasso_pr_png = ""

    summary.append({
        "target_code": tcode,
        "target_name": pretty,
        "n_cases": n_rows,
        "n_pos": n_pos_sub,
        "n_neg": n_neg_sub,
        "feature_overlap": n_overlap,         # MAGI overlap info
        "AUC": auc,                           # MAGI
        "PR_AUC": pr_auc,                     # MAGI
        "PR_baseline": y_sub.mean(),
        "coef_csv": coef_csv,                 # MAGI coef csv path
        "pred_csv": pred_csv,                 # MAGI pred csv path
        "roc_png": png_path,                  # MAGI ROC path (already absolute)
        # ---- LASSO fields ----
        "LASSO_CV_AUC": lasso_cv_auc,
        "LASSO_Refit_AUC": lasso_auc,
        "LASSO_PR_AUC": lasso_pr,
        "LASSO_pred_csv": lasso_pred_csv,
        "LASSO_coef_csv": lasso_coef_csv,
        "LASSO_roc_png": lasso_roc_png,
        "LASSO_pr_png":  lasso_pr_png,
    })
