# In [None]:  (Jupyter cell marker left by the notebook export; commented out so the file parses as Python)
# 10 9 25
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

from scipy.sparse import load_npz
from scipy.special import expit
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import (
    roc_curve, roc_auc_score, accuracy_score,
)
from mlxtend.evaluate import bootstrap_point632_score

# Suppress RuntimeWarning messages for the remainder of the run.
warnings.filterwarnings(action="ignore", category=RuntimeWarning)

# -------------------- CONFIG --------------------
# Root folder that holds the design matrix, the index CSVs and all outputs.
BASE = "./testing_YL"

# Target (dependent-variable) concept code -> human-readable label.
TARGET_NAME = {
    "dx_SNOMED_254645002": "Malignant mesothelioma of pleura",
}
TARGETS = [*TARGET_NAME]

# Per-target MAGI coefficient file, e.g.
#   ./testing_YL/Mesothelioma/magi_coef/magi_coef_<target>_nonzero.csv
COEF_PATTERN = os.path.join(BASE, "Mesothelioma/magi_coef", "magi_coef_{target}_nonzero.csv")

# Output folders (created up front so later writes cannot fail on a missing directory)
OUT_DIR = os.path.join(BASE, "Mesothelioma/magi_out")
PNG_DIR = os.path.join(OUT_DIR, "png")
CSV_DIR = os.path.join(OUT_DIR, "preds")
for _out_folder in (OUT_DIR, PNG_DIR, CSV_DIR):
    os.makedirs(_out_folder, exist_ok=True)

SUMMARY_CSV = os.path.join(OUT_DIR, "summary_per_target_and_bootstrap.csv")

RNG_SEED = 42   # seed shared by sampling, LASSO and the bootstrap
NEG_MULT = 4    # negatives sampled per sampled positive
N_BOOT = 100    # number of bootstrap splits

# Feature ranking used during selection: "prevalence" or "beta".
RANK_MODE = "prevalence"

# SAFE presence thresholds; _safe_thresholds() below picks one pair by EPV target.
SAFE_MIN_TOTAL_COUNT_EPV_5_9, SAFE_MIN_POS_CARRIERS_EPV_5_9 = 2, 1
SAFE_MIN_TOTAL_COUNT_EPV_LT5, SAFE_MIN_POS_CARRIERS_EPV_LT5 = 3, 2

# -------------------- EPV CONFIG --------------------
# EPV = positives / predictors (events per variable).
EPV_TARGET = 9  # preferred EPV; original note: 5 = "current ask", 9 = "old behavior"
EPV_MIN = 5     # lower EPV bound of the SAFE band

# Bootstrap policy: run the bootstrap only when EPV <= multiplier * EPV_TARGET
# (with EPV_TARGET=9 and a multiplier of 2.0 that means EPV <= 18).
BOOTSTRAP_EPV_MAX_MULTIPLIER = 2.0

# Label for the SAFE threshold policy; nothing in the visible code reads it.
SAFE_POLICY = "mild"
def _safe_thresholds(epv_target: int):
    """Return the (min_total_count, min_pos_carriers) pair for an EPV target.

    Targets of 5 or more use the ``*_EPV_5_9`` constants; anything lower uses
    the stricter ``*_EPV_LT5`` constants.
    """
    in_upper_band = epv_target >= 5
    return (
        (SAFE_MIN_TOTAL_COUNT_EPV_5_9, SAFE_MIN_POS_CARRIERS_EPV_5_9)
        if in_upper_band
        else (SAFE_MIN_TOTAL_COUNT_EPV_LT5, SAFE_MIN_POS_CARRIERS_EPV_LT5)
    )


# Presence thresholds in force for this run, derived from the configured EPV target.
MIN_TOTAL_COUNT, MIN_POS_CARRIERS = _safe_thresholds(EPV_TARGET)

# -------------------- UTILS --------------------
def banner(txt):
    """Print ``txt`` framed above and below by a rule of '=' (at least 12 wide)."""
    rule = "=" * max(12, len(txt) + 4)
    # Leading empty item reproduces the blank line before the top rule.
    print("", rule, txt, rule, sep="\n")

def subhead(txt):
    """Print ``txt`` as a '--- txt ---' sub-heading preceded by a blank line."""
    print("\n--- {} ---".format(txt))

def safe_name(s: str) -> str:
    """Return ``s`` with every character that is not alphanumeric, '-' or '_' replaced by '_'."""
    pieces = []
    for ch in s:
        if ch.isalnum() or ch in ("-", "_"):
            pieces.append(ch)
        else:
            pieces.append("_")
    return "".join(pieces)

def plot_roc(y_true, p_hat, title, out_png, out_svg):
    """Plot the ROC curve for ``p_hat`` against ``y_true`` and save it twice.

    The figure is written to ``out_png`` (300 dpi) and to ``out_svg``, then
    closed. ``title`` is accepted but not drawn on the figure.

    Returns:
        The ROC AUC as a Python float.
    """
    fpr, tpr, _ = roc_curve(y_true, p_hat)
    auc_val = roc_auc_score(y_true, p_hat)

    axis_font = {"fontsize": 16, "fontweight": "bold"}
    tick_font = {"fontsize": 14, "fontweight": "bold"}

    plt.figure()
    plt.plot(fpr, tpr, label=f"AUC={auc_val:.3f}")
    # Chance diagonal for reference.
    plt.plot([0, 1], [0, 1], linestyle="--", linewidth=1)
    plt.xlabel("False Positive Rate", **axis_font)
    plt.ylabel("True Positive Rate", **axis_font)
    plt.xticks(**tick_font)
    plt.yticks(**tick_font)

    # Thicken the frame around the plot area.
    for frame_edge in plt.gca().spines.values():
        frame_edge.set_linewidth(1.4)

    plt.legend(loc="lower right", fontsize=12)
    plt.tight_layout()
    for destination, extra in ((out_png, {"dpi": 300}), (out_svg, {})):
        plt.savefig(destination, bbox_inches="tight", **extra)
    plt.close()
    return float(auc_val)

def load_magi_betas(coef_csv):
    """Read a MAGI coefficient CSV and split it into intercept and per-code betas.

    The code column and the coefficient column are each located by trying a
    fixed list of candidate header names in order. Codes are stringified and
    stripped of surrounding whitespace. Rows whose lower-cased code is one of
    "(intercept)", "intercept", "const", "(const)" or "bias" are treated as
    intercept rows.

    Returns:
        (intercept, coef_map): the first intercept row's value as a float
        (0.0 when there is none) and a dict of code -> float beta for every
        other row.

    Raises:
        KeyError: if none of the candidate names for a column is present.
    """
    table = pd.read_csv(coef_csv)

    def first_present(candidates):
        match = next((name for name in candidates if name in table.columns), None)
        if match is None:
            raise KeyError(f"Missing any of {candidates} in {coef_csv}. Found: {list(table.columns)}")
        return match

    code_col = first_present(["concept_code", "standard_concept_code", "predictor", "feature", "term", "name"])
    beta_col = first_present(["coef", "coefficient", "beta", "estimate", "b", "value"])

    table[code_col] = table[code_col].astype(str).str.strip()
    intercept_labels = ["(intercept)", "intercept", "const", "(const)", "bias"]
    intercept_rows = table[code_col].str.lower().isin(intercept_labels)

    if intercept_rows.any():
        intercept = float(table.loc[intercept_rows, beta_col].iloc[0])
    else:
        intercept = 0.0

    predictor_rows = ~intercept_rows
    names = table.loc[predictor_rows, code_col]
    values = table.loc[predictor_rows, beta_col].astype(float)
    coef_map = dict(zip(names, values))
    return intercept, coef_map

def sample_fixed_pos_neg(y, n_pos=None, seed=RNG_SEED):
    """Sample row indices: positives plus up to NEG_MULT times as many negatives.

    Args:
        y: label array; rows equal to 1 are positives, rows equal to 0 negatives.
        n_pos: number of positives to draw without replacement (capped at the
            number available); ``None`` keeps every positive, in which case no
            random draw is made for the positives.
        seed: seed for the NumPy random generator.

    Returns:
        A shuffled 1-D array of the selected row indices.

    Raises:
        ValueError: if ``y`` contains no positives.
    """
    rng = np.random.default_rng(seed)
    pos_idx = np.nonzero(y == 1)[0]
    neg_idx = np.nonzero(y == 0)[0]

    n_available = pos_idx.size
    if n_available == 0:
        raise ValueError("No positives for this target.")

    if n_pos is not None:
        take_pos = min(n_pos, n_available)
        sel_pos = rng.choice(pos_idx, size=take_pos, replace=False)
    else:
        # Keep all positives as they are.
        take_pos, sel_pos = n_available, pos_idx

    # Negatives are capped by what exists.
    take_neg = min(NEG_MULT * take_pos, neg_idx.size)
    sel_neg = rng.choice(neg_idx, size=take_neg, replace=False)

    sel = np.concatenate([sel_pos, sel_neg])
    rng.shuffle(sel)
    return sel

def drop_constants_and_duplicates_for_sample(X_csr, feature_codes, verbose=True):
    """Drop all-zero columns and exact duplicate columns from a sparse matrix.

    Two columns count as duplicates when they hold the same non-zero values in
    the same rows. The first occurrence (lowest column index) is kept.

    Args:
        X_csr: scipy sparse matrix (rows x features). It is not modified.
        feature_codes: one label per column of ``X_csr`` (array-like).
        verbose: when true, print how many columns were kept and dropped.

    Returns:
        (X_red, feats_red, keep_mask_final): the reduced matrix, the labels of
        the kept columns, and a boolean mask over the original columns.
    """
    feature_codes = np.asarray(feature_codes)
    n_cols = X_csr.shape[1]

    # Columns with at least one truly non-zero entry (explicit zeros do not count).
    nnz = np.asarray((X_csr != 0).sum(axis=0)).ravel()
    keep_mask = nnz > 0

    # Work on a private CSC copy in canonical form: stored zeros removed and
    # row indices sorted. Otherwise two identical columns can have different
    # raw storage and escape the byte-level duplicate check below.
    X_csc = X_csr[:, keep_mask].tocsc()
    X_csc.eliminate_zeros()
    X_csc.sort_indices()
    indptr, indices, data = X_csc.indptr, X_csc.indices, X_csc.data

    sub_keep = np.ones(X_csc.shape[1], dtype=bool)
    seen = set()
    for j in range(X_csc.shape[1]):
        start, stop = indptr[j], indptr[j + 1]
        key = (indices[start:stop].tobytes(), data[start:stop].tobytes())
        if key in seen:
            sub_keep[j] = False
        else:
            seen.add(key)

    # Map the surviving columns back to positions in the original matrix.
    keep_idx = np.where(keep_mask)[0]
    keep_mask_final = np.zeros(n_cols, dtype=bool)
    keep_mask_final[keep_idx[sub_keep]] = True

    X_red = X_csr[:, keep_mask_final]
    feats_red = feature_codes[keep_mask_final]
    dropped = int(n_cols - keep_mask_final.sum())
    if verbose:
        print(f"[INFO] LASSO/MAGI cleanup: kept {X_red.shape[1]:,}/{n_cols:,} features "
              f"(dropped {dropped:,} all-zero/duplicate).")
    return X_red, feats_red, keep_mask_final

def compute_epv(n_pos: int, n_predictors: int) -> float:
    """Return events-per-variable: ``n_pos / n_predictors``.

    With no predictors (``n_predictors <= 0``) the result is ``inf`` when
    there is at least one positive, otherwise ``0.0``.
    """
    if n_predictors <= 0:
        if n_pos > 0:
            return float("inf")
        return 0.0
    events = float(n_pos)
    variables = float(n_predictors)
    return events / variables

# ------------ Selection: EPV-target=9 with SAFE to force EPV∈[5,9] ------------
def select_and_trim_to_epv_range(
    X_train, y_train, feature_codes, coef_map, *,
    dv_code,
    rank="prevalence",
    require_present=True,
    min_total_count=1,
    min_pos_carriers=0,
    epv_target=9,
    epv_min=5,
    verbose=False,
):
    """Pick a ranked subset of MAGI-mapped columns sized from an EPV target.

    Steps, in order:
      1. Keep only columns whose code is a key of ``coef_map`` and is not
         ``dv_code``.
      2. Optionally drop columns with too few non-zero rows overall
         (``min_total_count``) or among the positive rows (``min_pos_carriers``).
      3. Rank the survivors by non-zero count (``rank="prevalence"``) or by
         ``|beta|`` from ``coef_map`` (``rank="beta"``), descending.
      4. Keep the top ``k_final`` columns, then remove all-zero/duplicate
         columns, any column labelled ``dv_code``, and any column that equals
         ``y_train`` after an int8 cast.

    Args:
        X_train: sparse matrix (rows x features); it is column-sliced, compared
            with ``!= 0`` and converted with ``.tocsr()``.
        y_train: labels; flattened and cast to int, positives are ``== 1``.
        feature_codes: one code per column of ``X_train``. It is compared
            element-wise with ``dv_code`` and fancy-indexed, so it is assumed
            to be a NumPy array.
        coef_map: mapping code -> beta; membership defines "MAGI-mapped".
        dv_code: code of the dependent variable to exclude (``None`` disables
            the exclusion).
        rank: ``"prevalence"`` or ``"beta"``.
        require_present, min_total_count, min_pos_carriers: support filters.
        epv_target, epv_min: preferred EPV and lower EPV bound used to size
            the selection.
        verbose: print selection diagnostics.

    Returns:
        ``(X_used, feats_used, keep_idx_used, epv_final)``: the selected
        columns, their codes, their integer column indices into ``X_train``,
        and ``n_pos / n_selected`` (``np.inf`` when nothing is selected).

    Raises:
        ValueError: if ``rank`` is neither ``"prevalence"`` nor ``"beta"``.
    """
    import math
    import numpy as np

    y_train = np.asarray(y_train).ravel().astype(int)
    n_pos = int(np.sum(y_train == 1))
    if n_pos == 0:
        if verbose: print("[SEL] No positives; empty selection.")
        return X_train[:, :0], feature_codes[:0], np.array([], dtype=int), np.inf

    # Predictor-count bounds derived from the EPV settings:
    #   n_min = ceil(n_pos / epv_target)  -> count that gives EPV close to epv_target
    #   n_max = floor(n_pos / epv_min)    -> largest count that keeps EPV >= epv_min
    # n_max is forced to be at least n_min.
    n_min = max(1, int(math.ceil(n_pos / float(epv_target))))  # preferred count
    n_max = max(n_min, int(math.floor(n_pos / float(epv_min))))  # upper cap

    # MAGI mapping and DV exclusion
    mapped_mask = np.array([c in coef_map for c in feature_codes], dtype=bool)
    if dv_code is not None:
        mapped_mask &= (feature_codes != dv_code)

    cand_idx = np.where(mapped_mask)[0]
    if cand_idx.size == 0:
        if verbose: print("[SEL] No MAGI-mapped features (after DV exclusion).")
        return X_train[:, :0], feature_codes[:0], np.array([], dtype=int), np.inf

    # Presence/support filters: a candidate needs at least max(1, min_total_count)
    # non-zero rows overall and, when requested, at least min_pos_carriers
    # non-zero rows among the positives.
    if require_present or (min_total_count > 1) or (min_pos_carriers > 0):
        X_cand = X_train[:, cand_idx]
        nnz_all = np.asarray((X_cand != 0).sum(axis=0)).ravel()
        keep_local = (nnz_all >= max(1, min_total_count))
        if min_pos_carriers > 0 and np.any(y_train == 1):
            pos_rows = np.where(y_train == 1)[0]
            nnz_pos = np.asarray((X_cand[pos_rows, :] != 0).sum(axis=0)).ravel()
            keep_local &= (nnz_pos >= min_pos_carriers)
        cand_idx = cand_idx[keep_local]
        if cand_idx.size == 0:
            if verbose: print("[SEL] After presence filters, no features remain.")
            return X_train[:, :0], feature_codes[:0], np.array([], dtype=int), np.inf

    # Ranking (descending): by non-zero row count or by absolute MAGI beta.
    if rank == "prevalence":
        nnz = np.asarray((X_train[:, cand_idx] != 0).sum(axis=0)).ravel()
        order_local = np.argsort(-nnz)
        ordered_idx = cand_idx[order_local]
    elif rank == "beta":
        betas = np.array([coef_map[feature_codes[i]] for i in cand_idx], dtype=float)
        order_local = np.argsort(-np.abs(betas))
        ordered_idx = cand_idx[order_local]
    else:
        raise ValueError("rank must be 'prevalence' or 'beta'")

    # Selection size: prefer n_min, capped by n_max and by availability.
    # Because n_max >= n_min, this reduces to min(n_min, n_avail); n_max only
    # shows up in the diagnostic print below.
    n_avail = ordered_idx.size
    k_cap   = min(n_max, n_avail)
    k_final = min(n_min, k_cap)  # prefer n_min

    if k_final <= 0:
        if verbose: print("[SEL] No features after ranking/cap.")
        return X_train[:, :0], feature_codes[:0], np.array([], dtype=int), np.inf

    keep_idx = ordered_idx[:k_final]

    # Cleanup (all-zero/duplicate columns). The helper is called without a
    # `verbose` argument, so its own default decides whether it prints,
    # independent of this function's `verbose` flag.
    X_top   = X_train[:, keep_idx]
    feats_t = feature_codes[keep_idx]
    X_used, feats_used, mask_final = drop_constants_and_duplicates_for_sample(
        X_top.tocsr().astype(np.float32), feats_t
    )
    keep_idx_used = keep_idx[mask_final]

    # Defensive clip: cleanup can only remove columns, so this branch is not
    # expected to run. Fewer columns after cleanup is accepted (EPV goes up).
    if X_used.shape[1] > k_final:
        X_used        = X_used[:, :k_final]
        feats_used    = feats_used[:k_final]
        keep_idx_used = keep_idx_used[:k_final]

    # Second DV check: drop any surviving column labelled dv_code. The same
    # mask is applied to codes, matrix and indices so they stay aligned.
    if feats_used.size and dv_code is not None:
        leak = np.where(feats_used == dv_code)[0]
        if leak.size:
            keep_mask = np.ones(feats_used.size, dtype=bool); keep_mask[leak] = False
            feats_used    = feats_used[keep_mask]
            X_used        = X_used[:, keep_mask]
            keep_idx_used = keep_idx_used[keep_mask]

    # Identity check: drop any column that is element-wise equal to the label
    # vector. Both sides are cast to int8 before comparing.
    yv = y_train.astype(np.int8)
    drop_cols = []
    for j in range(X_used.shape[1]):
        col = X_used.getcol(j).toarray().ravel().astype(np.int8)
        if np.array_equal(col, yv):
            drop_cols.append(j)
    if drop_cols:
        keep_mask = np.ones(X_used.shape[1], dtype=bool); keep_mask[np.array(drop_cols)] = False
        feats_used    = feats_used[keep_mask]
        X_used        = X_used[:, keep_mask]
        keep_idx_used = keep_idx_used[keep_mask]

    # Final EPV from the columns that actually survived.
    p_final = int(X_used.shape[1])
    epv_final = (float(n_pos) / p_final) if p_final > 0 else np.inf
    if verbose:
        print(f"[INFO] n_pos={n_pos}  n_min(pref@EPV≈{epv_target})={n_min}  "
              f"n_max(EPV≥{epv_min})={n_max}  n_avail={n_avail}  "
              f"k_final={p_final}  EPV={epv_final:.3f}")

    return X_used, feats_used, keep_idx_used.astype(int, copy=False), epv_final


# -------------------- Bootstrap helpers --------------------
def bootstrap_632plus_or_fallback(estimator, X, y, n_splits, scoring_func,
                                  predict_proba, seed, clone_estimator=False):
    """Run ``bootstrap_point632_score`` with ".632+", retrying with ".632" on failure.

    Any exception from the ".632+" attempt is reported on stdout and
    swallowed; the ".632" retry is not guarded, so its exceptions propagate.

    Returns:
        Whatever ``bootstrap_point632_score`` returns for the method that ran.
    """
    shared_kwargs = dict(
        estimator=estimator, X=X, y=y,
        n_splits=n_splits,
        scoring_func=scoring_func, predict_proba=predict_proba,
        random_seed=seed, clone_estimator=clone_estimator,
    )
    try:
        return bootstrap_point632_score(method=".632+", **shared_kwargs)
    except ZeroDivisionError:
        print("[WARN] .632+ failed (division by zero). Falling back to .632.")
    except Exception as e:
        print(f"[WARN] .632+ failed ({e}). Falling back to .632.")
    return bootstrap_point632_score(method=".632", **shared_kwargs)

# -------------------- Estimators (per-resample selection with EPV rules) --------------------
class MagiBootstrapEstimatorCurrent(BaseEstimator, ClassifierMixin):
    """Fixed-coefficient MAGI scorer that re-selects its features on every fit.

    ``fit`` does not estimate coefficients. It runs
    ``select_and_trim_to_epv_range`` on the data it is given and remembers
    which columns were chosen. ``predict_proba`` then applies the supplied
    intercept and the ``coef_map`` betas of those columns through a logistic
    link.

    Args:
        intercept: MAGI intercept.
        coef_map: mapping feature code -> MAGI beta.
        feature_codes_all: one code per column of the matrices passed to
            ``fit``/``predict*``.
        dv_code: code of the dependent variable; must never be selected.
        rank: ranking mode forwarded to the selection routine.
        n_frac: stored but not used by this class.
        require_present, min_total_count, min_pos_carriers: support filters
            forwarded to the selection routine.
        verbose: forwarded to the selection routine.

    The EPV bounds are read from the module-level ``EPV_TARGET``/``EPV_MIN``.
    """

    def __init__(self, intercept, coef_map, feature_codes_all,
                 dv_code,
                 rank="prevalence", n_frac=0.10, require_present=True,
                 min_total_count=1, min_pos_carriers=0, verbose=False):
        self.intercept = intercept
        self.coef_map = coef_map
        self.feature_codes_all = feature_codes_all
        self.dv_code = dv_code
        self.rank = rank
        self.n_frac = n_frac
        self.require_present = require_present
        self.min_total_count = min_total_count
        self.min_pos_carriers = min_pos_carriers
        self.verbose = verbose
        # State filled in by fit().
        self._feats_used = None
        self._betas_used = None
        self._col_idx = None
        self._feature_codes_all = None
        self._intercept_ = None

    def fit(self, X, y):
        """Select features on (X, y) and cache their columns and betas."""
        feature_codes_all = np.asarray(self.feature_codes_all)
        coef_map = dict(self.coef_map)

        # Use the thresholds this instance was constructed with, not the
        # module-level ones, so the constructor arguments take effect.
        X_used, feats_used, col_idx, _ = select_and_trim_to_epv_range(
            X, y, feature_codes_all, coef_map,
            dv_code=self.dv_code,
            rank=self.rank, require_present=self.require_present,
            min_total_count=self.min_total_count,
            min_pos_carriers=self.min_pos_carriers,
            epv_target=EPV_TARGET, epv_min=EPV_MIN,
            verbose=self.verbose,
        )
        # Explicit raise (same exception type as before) so the check also
        # runs under `python -O`, where assert statements are stripped.
        if self.dv_code in set(feats_used):
            raise AssertionError("DV leaked into predictors! (MAGI)")

        self._feature_codes_all = feature_codes_all
        self._intercept_ = float(self.intercept)

        if X_used.shape[1] == 0:
            # Nothing selected: fall back to an intercept-only model.
            self._feats_used = np.array([], dtype=str)
            self._betas_used = np.array([], dtype=float)
            self._col_idx = np.array([], dtype=int)
            return self

        self._feats_used = feats_used
        self._betas_used = np.array([coef_map.get(c, 0.0) for c in feats_used], dtype=float)
        # Column positions come straight from the selection routine, so no
        # code -> index dictionary has to be rebuilt at prediction time.
        self._col_idx = np.asarray(col_idx, dtype=int)
        return self

    def predict_proba(self, X):
        """Return an (n_rows, 2) array of [P(y=0), P(y=1)]."""
        if self._feats_used is None or self._feats_used.size == 0:
            p = np.full(X.shape[0], float(expit(self._intercept_)))
        else:
            z = self._intercept_ + X[:, self._col_idx].dot(self._betas_used)
            p = expit(np.asarray(z).ravel())
        return np.c_[1.0 - p, p]

    def predict(self, X):
        """Return 0/1 labels using a 0.5 probability cut-off."""
        return (self.predict_proba(X)[:, 1] >= 0.5).astype(int)

class LassoBootstrapEstimatorCurrent(BaseEstimator, ClassifierMixin):
    """L1-penalised logistic regression refit on per-fit selected features.

    ``fit`` runs ``select_and_trim_to_epv_range`` on the data it is given and
    trains ``LogisticRegression(penalty="l1", solver="saga")`` on the chosen
    columns. When nothing is selected, the estimator predicts the training
    prevalence for every row.

    Args:
        feature_codes_all: one code per column of the matrices passed to
            ``fit``/``predict*``.
        coef_map: mapping feature code -> MAGI beta; used by the selection
            routine for candidate membership and "beta" ranking.
        dv_code: code of the dependent variable; must never be selected.
        rank: ranking mode forwarded to the selection routine.
        n_frac: stored but not used by this class.
        require_present, min_total_count, min_pos_carriers: support filters
            forwarded to the selection routine.
        C, max_iter, n_jobs, random_state: forwarded to ``LogisticRegression``.
        verbose: forwarded to the selection routine.

    The EPV bounds are read from the module-level ``EPV_TARGET``/``EPV_MIN``.
    """

    def __init__(self, feature_codes_all, coef_map,
                 dv_code,
                 rank="prevalence", n_frac=0.10, require_present=True,
                 min_total_count=1, min_pos_carriers=0,
                 C=0.5, max_iter=2000, n_jobs=-1, random_state=42,
                 verbose=False):
        self.feature_codes_all = feature_codes_all
        self.coef_map = coef_map
        self.dv_code = dv_code
        self.rank = rank
        self.n_frac = n_frac
        self.require_present = require_present
        self.min_total_count = min_total_count
        self.min_pos_carriers = min_pos_carriers
        self.C = C
        self.max_iter = max_iter
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        # State filled in by fit().
        self._feats_used = None
        self._col_idx = None
        self._feature_codes_all = None
        self._model = None
        self._p_const = None

    def fit(self, X, y):
        """Select features on (X, y) and fit the L1 logistic model on them."""
        feature_codes_all = np.asarray(self.feature_codes_all)
        coef_map = dict(self.coef_map)

        # Use the thresholds this instance was constructed with, not the
        # module-level ones, so the constructor arguments take effect.
        X_used, feats_used, col_idx, _ = select_and_trim_to_epv_range(
            X, y, feature_codes_all, coef_map,
            dv_code=self.dv_code,
            rank=self.rank, require_present=self.require_present,
            epv_target=EPV_TARGET, epv_min=EPV_MIN,
            min_total_count=self.min_total_count,
            min_pos_carriers=self.min_pos_carriers,
            verbose=self.verbose,
        )
        # Explicit raise (same exception type as before) so the check also
        # runs under `python -O`, where assert statements are stripped.
        if self.dv_code in set(feats_used):
            raise AssertionError("DV leaked into predictors! (LASSO)")

        self._feature_codes_all = feature_codes_all

        if X_used.shape[1] == 0:
            # Nothing selected: predict the training prevalence everywhere.
            self._feats_used = np.array([], dtype=str)
            self._col_idx = np.array([], dtype=int)
            self._model = None
            self._p_const = float(np.mean(y))
            return self

        self._feats_used = feats_used
        # Column positions come straight from the selection routine, so no
        # code -> index dictionary has to be rebuilt at prediction time.
        self._col_idx = np.asarray(col_idx, dtype=int)

        self._model = LogisticRegression(
            penalty="l1", solver="saga", C=self.C,
            max_iter=self.max_iter, n_jobs=self.n_jobs,
            random_state=self.random_state
        ).fit(X_used, y)
        return self

    def predict_proba(self, X):
        """Return an (n_rows, 2) array of [P(y=0), P(y=1)]."""
        if self._feats_used is None or self._feats_used.size == 0 or self._model is None:
            # Constant fallback; 0.5 when fit() has never stored a prevalence.
            p = np.full(X.shape[0], self._p_const if self._p_const is not None else 0.5, dtype=float)
        else:
            p = self._model.predict_proba(X[:, self._col_idx])[:, 1]
        return np.c_[1.0 - p, p]

    def predict(self, X):
        """Return 0/1 labels using a 0.5 probability cut-off."""
        return (self.predict_proba(X)[:, 1] >= 0.5).astype(int)

# -------------------- LOAD DESIGN --------------------
banner("LOAD DESIGN")
# Design matrix: one row per person, one column per concept code.
X_full = load_npz(f"{BASE}/Lasso_X.npz").tocsr().astype(np.float32)
# Row and column labels, read as strings.
persons = pd.read_csv(f"{BASE}/person_index.csv")["person_id"].astype(str).to_numpy()
codes = pd.read_csv(f"{BASE}/code_index.csv")["concept_code"].astype(str).to_numpy()

n_matrix_rows, n_matrix_cols = X_full.shape
print(f"[INFO] Matrix: persons={n_matrix_rows:,}  codes={n_matrix_cols:,}")

# Both index files must line up with the matrix dimensions.
indices_match = (len(persons) == n_matrix_rows) and (len(codes) == n_matrix_cols)
if not indices_match:
    raise ValueError("[ERROR] person/code indices do not match matrix shape.")

# -------------------- RUN PER TARGET --------------------
# One summary dict per target that completes; consumed by the SAVE SUMMARY step.
rows = []
for tcode, pretty in TARGET_NAME.items():
    safe = safe_name(pretty)
    banner(f"TARGET {tcode} — {pretty}")

    # Per-target LASSO state, reset on every iteration so the summary row can
    # never pick up values left behind by a previous target or a failed run.
    lasso_cv_auc = np.nan
    lasso_auc = np.nan
    lasso_nonzero_count = 0
    lasso_pred_csv = ""
    lasso_coef_csv = ""
    lasso_roc_png = ""

    # A) Labels: the target's own column of the design matrix.
    idx_y = np.where(codes == tcode)[0]
    if idx_y.size == 0:
        print(f"[SKIP] Target not found in code_index.csv → {tcode}")
        continue
    y_full = X_full[:, idx_y[0]].toarray().ravel().astype(np.int8)

    # B) Predictors (exclude DV)
    mask_pred = (codes != tcode)
    X = X_full[:, mask_pred]
    feature_codes = codes[mask_pred]

    # C) Sample: all positives plus NEG_MULT x as many negatives
    sel = sample_fixed_pos_neg(y_full, n_pos=None, seed=RNG_SEED)
    X_sub = X[sel, :].tocsr().astype(np.float32)
    y_sub = y_full[sel].astype(int).ravel()
    persons_sub = persons[sel]

    n_rows = X_sub.shape[0]
    n_pos = int(y_sub.sum())
    n_neg = int(n_rows - n_pos)
    print(f"[INFO] subset: n={n_rows:,}  pos={n_pos:,}  neg={n_neg:,}  baseline={y_sub.mean():.4f}")

    # D) Load MAGI coefs
    coef_csv = COEF_PATTERN.format(target=tcode)
    if not os.path.exists(coef_csv):
        print(f"[SKIP] Missing MAGI coef file: {coef_csv}")
        continue
    intercept, coef_map = load_magi_betas(coef_csv)
    print(f"[INFO] MAGI coefs: intercept={intercept:.6f}  n_features={len(coef_map):,}")

    # E) Feature selection sized from the EPV settings
    X_used, feats_used, _, epv = select_and_trim_to_epv_range(
        X_sub, y_sub, feature_codes, coef_map, dv_code=tcode,
        rank=RANK_MODE, require_present=True,
        min_total_count=MIN_TOTAL_COUNT, min_pos_carriers=MIN_POS_CARRIERS,
        epv_target=EPV_TARGET, epv_min=EPV_MIN, verbose=False
    )
    print(f"[INFO] Predictors kept={X_used.shape[1]}  EPV={epv:.3f}")

    # EPV above the target means fewer features were available than preferred.
    # Compare against the configured target rather than a hard-coded 9.
    if epv > EPV_TARGET:
        print(f"[NOTE] EPV > {EPV_TARGET} (few MAGI features available). "
              "Proceeding with simpler model (no fill).")

    # Explicit raise (same exception type as the former assert) so the leak
    # check also runs under `python -O`.
    if tcode in set(feats_used):
        raise AssertionError("DV leaked into predictors!")

    # Log the predictor-count bounds implied by the EPV settings (same
    # formulas as the selection routine uses).
    n_min = max(1, int(np.ceil(n_pos / EPV_TARGET)))
    n_max = max(n_min, int(np.floor(n_pos / EPV_MIN)))
    print(f"[INFO] n_pos={n_pos}  n_min(pref@EPV≈{EPV_TARGET})={n_min}  "
          f"n_max(EPV≥{EPV_MIN})={n_max}  kept={X_used.shape[1]}  EPV={epv:.3f}")

    # ---- MAGI: fixed coefficients on the selected features ----
    if X_used.shape[1] == 0:
        # Intercept-only prediction.
        p_hat_magi = np.full(n_rows, float(expit(intercept)), dtype=float)
        magi_nz_feats = np.array([], dtype=str)
        magi_nz_betas = np.array([], dtype=float)
    else:
        betas_vec = np.array([coef_map.get(f, 0.0) for f in feats_used], dtype=float)
        lp = intercept + X_used.dot(betas_vec)
        p_hat_magi = expit(np.asarray(lp).ravel())
        nz_mask = betas_vec != 0
        magi_nz_feats = np.array(feats_used)[nz_mask]
        magi_nz_betas = betas_vec[nz_mask]
    print(f"[MAGI] #non-zero features: {magi_nz_feats.size}")
    if magi_nz_feats.size:
        pairs = sorted(zip(magi_nz_feats.tolist(), magi_nz_betas.tolist()),
                       key=lambda x: abs(x[1]), reverse=True)
        print("[MAGI] Non-zero features (sorted by |beta|):")
        for name, val in pairs:
            print(f"   {name:>40}  {val:+.6f}")
    else:
        print("[MAGI] No non-zero features (intercept-only).")

    auc_magi = roc_auc_score(y_sub, p_hat_magi)
    print(f"[RESULT] MAGI  AUC={auc_magi:.4f}  baseline={y_sub.mean():.4f}")

    # Save MAGI used coefs (the intercept row is appended after the features)
    magi_used_df = pd.DataFrame({"feature": magi_nz_feats, "beta": magi_nz_betas})
    magi_used_df.loc[-1] = {"feature": "(intercept)", "beta": float(intercept)}
    magi_used_df.index = magi_used_df.index + 1
    magi_used_csv = os.path.join(OUT_DIR, f"magi_coef_used_{safe}.csv")
    magi_used_df.to_csv(magi_used_csv, index=False)

    # MAGI preds & ROC
    pred_csv_magi = os.path.join(CSV_DIR, f"pred_{safe}.csv")
    pd.DataFrame({"person_id": persons_sub, "y_true": y_sub.astype(int), "prob_magi": p_hat_magi}).to_csv(pred_csv_magi, index=False)
    png_path = os.path.abspath(os.path.join(PNG_DIR, f"ROC_{safe}.png"))
    svg_path = os.path.abspath(os.path.join(PNG_DIR, f"ROC_{safe}.svg"))
    plot_roc(y_sub, p_hat_magi, pretty, png_path, svg_path)

    # ---- LASSO on SAME features (5-fold CV) ----
    try:
        if X_used.shape[1] == 0:
            print("[SKIP] No usable features; LASSO fallback to constant.")
            p_hat_lasso = np.full(n_rows, float(y_sub.mean()), dtype=float)
        else:
            clf = LogisticRegressionCV(
                Cs=np.logspace(-3, 3, 12),
                cv=5,
                penalty="l1",
                solver="saga",
                scoring="roc_auc",
                max_iter=2000,
                n_jobs=-1,
                random_state=RNG_SEED,
                refit=True,
                fit_intercept=True,
            ).fit(X_used, y_sub)

            # scores_[1]: per-fold x per-C scores for the positive class label.
            scores_mat = clf.scores_[1]
            mean_auc_per_C = scores_mat.mean(axis=0)
            best_idx = int(np.argmax(mean_auc_per_C))
            lasso_cv_auc = float(mean_auc_per_C[best_idx])
            best_C = float(np.atleast_1d(clf.C_)[0])
            print(f"[MODEL] LASSO best_C={best_C:.6g}  CV-AUC={lasso_cv_auc:.4f}")

            p_hat_lasso = clf.predict_proba(X_used)[:, 1]
            lasso_auc = roc_auc_score(y_sub, p_hat_lasso)
            print(f"[RESULT] LASSO AUC={lasso_auc:.4f}  baseline={y_sub.mean():.4f}")

            # Save preds / coefs
            lasso_pred_csv = os.path.join(CSV_DIR, f"pred_{safe}_LASSO.csv")
            pd.DataFrame({"person_id": persons_sub, "y_true": y_sub.astype(int), "prob_lasso": p_hat_lasso}).to_csv(lasso_pred_csv, index=False)

            coef = clf.coef_.ravel()
            intercept_l = float(clf.intercept_.ravel()[0])
            nz = np.where(coef != 0)[0]
            lasso_nonzero_count = int(nz.size)
            coef_df = (pd.DataFrame({"feature": np.array(feats_used)[nz], "coef": coef[nz]})
                       .sort_values("coef", key=np.abs, ascending=False))
            coef_df.loc[-1] = {"feature": "(intercept)", "coef": intercept_l}
            coef_df.index = coef_df.index + 1
            lasso_coef_csv = os.path.join(OUT_DIR, f"lasso_coef_{safe}.csv")
            coef_df.to_csv(lasso_coef_csv, index=False)
            print(f"[LASSO] #non-zero features: {nz.size}")
            if nz.size:
                rows_print = [(str(feats_used[i]), float(coef[i])) for i in nz]
                rows_print.sort(key=lambda x: abs(x[1]), reverse=True)
                print("[LASSO] Non-zero features (sorted by |coef|):")
                for name, val in rows_print:
                    print(f"   {name:>40}  {val:+.6f}")
            else:
                print("[LASSO] No non-zero coefficients (all zero)")
            print(f"[LASSO] Intercept: {intercept_l:+.6f}")

            roc_png_candidate = os.path.abspath(os.path.join(PNG_DIR, f"ROC_{safe}_LASSO.png"))
            lasso_roc_svg = os.path.abspath(os.path.join(PNG_DIR, f"ROC_{safe}_LASSO.svg"))
            plot_roc(y_sub, p_hat_lasso, f"{pretty} (LASSO, CV=5)", roc_png_candidate, lasso_roc_svg)
            # Record the PNG path only once the plot has actually been written.
            lasso_roc_png = roc_png_candidate
    except Exception as e:
        # Best-effort: a LASSO failure must not abort the target. Report it
        # and fall back to "no LASSO result" for every recorded field.
        print(f"[SKIP] LASSO failed: {e}")
        lasso_cv_auc = np.nan
        lasso_auc = np.nan
        lasso_nonzero_count = 0
        lasso_pred_csv = ""
        lasso_coef_csv = ""
        lasso_roc_png = ""
        p_hat_lasso = np.full(n_rows, float(y_sub.mean()), dtype=float)

    # ---- Bootstrap policy: run if EPV is not too large relative to target ----
    run_bootstrap = (epv <= BOOTSTRAP_EPV_MAX_MULTIPLIER * EPV_TARGET) and (X_used.shape[1] > 0)
    AUC632_MAGI_mean = AUC632_MAGI_std = np.nan
    AUC632_LASSO_mean = AUC632_LASSO_std = np.nan
    ACC632_MAGI_mean = ACC632_MAGI_std = np.nan
    ACC632_LASSO_mean = ACC632_LASSO_std = np.nan

    if run_bootstrap:
        magi_est = MagiBootstrapEstimatorCurrent(
            intercept=intercept, coef_map=coef_map, feature_codes_all=feature_codes, dv_code=tcode,
            rank=RANK_MODE, min_total_count=MIN_TOTAL_COUNT, min_pos_carriers=MIN_POS_CARRIERS, verbose=False
        )
        lasso_est = LassoBootstrapEstimatorCurrent(
            feature_codes_all=feature_codes, coef_map=coef_map, dv_code=tcode,
            rank=RANK_MODE, min_total_count=MIN_TOTAL_COUNT, min_pos_carriers=MIN_POS_CARRIERS,
            C=0.5, max_iter=2000, n_jobs=-1, random_state=RNG_SEED, verbose=False
        )
        try:
            auc632_magi = bootstrap_632plus_or_fallback(
                magi_est, X_sub, y_sub, N_BOOT, roc_auc_score, True, RNG_SEED, clone_estimator=False
            )
            auc632_lasso = bootstrap_632plus_or_fallback(
                lasso_est, X_sub, y_sub, N_BOOT, roc_auc_score, True, RNG_SEED, clone_estimator=False
            )
            acc632_magi = bootstrap_632plus_or_fallback(
                magi_est, X_sub, y_sub, N_BOOT, accuracy_score, False, RNG_SEED, clone_estimator=False
            )
            acc632_lasso = bootstrap_632plus_or_fallback(
                lasso_est, X_sub, y_sub, N_BOOT, accuracy_score, False, RNG_SEED, clone_estimator=False
            )
            AUC632_MAGI_mean = float(np.mean(auc632_magi));  AUC632_MAGI_std  = float(np.std(auc632_magi))
            AUC632_LASSO_mean = float(np.mean(auc632_lasso));AUC632_LASSO_std = float(np.std(auc632_lasso))
            ACC632_MAGI_mean = float(np.mean(acc632_magi));  ACC632_MAGI_std  = float(np.std(acc632_magi))
            ACC632_LASSO_mean = float(np.mean(acc632_lasso));ACC632_LASSO_std = float(np.std(acc632_lasso))
            print(f"[BOOT] MAGI  .632+ AUC: mean={AUC632_MAGI_mean:.4f}  std={AUC632_MAGI_std:.4f}")
            print(f"[BOOT] LASSO .632+ AUC: mean={AUC632_LASSO_mean:.4f} std={AUC632_LASSO_std:.4f}")
            print(f"[BOOT] MAGI  .632+ ACC: mean={ACC632_MAGI_mean:.4f}  std={ACC632_MAGI_std:.4f}")
            print(f"[BOOT] LASSO .632+ ACC: mean={ACC632_LASSO_mean:.4f} std={ACC632_LASSO_std:.4f}")
        except Exception as e:
            # Best-effort: the bootstrap columns stay NaN for this target.
            print(f"[WARN] Bootstrap failed for {tcode}: {e}")
    else:
        if X_used.shape[1] == 0:
            print("[VAL] No predictors kept; skipping bootstrap.")
        else:
            thr = BOOTSTRAP_EPV_MAX_MULTIPLIER * EPV_TARGET
            print(f"[VAL] EPV > {thr:.1f}: per spec, bootstrap not required; skipping.")

    # ---- Summary row ---------------------------------------------------------
    # LASSO fields come from this iteration's tracked state. They are empty,
    # zero or NaN when LASSO was skipped or failed for this target.
    rows.append({
        "target_code": tcode,
        "target_name": pretty,
        "n_cases": int(n_rows),
        "n_pos": int(n_pos),
        "n_neg": int(n_neg),
        "rank_mode": RANK_MODE,
        "features_used": int(X_used.shape[1]),
        "EPV": float(epv),
        "magi_nonzero_count": int(magi_nz_feats.size),
        "lasso_nonzero_count": lasso_nonzero_count,
        "AUC_MAGI": float(auc_magi),
        "AUC_LASSO": float(lasso_auc),
        "AUC632_MAGI_mean":  AUC632_MAGI_mean,
        "AUC632_MAGI_std":   AUC632_MAGI_std,
        "AUC632_LASSO_mean": AUC632_LASSO_mean,
        "AUC632_LASSO_std":  AUC632_LASSO_std,
        "ACC632_MAGI_mean":  ACC632_MAGI_mean,
        "ACC632_MAGI_std":   ACC632_MAGI_std,
        "ACC632_LASSO_mean": ACC632_LASSO_mean,
        "ACC632_LASSO_std":  ACC632_LASSO_std,
        "coef_csv": coef_csv,
        "magi_used_coef_csv": magi_used_csv,
        "pred_csv_MAGI": pred_csv_magi,
        "pred_csv_LASSO": lasso_pred_csv,
        "lasso_coef_csv": lasso_coef_csv,
        "roc_png_MAGI": png_path,
        "roc_png_LASSO": lasso_roc_png,
    })

# -------------------- SAVE SUMMARY --------------------
if rows:
    df = pd.DataFrame(rows)
    df.to_csv(SUMMARY_CSV, index=False)
    # Recap the EPV bookkeeping from the recorded rows, not from variables left
    # over after the loop: those can belong to a target that was skipped
    # part-way through and therefore be mutually inconsistent.
    for rec in rows:
        n_pos_local = int(rec["n_pos"])
        n_min = max(1, int(np.ceil(n_pos_local / float(EPV_TARGET))))
        n_max = max(n_min, int(np.floor(n_pos_local / float(EPV_MIN))))
        print(f"[INFO] n_pos={n_pos_local}  "
              f"n_min(pref@EPV≈{EPV_TARGET})={n_min}  n_max(EPV≥{EPV_MIN})={n_max}  "
              f"kept={rec['features_used']}  EPV={rec['EPV']:.3f}")

else:
    print("\n[SUMMARY] Nothing to save.")
