<a href="https://colab.research.google.com/github/Kenny625819/Applied-Data-Science/blob/main/DCA2%E5%80%A4%E5%8C%96.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ============================================================
# COMPLETE EXECUTABLE SCRIPT (Colab-ready, ESJ revision ready)
# BASE: user latest script with DCA (googlecolabDCAいり.docx)
# CHANGES:
# ✅ Tokuhashi/Katagiri are BINARIZED per paper thresholds:
#    - Tokuhashi >= 9 predicts SURVIVAL (1)
#    - New Katagiri < 7 predicts SURVIVAL (1)
# ✅ ROC/AUC/DeLong use these binary predictors (paper-consistent)
# ✅ DCA uses binary death-risk: p_death = 1 - p_survival_binary
# ============================================================

import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, f1_score, brier_score_loss
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression

import lightgbm as lgb

# -----------------------------
# 0) PATHS / CONSTANTS
# -----------------------------
# Colab typically uses /content. This auto-detects /content then /mnt/data.
CANDIDATES = [
    Path("/content/patient All2013.xlsx"),
    Path("/mnt/data/patient All2013.xlsx"),
]
FOUND = next((p for p in CANDIDATES if p.exists()), None)
if FOUND is None:
    raise FileNotFoundError(
        "Excel not found. Put 'patient All2013.xlsx' in /content (Colab) "
        "or /mnt/data."
    )
DATA_PATH = str(FOUND)
SHEET_NAME = "Sheet1"
print("Using DATA_PATH =", DATA_PATH)

OUT_DIR = Path("/content/ESJ_outputs")  # Colab-friendly
OUT_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 42
N_SPLITS = 5
N_BOOT = 2000

# Temporal split (years)
TRAIN_YEARS = (2013, 2016)
TEST_YEARS  = (2017, 2021)

# Bonferroni family for prespecified primary comparisons:
# Full vs Tokuhashi (3 timepoints) + Full vs Katagiri (3 timepoints) = 6 tests
N_PRIMARY_TESTS = 6
ALPHA_PRIMARY_BONF = 0.05 / N_PRIMARY_TESTS  # 0.0083...

# DCA thresholds (clinically meaningful range)
DCA_MIN = 0.05
DCA_MAX = 0.50
DCA_NPTS = 19

# -----------------------------
# 1) Columns (patient All2013.xlsx)
# -----------------------------
DATE_COL = "ope date"

Y_COLS = {
    "3M": "3Month Survival",
    "6M": "6Month Survival",
    "12M": "12Month Survival",
}

TOK_COL = "Revised Tokuhashi score"
KAT_COL = "New Katagiri score"

# Preop features
PREOP_FEATURES = [
    "Age", "Sex", "BMI",
    "Malignancy (Katagiri Score)",
    "Visceral Metastasis",
    "Number of Spinal Metastases",
    "ECOGPS",
    "Frankel_bin",
    "Barthel Index",
    "Serum Albumin",
    "CRP",
]

# Intraop features
INTRAOP_FEATURES = ["Operation Time", "Intraoperative Blood Loss"]

ECOG_COL = "ECOGPS"

# -----------------------------
# 2) Plot style (publication)
# -----------------------------
BLACK = "black"
plt.rcParams.update({"font.family": "DejaVu Sans", "axes.unicode_minus": False})

def plot_roc(y, s_ai, s_tok, s_kat, save_path,
             auc_ai=None, auc_tok=None, auc_kat=None):
    """
    AUC can be passed from outside to match Excel values (bootstrap-based).
    If not provided, computed from data.
    Note: if s_tok/s_kat are binary (0/1), ROC will be step-like (still valid & plottable).
    """
    if auc_ai is None:
        auc_ai = roc_auc_score(y, s_ai)
    if auc_tok is None:
        auc_tok = roc_auc_score(y, s_tok)
    if auc_kat is None:
        auc_kat = roc_auc_score(y, s_kat)

    plt.figure(figsize=(6, 6))
    plt.plot([0, 1], [0, 1], "--", color="gray")

    fpr, tpr, _ = roc_curve(y, s_ai)
    plt.plot(fpr, tpr, "-", color=BLACK, linewidth=2.5,
             label=f"AI (LightGBM), AUC = {auc_ai:.3f}")

    fpr2, tpr2, _ = roc_curve(y, s_tok)
    plt.plot(fpr2, tpr2, "--", color=BLACK, linewidth=2,
             label=f"Revised Tokuhashi (cutoff), AUC = {auc_tok:.3f}")

    fpr3, tpr3, _ = roc_curve(y, s_kat)
    plt.plot(fpr3, tpr3, ":", color=BLACK, linewidth=2,
             label=f"New Katagiri (cutoff), AUC = {auc_kat:.3f}")

    plt.xlabel("1 – Specificity", fontsize=24)
    plt.ylabel("Sensitivity", fontsize=24)
    plt.xticks(fontsize=24)
    plt.yticks(fontsize=24)

    leg = plt.legend(fontsize=18, loc="center left",
                     bbox_to_anchor=(1.02, 0.5), frameon=True)
    leg.get_frame().set_edgecolor("black")

    plt.tight_layout(rect=[0, 0, 0.85, 1])
    plt.savefig(save_path, dpi=600, bbox_inches="tight")
    plt.close()

def plot_calibration(y, p, save_path, bins=10):
    dfc = pd.DataFrame({"y": y, "p": p})
    dfc["bin"] = pd.qcut(dfc["p"], q=bins, duplicates="drop")
    g = dfc.groupby("bin").agg(obs=("y", "mean"), pred=("p", "mean"))

    plt.figure(figsize=(5, 5))
    plt.plot([0, 1], [0, 1], "--", color="gray")
    plt.plot(g["pred"], g["obs"], "o-", color=BLACK, linewidth=2)

    plt.xlabel("Predicted survival probability", fontsize=20)
    plt.ylabel("Observed survival probability", fontsize=20)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)

    plt.tight_layout()
    plt.savefig(save_path, dpi=600)
    plt.close()

# -----------------------------
# 3) Calibration slope/intercept (quantitative)
# -----------------------------
def calibration_slope_intercept(y_true, p_cal):
    """
    Logistic calibration on log-odds:
      logit(y) = intercept + slope * logit(p)
    Returns (slope, intercept).
    """
    y_true = np.asarray(y_true).astype(int)
    p = np.clip(np.asarray(p_cal, dtype=float), 1e-6, 1 - 1e-6)
    log_odds = np.log(p / (1 - p)).reshape(-1, 1)

    lr = LogisticRegression(penalty=None, solver="lbfgs", max_iter=1000)
    lr.fit(log_odds, y_true)

    slope = float(lr.coef_[0][0])
    intercept = float(lr.intercept_[0])
    return slope, intercept

# -----------------------------
# 4) AUC CI / threshold / metrics
# -----------------------------
def bootstrap_auc_ci(y_true, y_score, n_boot=2000, seed=42):
    rng = np.random.default_rng(seed)
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    n = len(y_true)

    aucs = []
    for _ in range(n_boot):
        idx = rng.integers(0, n, n)
        if len(np.unique(y_true[idx])) < 2:
            continue
        aucs.append(roc_auc_score(y_true[idx], y_score[idx]))

    auc = roc_auc_score(y_true, y_score)
    if len(aucs) == 0:
        return float(auc), float("nan"), float("nan")
    lo, hi = np.percentile(np.array(aucs), [2.5, 97.5])
    return float(auc), float(lo), float(hi)

def best_threshold_youden(y_true, y_score):
    fpr, tpr, thr = roc_curve(y_true, y_score)
    j = tpr - fpr
    k = int(np.argmax(j))
    return float(thr[k])

def metrics_at_threshold(y_true, y_prob, thr):
    y_pred = (y_prob >= thr).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sens = tp / (tp + fn) if (tp + fn) else np.nan
    spec = tn / (tn + fp) if (tn + fp) else np.nan
    f1 = f1_score(y_true, y_pred)
    return float(sens), float(spec), float(f1)

# -----------------------------
# 5) DeLong test (AUC difference)
# -----------------------------
try:
    from scipy import stats
except Exception:
    stats = None

def _compute_midrank(x):
    J = np.argsort(x)
    Z = x[J]
    N = len(x)
    T = np.zeros(N, dtype=float)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = 0.5 * (i + j - 1) + 1
        i = j
    T2 = np.empty(N, dtype=float)
    T2[J] = T
    return T2

def _fast_delong(predictions_sorted_transposed, label_1_count):
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m

    pos = predictions_sorted_transposed[:, :m]
    neg = predictions_sorted_transposed[:, m:]

    k = predictions_sorted_transposed.shape[0]
    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)

    for r in range(k):
        tx[r, :] = _compute_midrank(pos[r, :])
        ty[r, :] = _compute_midrank(neg[r, :])
        tz[r, :] = _compute_midrank(predictions_sorted_transposed[r, :])

    aucs = (tz[:, :m].sum(axis=1) - m * (m + 1) / 2) / (m * n)
    v01 = (tz[:, :m] - tx) / n
    v10 = 1.0 - (tz[:, m:] - ty) / m

    sx = np.atleast_2d(np.cov(v01))
    sy = np.atleast_2d(np.cov(v10))
    s = sx / m + sy / n
    return aucs, s

def delong_pvalue(y_true, y_score_1, y_score_2):
    """
    Two-sided DeLong test p-value for difference in AUCs.
    y_true: binary (1=positive class)
    scores: higher => more likely y=1
    """
    y_true = np.asarray(y_true).astype(int)
    order = np.argsort(-y_true)  # positives first
    y_sorted = y_true[order]
    preds = np.vstack([y_score_1, y_score_2])[:, order]

    m = int(y_sorted.sum())
    aucs, s = _fast_delong(preds, m)

    diff = aucs[0] - aucs[1]
    var = s[0, 0] + s[1, 1] - 2 * s[0, 1]
    z = diff / np.sqrt(var + 1e-12)

    if stats is None:
        import math
        p = math.erfc(abs(z) / math.sqrt(2))
    else:
        p = 2 * (1 - stats.norm.cdf(abs(z)))

    return float(p), float(aucs[0]), float(aucs[1])

def bonferroni_adjust(p, m):
    return float(min(p * m, 1.0))

# -----------------------------
# 6) DCA utilities (event = death)
# -----------------------------
def decision_curve_net_benefit(y_event, p_event, thresholds):
    y_event = np.asarray(y_event).astype(int)
    p_event = np.asarray(p_event).astype(float)
    n = len(y_event)

    nb = []
    for pt in thresholds:
        pred = (p_event >= pt).astype(int)
        tp = np.sum((pred == 1) & (y_event == 1))
        fp = np.sum((pred == 1) & (y_event == 0))
        w = pt / (1 - pt)
        nb.append(tp / n - (fp / n) * w)
    return np.array(nb)

def decision_curve_baselines(y_event, thresholds):
    prev = np.mean(np.asarray(y_event).astype(int) == 1)
    nb_none = np.zeros_like(thresholds, dtype=float)
    nb_all  = prev - (1 - prev) * (thresholds / (1 - thresholds))
    return nb_none, nb_all

def plot_dca(y_event, curves_dict, save_path, title=None):
    thresholds = np.linspace(DCA_MIN, DCA_MAX, DCA_NPTS)
    nb_none, nb_all = decision_curve_baselines(y_event, thresholds)

    plt.figure(figsize=(6, 5))
    plt.plot(thresholds, nb_none, "--", linewidth=2, label="Treat none")
    plt.plot(thresholds, nb_all,  "-", linewidth=2, label="Treat all")

    for label, p_event in curves_dict.items():
        nb = decision_curve_net_benefit(y_event, p_event, thresholds)
        plt.plot(thresholds, nb, "-", linewidth=2, label=label)

    plt.xlabel("Threshold probability", fontsize=18)
    plt.ylabel("Net benefit", fontsize=18)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    if title:
        plt.title(title, fontsize=18)

    plt.legend(fontsize=11, frameon=True)
    plt.tight_layout()
    plt.savefig(save_path, dpi=600)
    plt.close()

# -----------------------------
# 7) Preprocess / X builder
# -----------------------------
def make_frankel_bin(series: pd.Series) -> pd.Series:
    s = series.astype(str).str.upper().str.strip()
    return s.map({"A": 0, "B": 0, "C": 0, "D": 1, "E": 1}).astype("float")

def build_X(df, feature_cols):
    X = df[feature_cols].copy()
    if ECOG_COL in X.columns:
        X[ECOG_COL] = X[ECOG_COL].astype("Int64")
        X = pd.get_dummies(X, columns=[ECOG_COL], prefix="ECOG", dummy_na=False)
    for k in range(0, 5):
        col = f"ECOG_{k}"
        if col not in X.columns:
            X[col] = 0
    return X

def align_train_test(Xtr, Xte):
    Xtr, Xte = Xtr.align(Xte, join="outer", axis=1, fill_value=0)
    return Xtr, Xte

# -----------------------------
# 8) CV (OOF) with isotonic on OOF
# -----------------------------
def run_lgb_cv_oof(X, y, lgb_params, n_splits=5, seed=42, n_boot=2000):
    y = np.asarray(y).astype(int)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    p_oof = np.zeros(len(y), dtype=float)
    train_aucs = []

    for tr, te in skf.split(X, y):
        model = lgb.LGBMClassifier(**lgb_params)
        model.fit(X.iloc[tr], y[tr])

        p_oof[te] = model.predict_proba(X.iloc[te])[:, 1]

        p_tr = model.predict_proba(X.iloc[tr])[:, 1]
        train_aucs.append(roc_auc_score(y[tr], p_tr))

    iso = IsotonicRegression(out_of_bounds="clip")
    iso.fit(p_oof, y)
    p_cal = iso.transform(p_oof)

    auc, lo, hi = bootstrap_auc_ci(y, p_cal, n_boot=n_boot, seed=seed)

    thr = best_threshold_youden(y, p_cal)
    sens, spec, f1 = metrics_at_threshold(y, p_cal, thr)
    brier = float(brier_score_loss(y, p_cal))

    cal_slope, cal_intercept = calibration_slope_intercept(y, p_cal)

    train_auc_mean = float(np.mean(train_aucs))
    train_auc_sd   = float(np.std(train_aucs))
    delta_train_oof = float(train_auc_mean - auc)

    return {
        "p_oof": p_oof,
        "p_cal": p_cal,
        "iso": iso,
        "auc": auc, "auc_lo": lo, "auc_hi": hi,
        "sens": sens, "spec": spec, "f1": f1, "brier": brier,
        "thr": thr,
        "train_auc_mean": train_auc_mean,
        "train_auc_sd": train_auc_sd,
        "delta_train_oof": delta_train_oof,
        "cal_slope": cal_slope,
        "cal_intercept": cal_intercept,
    }

# -----------------------------
# 9) Temporal validation (train years -> test years)
# -----------------------------
def run_temporal_validation(df, y_col, feature_cols, lgb_params, train_years, test_years, seed=42):
    df = df.copy()
    df["year"] = pd.to_datetime(df[DATE_COL]).dt.year

    tr0, tr1 = train_years
    te0, te1 = test_years

    dtr = df[(df["year"] >= tr0) & (df["year"] <= tr1)].copy()
    dte = df[(df["year"] >= te0) & (df["year"] <= te1)].copy()

    Xtr = build_X(dtr, feature_cols)
    Xte = build_X(dte, feature_cols)
    Xtr, Xte = align_train_test(Xtr, Xte)

    ytr = dtr.loc[Xtr.index, y_col].astype(int).values
    yte = dte.loc[Xte.index, y_col].astype(int).values

    # calibration mapping from TRAIN OOF only (no leakage)
    cv_tr = run_lgb_cv_oof(Xtr, ytr, lgb_params, n_splits=N_SPLITS, seed=seed, n_boot=200)
    iso = cv_tr["iso"]
    thr_train = cv_tr["thr"]

    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(Xtr, ytr)

    p_test_raw = model.predict_proba(Xte)[:, 1]
    p_test_cal = iso.transform(p_test_raw)

    auc_raw = roc_auc_score(yte, p_test_raw) if len(np.unique(yte)) == 2 else np.nan
    auc_cal = roc_auc_score(yte, p_test_cal) if len(np.unique(yte)) == 2 else np.nan

    sens, spec, f1 = metrics_at_threshold(yte, p_test_cal, thr_train)
    brier = float(brier_score_loss(yte, p_test_cal))

    cal_slope, cal_intercept = calibration_slope_intercept(yte, p_test_cal)

    return {
        "train_years": f"{train_years[0]}–{train_years[1]}",
        "test_years": f"{test_years[0]}–{test_years[1]}",
        "n_test": int(len(yte)),
        "deaths_test": int((yte == 0).sum()),
        "auc_raw": float(auc_raw),
        "auc_cal": float(auc_cal),
        "sens_cal": sens,
        "spec_cal": spec,
        "f1_cal": f1,
        "brier_cal": brier,
        "cal_slope": float(cal_slope),
        "cal_intercept": float(cal_intercept),
    }

# -----------------------------
# 10) MAIN
# -----------------------------
def main():
    df = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)

    # Create Frankel_bin
    df["Frankel_bin"] = make_frankel_bin(df["Frankel Grade"])

    full_features  = PREOP_FEATURES + INTRAOP_FEATURES
    preop_features = PREOP_FEATURES

    # Complete-case cohort for FULL analysis (ensures same cohort for both models)
    needed = [DATE_COL, TOK_COL, KAT_COL] + list(Y_COLS.values()) + full_features
    dff = df.dropna(subset=needed).copy()

    # ✅ Conventional scores BINARIZED per paper
    #   Revised Tokuhashi ≥ 9  -> predicts survival (1)
    #   New Katagiri < 7       -> predicts survival (1)
    tok_score_surv = (dff[TOK_COL].astype(float).values >= 9).astype(float)  # 0/1
    kat_score_surv = (dff[KAT_COL].astype(float).values < 7).astype(float)   # 0/1

    # ✅ For DCA (event=death), convert to death risk
    tok_risk01 = 1.0 - tok_score_surv  # 1=death risk (binary)
    kat_risk01 = 1.0 - kat_score_surv  # 1=death risk (binary)

    # LightGBM params (tuned)
    lgb_params = dict(
        learning_rate=0.05,
        num_leaves=31,
        n_estimators=500,
        reg_lambda=1.0,
        class_weight="balanced",
        random_state=RANDOM_SEED,
    )

    # Build X matrices
    X_full  = build_X(dff, full_features)
    X_preop = build_X(dff, preop_features)
    X_full, X_preop = align_train_test(X_full, X_preop)

    rows_cv = []
    rows_tv = []
    rows_delong = []

    for tp, ycol in Y_COLS.items():
        y = dff[ycol].astype(int).values  # 1=survival
        deaths = int((y == 0).sum())
        n = int(len(y))

        # ---------- CV (OOF) ----------
        res_full = run_lgb_cv_oof(X_full,  y, lgb_params, n_splits=N_SPLITS, seed=RANDOM_SEED, n_boot=N_BOOT)
        res_pre  = run_lgb_cv_oof(X_preop, y, lgb_params, n_splits=N_SPLITS, seed=RANDOM_SEED, n_boot=N_BOOT)

        # Conventional AUC + CI (bootstrap) on binary predictors (paper-consistent)
        tok_auc, tok_lo, tok_hi = bootstrap_auc_ci(y, tok_score_surv, n_boot=N_BOOT, seed=RANDOM_SEED)
        kat_auc, kat_lo, kat_hi = bootstrap_auc_ci(y, kat_score_surv, n_boot=N_BOOT, seed=RANDOM_SEED)

        # Save ROC/Calibration (AUC displayed = Excel values)
        plot_roc(
            y, res_full["p_cal"], tok_score_surv, kat_score_surv,
            OUT_DIR / f"ROC_{tp}_Full.png",
            auc_ai=res_full["auc"], auc_tok=tok_auc, auc_kat=kat_auc
        )
        plot_calibration(y, res_full["p_cal"], OUT_DIR / f"Calibration_{tp}_Full.png")

        plot_roc(
            y, res_pre["p_cal"], tok_score_surv, kat_score_surv,
            OUT_DIR / f"ROC_{tp}_PreopOnly.png",
            auc_ai=res_pre["auc"], auc_tok=tok_auc, auc_kat=kat_auc
        )
        plot_calibration(y, res_pre["p_cal"], OUT_DIR / f"Calibration_{tp}_PreopOnly.png")

        # ---------- DCA (EVENT=DEATH) ----------
        y_event = (y == 0).astype(int)
        p_event_full = 1.0 - res_full["p_cal"]
        p_event_pre  = 1.0 - res_pre["p_cal"]

        curves = {
            "Full (Preop+Intraop)": p_event_full,
            "Preop-only": p_event_pre,
            "Revised Tokuhashi (cutoff)": tok_risk01,  # binary death-risk
            "New Katagiri (cutoff)": kat_risk01,       # binary death-risk
        }
        plot_dca(
            y_event,
            curves,
            OUT_DIR / f"DCA_{tp}.png",
            title=f"Decision Curve Analysis ({tp} mortality)"
        )

        # ---------- DeLong tests ----------
        # Primary: Full vs Tokuhashi; Full vs Katagiri
        p_full_tok, _, _ = delong_pvalue(y, res_full["p_cal"], tok_score_surv)
        p_full_kat, _, _ = delong_pvalue(y, res_full["p_cal"], kat_score_surv)
        # Supportive: Full vs Preop-only
        p_full_pre, _, _ = delong_pvalue(y, res_full["p_cal"], res_pre["p_cal"])

        rows_delong.append({
            "Timepoint": tp,
            "p_Full_vs_Tokuhashi": p_full_tok,
            "p_Full_vs_Katagiri": p_full_kat,
            "p_Full_vs_PreopOnly_supportive": p_full_pre,
            "p_Full_vs_Tokuhashi_BonferroniAdj": bonferroni_adjust(p_full_tok, N_PRIMARY_TESTS),
            "p_Full_vs_Katagiri_BonferroniAdj": bonferroni_adjust(p_full_kat, N_PRIMARY_TESTS),
            "Primary_alpha_Bonferroni": ALPHA_PRIMARY_BONF
        })

        rows_cv.append({
            "Timepoint": tp,
            "N": n,
            "Deaths_by_timepoint": deaths,

            "Full_AUC": res_full["auc"],
            "Full_AUC_Lo": res_full["auc_lo"],
            "Full_AUC_Hi": res_full["auc_hi"],
            "Full_Sens": res_full["sens"],
            "Full_Spec": res_full["spec"],
            "Full_F1": res_full["f1"],
            "Full_Brier": res_full["brier"],
            "Full_CalSlope": res_full["cal_slope"],
            "Full_CalIntercept": res_full["cal_intercept"],
            "Full_TrainAUC_mean": res_full["train_auc_mean"],
            "Full_TrainAUC_sd": res_full["train_auc_sd"],
            "Full_DeltaTrainMinusOOF": res_full["delta_train_oof"],

            "PreopOnly_AUC": res_pre["auc"],
            "PreopOnly_AUC_Lo": res_pre["auc_lo"],
            "PreopOnly_AUC_Hi": res_pre["auc_hi"],
            "PreopOnly_Sens": res_pre["sens"],
            "PreopOnly_Spec": res_pre["spec"],
            "PreopOnly_F1": res_pre["f1"],
            "PreopOnly_Brier": res_pre["brier"],
            "PreopOnly_CalSlope": res_pre["cal_slope"],
            "PreopOnly_CalIntercept": res_pre["cal_intercept"],
            "PreopOnly_TrainAUC_mean": res_pre["train_auc_mean"],
            "PreopOnly_TrainAUC_sd": res_pre["train_auc_sd"],
            "PreopOnly_DeltaTrainMinusOOF": res_pre["delta_train_oof"],

            "Tokuhashi_AUC": tok_auc,
            "Tokuhashi_AUC_Lo": tok_lo,
            "Tokuhashi_AUC_Hi": tok_hi,

            "Katagiri_AUC": kat_auc,
            "Katagiri_AUC_Lo": kat_lo,
            "Katagiri_AUC_Hi": kat_hi,
        })

        # ---------- Temporal validation ----------
        tv_full = run_temporal_validation(dff, ycol, full_features,  lgb_params, TRAIN_YEARS, TEST_YEARS, seed=RANDOM_SEED)
        tv_pre  = run_temporal_validation(dff, ycol, preop_features, lgb_params, TRAIN_YEARS, TEST_YEARS, seed=RANDOM_SEED)

        rows_tv.append({
            "Timepoint": tp,
            "TrainYears": tv_full["train_years"],
            "TestYears": tv_full["test_years"],
            "TestN": tv_full["n_test"],
            "TestDeaths": tv_full["deaths_test"],

            "Temporal_Full_AUC_raw": tv_full["auc_raw"],
            "Temporal_Full_AUC_cal": tv_full["auc_cal"],
            "Temporal_Full_Sens_cal": tv_full["sens_cal"],
            "Temporal_Full_Spec_cal": tv_full["spec_cal"],
            "Temporal_Full_F1_cal": tv_full["f1_cal"],
            "Temporal_Full_Brier_cal": tv_full["brier_cal"],
            "Temporal_Full_CalSlope": tv_full["cal_slope"],
            "Temporal_Full_CalIntercept": tv_full["cal_intercept"],

            "Temporal_PreopOnly_AUC_raw": tv_pre["auc_raw"],
            "Temporal_PreopOnly_AUC_cal": tv_pre["auc_cal"],
            "Temporal_PreopOnly_Sens_cal": tv_pre["sens_cal"],
            "Temporal_PreopOnly_Spec_cal": tv_pre["spec_cal"],
            "Temporal_PreopOnly_F1_cal": tv_pre["f1_cal"],
            "Temporal_PreopOnly_Brier_cal": tv_pre["brier_cal"],
            "Temporal_PreopOnly_CalSlope": tv_pre["cal_slope"],
            "Temporal_PreopOnly_CalIntercept": tv_pre["cal_intercept"],
        })

    df_cv = pd.DataFrame(rows_cv)
    df_tv = pd.DataFrame(rows_tv)
    df_del = pd.DataFrame(rows_delong)

    # ΔAUC (Temporal_cal - CV_AUC)
    df_delta = df_cv.merge(
        df_tv[["Timepoint", "Temporal_Full_AUC_cal", "Temporal_PreopOnly_AUC_cal"]],
        on="Timepoint", how="left"
    )
    df_delta["DeltaAUC_TemporalMinusCV_Full"] = df_delta["Temporal_Full_AUC_cal"] - df_delta["Full_AUC"]
    df_delta["DeltaAUC_TemporalMinusCV_PreopOnly"] = df_delta["Temporal_PreopOnly_AUC_cal"] - df_delta["PreopOnly_AUC"]

    # Save Excel
    xlsx_path = OUT_DIR / "Performance_Summary_with_CV_Temporal_DCA_DeLong.xlsx"
    with pd.ExcelWriter(xlsx_path) as writer:
        df_cv.to_excel(writer, sheet_name="CV_Summary", index=False)
        df_tv.to_excel(writer, sheet_name="Temporal_Validation", index=False)
        df_delta.to_excel(writer, sheet_name="CV_vs_Temporal_Delta", index=False)
        df_del.to_excel(writer, sheet_name="DeLong_pvalues", index=False)

    print("Saved Excel:", xlsx_path)
    print("Saved ROC/Calibration/DCA plots to:", OUT_DIR)
    print(f"Primary Bonferroni alpha = {ALPHA_PRIMARY_BONF:.4f} (6 prespecified tests)")
    print(f"DCA thresholds: {DCA_MIN:.2f}–{DCA_MAX:.2f} ({DCA_NPTS} points)")

    if stats is None:
        print("NOTE: SciPy not found. DeLong p-values computed using erfc-based normal approximation.")
        print("If you prefer SciPy exact normal CDF, install: pip install scipy")

if __name__ == "__main__":
    main()

# -----------------------------
# Methods sentence (paste-ready, ESJ)
# -----------------------------
# “Both the full model (preoperative + intraoperative variables) and the preoperative-only model
#  were evaluated on the same complete-case cohort (n=188), defined by the availability of all
#  variables required for the full model, to ensure a fair head-to-head comparison.
#  Conventional scoring systems were evaluated using established binary cutoffs
#  (revised Tokuhashi ≥ 9; new Katagiri < 7), consistent with their intended clinical use.”


[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
[LightGBM] [Info] Number of positive: 113, number of negative: 37
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000073 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 263
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 114, number of negative: 36
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000071 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 266
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from

  plt.tight_layout(rect=[0, 0, 0.85, 1])
  g = dfc.groupby("bin").agg(obs=("y", "mean"), pred=("p", "mean"))
  plt.tight_layout(rect=[0, 0, 0.85, 1])
  g = dfc.groupby("bin").agg(obs=("y", "mean"), pred=("p", "mean"))


[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
[LightGBM] [Info] Number of positive: 86, number of negative: 64
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 264
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 86, number of negative: 64
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000025 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 269
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.5000

  plt.tight_layout(rect=[0, 0, 0.85, 1])
  g = dfc.groupby("bin").agg(obs=("y", "mean"), pred=("p", "mean"))
  plt.tight_layout(rect=[0, 0, 0.85, 1])
  g = dfc.groupby("bin").agg(obs=("y", "mean"), pred=("p", "mean"))


[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
[LightGBM] [Info] Number of positive: 65, number of negative: 85
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000082 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 266
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 66, number of negative: 84
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000064 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 6

  plt.tight_layout(rect=[0, 0, 0.85, 1])
  g = dfc.groupby("bin").agg(obs=("y", "mean"), pred=("p", "mean"))
  plt.tight_layout(rect=[0, 0, 0.85, 1])
  g = dfc.groupby("bin").agg(obs=("y", "mean"), pred=("p", "mean"))


[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
[LightGBM] [Info] Number of positive: 29, number of negative: 39
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150
[LightGBM] [Info] Number of data points in the train set: 68, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 29, number of negative: 39
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149
[LightGBM] [Info] Number of data points in the train set: 68, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score