<a href="https://colab.research.google.com/github/Kenny625819/Applied-Data-Science/blob/main/European_submit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =============================================================================
#  European Spine Journal – Full Reproducibility Pipeline (FINAL VERSION)
#  Includes Calibration Slope, Intercept, Hosmer–Lemeshow Test
# =============================================================================

!pip install -q lightgbm shap scikit-learn pandas matplotlib numpy xlsxwriter openpyxl

# -------------------------------- 2. Imports ---------------------------------
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from numpy.random import default_rng
from scipy.stats import norm, chi2

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    roc_auc_score, roc_curve, confusion_matrix,
    precision_recall_fscore_support, brier_score_loss
)
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression

import lightgbm as lgb
import shap

plt.rcParams["font.family"] = "DejaVu Sans"
plt.rcParams["axes.unicode_minus"] = False
BLACK = "black"

# --------------------------- Output directory --------------------------------
OUT = Path("/content/results_esj_final")
OUT.mkdir(exist_ok=True)
print("Output directory:", OUT)

# ---------------------------- 3. Load Excel file -----------------------------
xls = [f for f in os.listdir("/content") if f.lower().endswith(".xlsx")]
if not xls:
    from google.colab import files
    print("Please upload the .xlsx dataset.")
    files.upload()
    xls = [f for f in os.listdir("/content") if f.lower().endswith(".xlsx")]
DATA_PATH = f"/content/{xls[0]}"
print(f"Using dataset → {DATA_PATH}")

# ---------------------------- 4. Data Preprocessing ---------------------------
df = pd.read_excel(DATA_PATH)
df.columns = df.columns.str.replace(r"[\n\u3000]", " ", regex=True).str.strip()

# Detect year column
year_col = None
for c in ["Year", "Surgery Year", "year", "ope date", "Ope date", "OPE DATE", "Surgery Date", "Date"]:
    if c in df.columns:
        year_col = c
        break

df["_YEAR_"] = pd.to_datetime(df[year_col], errors="coerce").dt.year if year_col else np.nan

# Survival endpoints
timepoints = {
    "3M": "3Month Survival",
    "6M": "6Month Survival",
    "12M": "12Month Survival"
}

# Binary cutoffs
df["Tokuhashi_binary"] = (df["Revised Tokuhashi score"] >= 9).astype(int)
df["Katagiri_binary"]  = (df["New Katagiri score"] < 7).astype(int)

# Frankel / Visceral mapping
def frankel_bin(x):
    if pd.isnull(x): return np.nan
    s = str(x).strip().upper()
    if s in ["A", "B", "C"]: return 0
    if s in ["D", "E"]: return 1
    try: return int(float(s))
    except: return np.nan

def map_visceral(x):
    if isinstance(x, str):
        s = x.strip().lower()
        if s in ["yes", "y", "1", "true"]: return 1
        if s in ["no", "n", "0", "false"]: return 0
    try: return int(x)
    except: return np.nan

df["_frankel_bin"]  = df.get("Frankel Grade", pd.Series([np.nan]*len(df))).apply(frankel_bin)
df["_visc_met_bin"] = df.get("Visceral Metastasis", pd.Series([np.nan]*len(df))).apply(map_visceral)

# Feature list
features = [
    "Age", "Sex (Male/Female)", "BMI", "ECOGPS",
    "_frankel_bin", "Barthel Index", "Serum Albumin", "CRP",
    "Number of Spinal Metastases", "Malignancy (Katagiri Score)",
    "_visc_met_bin", "Operation Time", "Intraoperative Blood Loss"
]
features = [f for f in features if f in df.columns]

df = df.dropna(subset=features + list(timepoints.values())).reset_index(drop=True)

# ----------------------------- 5. SHAP rename table ---------------------------
rename_shap = {
    "Age": "Age",
    "Sex (Male/Female)": "Sex",
    "BMI": "BMI",
    "ECOGPS": "ECOG performance status",
    "_frankel_bin": "Frankel grade",
    "Barthel Index": "Barthel Index",
    "Serum Albumin": "Albumin",
    "CRP": "CRP",
    "Number of Spinal Metastases": "Number of spinal metastases",
    "Malignancy (Katagiri Score)": "Malignancy (New Katagiri score)",
    "_visc_met_bin": "Visceral metastasis",
    "Operation Time": "Operation time",
    "Intraoperative Blood Loss": "Blood loss"
}

# ----------------------------- 6. Utility functions --------------------------
def auc_ci(y, s, n_boot=2000, seed=42):
    rng = default_rng(seed)
    idx = np.arange(len(y))
    aucs = []
    for _ in range(n_boot):
        b = rng.choice(idx, len(idx), replace=True)
        try: aucs.append(roc_auc_score(y[b], s[b]))
        except: pass
    m = roc_auc_score(y, s)
    lo, hi = np.percentile(aucs, [2.5, 97.5])
    return m, lo, hi

def delong_test(y, s1, s2):
    def midrank(x):
        J = np.argsort(x)
        Z = x[J]
        N = len(x)
        T = np.zeros(N)
        i=0
        while i<N:
            j=i
            while j<N and Z[j]==Z[i]: j+=1
            T[i:j] = 0.5*(i+j-1)+1
            i=j
        out=np.empty(N)
        out[J]=T
        return out

    y=np.asarray(y)
    order=np.argsort(-s1)
    y,s1,s2=y[order],s1[order],s2[order]
    m=int(np.sum(y))

    P=np.vstack((s1,s2))
    aucs=[roc_auc_score(y,s1), roc_auc_score(y,s2)]

    def cov(P,m):
        n=P.shape[1]-m
        Tx=np.array([midrank(r[:m]) for r in P])
        Ty=np.array([midrank(r[m:]) for r in P])
        Txy=np.array([midrank(r) for r in P])
        V10=(Txy[:,:m]-Tx)/n
        V01=(Ty-Txy[:,m:])/m
        return np.cov(V10)/m + np.cov(V01)/n

    C=cov(P,m)
    diff=aucs[0]-aucs[1]
    var=C[0,0]+C[1,1]-2*C[0,1]
    z=diff/np.sqrt(var)
    p=2*(1-norm.cdf(abs(z)))
    return diff,p

# ------------- 6b. Calibration slope/intercept/HL test -----------------------
def calibration_metrics(y_true, y_prob, bins=10):
    eps = 1e-6
    y_prob_adj = np.clip(y_prob, eps, 1-eps)
    logit_p = np.log(y_prob_adj/(1-y_prob_adj))

    lr = LogisticRegression(fit_intercept=True, solver="lbfgs")
    lr.fit(logit_p.reshape(-1,1), y_true)
    slope = lr.coef_[0][0]
    intercept = lr.intercept_[0]

    dfc = pd.DataFrame({"y": y_true, "p": y_prob})
    dfc["bin"] = pd.qcut(dfc["p"], q=bins, duplicates="drop")
    g = dfc.groupby("bin").agg(obs=("y","sum"), n=("y","count"), pred=("p","mean"))

    hl_stat = np.sum((g["obs"] - g["n"]*g["pred"])**2 / (g["n"]*g["pred"]*(1-g["pred"])))
    hl_p = 1 - chi2.cdf(hl_stat, bins-2)

    return slope, intercept, hl_p

# ------------------------------ 7. ROC Plot ----------------------------------
def plot_roc(y, s_ai, s_tok, s_kat, name):
    auc_ai  = roc_auc_score(y,s_ai)
    auc_tok = roc_auc_score(y,s_tok)
    auc_kat = roc_auc_score(y,s_kat)

    plt.figure(figsize=(6,5))
    plt.plot([0,1],[0,1],"--",color="gray")

    fpr,tpr,_ = roc_curve(y,s_ai)
    plt.plot(fpr,tpr,"-", color=BLACK,linewidth=2.5,
             label=f"AI (LightGBM), AUC = {auc_ai:.3f}")

    fpr2,tpr2,_ = roc_curve(y,s_tok)
    plt.plot(fpr2,tpr2,"--", color=BLACK,linewidth=2,
             label=f"Revised Tokuhashi score, AUC = {auc_tok:.3f}")

    fpr3,tpr3,_ = roc_curve(y,s_kat)
    plt.plot(fpr3,tpr3,":", color=BLACK,linewidth=2,
             label=f"New Katagiri score, AUC = {auc_kat:.3f}")

    plt.xlabel("1 – Specificity", fontsize=16)
    plt.ylabel("Sensitivity", fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)

    leg=plt.legend(fontsize=11,loc="lower right",frameon=True)
    leg.get_frame().set_edgecolor("black")

    plt.tight_layout()
    plt.savefig(OUT / f"{name}.png", dpi=600)
    plt.close()

# ---------------------------- 8. Calibration Plot ----------------------------
def plot_calibration(y,p,name,bins=10):
    dfc=pd.DataFrame({"y":y,"p":p})
    dfc["bin"]=pd.qcut(dfc["p"],q=bins,duplicates="drop")
    g=dfc.groupby("bin").agg(obs=("y","mean"),pred=("p","mean"))

    plt.figure(figsize=(5,5))
    plt.plot([0,1],[0,1],"--",color="gray")
    plt.plot(g["pred"],g["obs"],"o-",color=BLACK,linewidth=2)

    plt.xlabel("Predicted survival probability",fontsize=16)
    plt.ylabel("Observed survival probability",fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)

    plt.tight_layout()
    plt.savefig(OUT / f"{name}.png", dpi=600)
    plt.close()

# ---------------- 9. Cross-validation + isotonic calibration -----------------
lgb_params = dict(
    objective="binary",
    metric="auc",
    boosting_type="gbdt",
    learning_rate=0.05,
    num_leaves=31,
    n_estimators=500,
    reg_lambda=1.0,
    class_weight="balanced",
    random_state=42,
)

def run_cv(X,y,tok,kat):
    skf=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
    p_ai=np.zeros_like(y,dtype=float)

    for tr,te in skf.split(X,y):
        model=lgb.LGBMClassifier(**lgb_params)
        model.fit(X.iloc[tr],y[tr])
        p_ai[te]=model.predict_proba(X.iloc[te])[:,1]

    iso=IsotonicRegression(out_of_bounds="clip")
    iso.fit(p_ai,y)
    p_cal=iso.transform(p_ai)

    ai,ai_lo,ai_hi = auc_ci(y,p_cal)
    tk,tk_lo,tk_hi = auc_ci(y,tok)
    kt,kt_lo,kt_hi = auc_ci(y,kat)

    brier=brier_score_loss(y,p_cal)

    fpr,tpr,thr = roc_curve(y,p_cal)
    thr_best = thr[np.argmax(tpr - fpr)]
    y_hat = (p_cal >= thr_best).astype(int)

    tn,fp,fn,tp = confusion_matrix(y,y_hat).ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    _,_,f1,_ = precision_recall_fscore_support(y,y_hat,average="binary")

    _,ptk = delong_test(y,p_cal,tok)
    _,pkt = delong_test(y,p_cal,kat)

    # Calibration metrics
    slope, intercept, hl_p = calibration_metrics(y,p_cal)

    return dict(
        p_cal=p_cal,
        auc_ai=ai, ai_lo=ai_lo, ai_hi=ai_hi,
        auc_tok=tk, tk_lo=tk_lo, tk_hi=tk_hi,
        auc_kat=kt, kt_lo=kt_lo, kt_hi=kt_hi,
        sens=sens, spec=spec, f1=f1,
        brier=brier,
        ptk=ptk, pkt=pkt,
        slope=slope, intercept=intercept, hl=hl_p
    )

# --------------------------- 10. SHAP Top7 -----------------------------------
def shap_top7(model,X,tp):
    Xd=X.rename(columns=rename_shap)

    explainer=shap.TreeExplainer(model)
    sv=explainer.shap_values(Xd)
    if isinstance(sv, list): sv=sv[1]

    mean_abs=np.abs(sv).mean(axis=0)
    df_sh=pd.DataFrame({"Feature":Xd.columns,"mean(|SHAP|)":mean_abs})
    df_sh=df_sh.nlargest(7,"mean(|SHAP|)").sort_values("mean(|SHAP|)")

    plt.figure(figsize=(8,4))
    plt.barh(df_sh["Feature"], df_sh["mean(|SHAP|)"])
    plt.xlim(0,2.5)
    plt.xlabel("mean(|SHAP value|)",fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)

    plt.tight_layout()
    plt.savefig(OUT/f"SHAP_{tp}_top7.png",dpi=600)
    plt.close()

    df_sh.to_excel(OUT/f"SHAP_table_{tp}_top7.xlsx",index=False)

# --------------------- 11. SHAP Heatmap (all features) -----------------------
def compute_shap_abs(model,X):
    explainer=shap.TreeExplainer(model)
    sv=explainer.shap_values(X)
    if isinstance(sv,list): sv=sv[1]
    return np.abs(sv)

def train_models_for_shap(df,features,timepoints):
    models={}
    for tp,col in timepoints.items():
        y=df[col].astype(int).values
        X=pd.get_dummies(df[features],drop_first=True)
        m=lgb.LGBMClassifier(**lgb_params)
        m.fit(X,y)
        models[tp]=m
    return models

def shap_heatmap_all_features(df,features,timepoints):
    X=pd.get_dummies(df[features],drop_first=True)
    Xd=X.rename(columns=rename_shap)

    models=train_models_for_shap(df,features,timepoints)

    shap_abs={}
    for tp,m in models.items():
        sv_abs=compute_shap_abs(m,Xd)
        shap_abs[tp]=sv_abs.mean(axis=0)

    shap_df=pd.DataFrame(shap_abs,index=Xd.columns)
    shap_df=shap_df.sort_values(by="3M",ascending=False)

    shap_df.to_excel(OUT/"SHAP_AllFeatures_Table.xlsx")

    plt.figure(figsize=(10,12))
    plt.imshow(shap_df.values,cmap="Blues",aspect="auto")

    cbar=plt.colorbar(label="mean(|SHAP|)",fraction=0.035,pad=0.04)
    cbar.ax.tick_params(labelsize=14)

    plt.xticks(np.arange(len(timepoints)),labels=timepoints.keys(),fontsize=16)
    plt.yticks(np.arange(len(shap_df.index)),labels=shap_df.index,fontsize=16)

    plt.title("SHAP Heatmap (All Features)",fontsize=22,pad=20)
    plt.tight_layout()
    plt.savefig(OUT/"SHAP_AllFeatures_Heatmap.png",dpi=600)
    plt.close()

    return shap_df

# --------------------------- 12. Temporal validation -------------------------
def run_temporal_validation(df, features, timepoints):
    print("\n===== Temporal Validation (≤2015 → ≥2016) =====")

    df_old=df[df["_YEAR_"]<=2015]
    df_new=df[df["_YEAR_"]>=2016]

    if df_old.empty or df_new.empty:
        print("Temporal validation skipped (cohorts empty).")
        return None

    results={}
    for tp,col in timepoints.items():
        print(f"\n--- {tp} ---")
        y_tr=df_old[col].astype(int).values
        y_te=df_new[col].astype(int).values

        X_tr=pd.get_dummies(df_old[features],drop_first=True)
        X_te=pd.get_dummies(df_new[features],drop_first=True)

        X_tr, X_te = X_tr.align(X_te, join="left", axis=1, fill_value=0)
        # Train model on old cohort and predict new cohort
        m = lgb.LGBMClassifier(**lgb_params)
        m.fit(X_tr, y_tr)
        p_te = m.predict_proba(X_te)[:, 1]

        auc_tv = roc_auc_score(y_te, p_te)
        results[tp] = auc_tv
        print(f"AUC (Temporal Validation) = {auc_tv:.3f}")

    # Save to Excel
    pd.DataFrame.from_dict(results, orient="index", columns=["AUC_temporal"]) \
        .to_excel(OUT / "Temporal_Validation.xlsx")

    print("Temporal validation saved →", OUT / "Temporal_Validation.xlsx")
    return results


# --------------------------- 13. Main pipeline -------------------------------
tables = {}

for tp, col in timepoints.items():
    print(f"\n================ {tp} : {col} ================")

    y = df[col].astype(int).values
    X = pd.get_dummies(df[features], drop_first=True)

    tok = df["Tokuhashi_binary"].values.astype(float)
    kat = df["Katagiri_binary"].values.astype(float)

    cv = run_cv(X, y, tok, kat)

    # ROC & calibration plots
    plot_roc(y, cv["p_cal"], tok, kat, f"ROC_{tp}")
    plot_calibration(y, cv["p_cal"], f"Calibration_{tp}")

    # Final model for SHAP Top7
    model_final = lgb.LGBMClassifier(**lgb_params)
    model_final.fit(X, y)
    shap_top7(model_final, X, tp)

    # Collect results
    tables[tp] = {
        "Timepoint": tp,
        "AI AUC (95%CI)":        f"{cv['auc_ai']:.3f} ({cv['ai_lo']:.3f}–{cv['ai_hi']:.3f})",
        "Tokuhashi AUC (95%CI)": f"{cv['auc_tok']:.3f} ({cv['tk_lo']:.3f}–{cv['tk_hi']:.3f})",
        "Katagiri AUC (95%CI)":  f"{cv['auc_kat']:.3f} ({cv['kt_lo']:.3f}–{cv['kt_hi']:.3f})",
        "Sensitivity":           f"{cv['sens']:.3f}",
        "Specificity":           f"{cv['spec']:.3f}",
        "F1-score":              f"{cv['f1']:.3f}",
        "Brier (iso)":           f"{cv['brier']:.3f}",
        "Calibration slope":     f"{cv['slope']:.3f}",
        "Calibration intercept": f"{cv['intercept']:.3f}",
        "HL p-value":            f"{cv['hl']:.3f}",
        "p (AI vs Revised Tokuhashi score)": f"{cv['ptk']:.3e}",
        "p (AI vs New Katagiri score)":      f"{cv['pkt']:.3e}",
    }

# Save performance summary
perf_df = pd.DataFrame(list(tables.values()))
perf_df.to_excel(OUT / "Performance_Summary_ESJ_Final.xlsx", index=False)
print("\nPerformance summary saved →", OUT / "Performance_Summary_ESJ_Final.xlsx")

# SHAP Heatmap & Temporal validation
heatmap_df      = shap_heatmap_all_features(df, features, timepoints)
temporal_result = run_temporal_validation(df, features, timepoints)

print("\nAll generated files in", OUT)
for f in sorted(OUT.iterdir()):
    print(" -", f.name)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/175.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m174.1/175.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hOutput directory: /content/results_esj_final
Using dataset → /content/patient All.xlsx

[LightGBM] [Info] Number of positive: 130, number of negative: 40
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000189 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 282
[LightGBM] [Info] Number of data points in the train set: 170, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 130, number of negat

  g = dfc.groupby("bin").agg(obs=("y","sum"), n=("y","count"), pred=("p","mean"))
  g=dfc.groupby("bin").agg(obs=("y","mean"),pred=("p","mean"))


[LightGBM] [Info] Number of positive: 163, number of negative: 50
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000078 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 329
[LightGBM] [Info] Number of data points in the train set: 213, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000





[LightGBM] [Info] Number of positive: 98, number of negative: 72
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000049 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 288
[LightGBM] [Info] Number of data points in the train set: 170, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 98, number of negative: 72
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000048 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 284
[LightGBM] [Info] Number of data points in the train set: 170, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of p

  g = dfc.groupby("bin").agg(obs=("y","sum"), n=("y","count"), pred=("p","mean"))
  g=dfc.groupby("bin").agg(obs=("y","mean"),pred=("p","mean"))


[LightGBM] [Info] Number of positive: 123, number of negative: 90
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 329
[LightGBM] [Info] Number of data points in the train set: 213, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000





[LightGBM] [Info] Number of positive: 74, number of negative: 96
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000049 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 285
[LightGBM] [Info] Number of data points in the train set: 170, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 74, number of negative: 96
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 284
[LightGBM] [Info] Number of data points in the train set: 170, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of p

  g = dfc.groupby("bin").agg(obs=("y","sum"), n=("y","count"), pred=("p","mean"))
  g=dfc.groupby("bin").agg(obs=("y","mean"),pred=("p","mean"))


[LightGBM] [Info] Number of positive: 93, number of negative: 120
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 329
[LightGBM] [Info] Number of data points in the train set: 213, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000





Performance summary saved → /content/results_esj_final/Performance_Summary_ESJ_Final.xlsx
[LightGBM] [Info] Number of positive: 163, number of negative: 50
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000082 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 329
[LightGBM] [Info] Number of data points in the train set: 213, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 123, number of negative: 90
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000031 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 329
[LightGBM] [Info] Number of data points in the train set: 213, number of used features: 12
[LightG




===== Temporal Validation (≤2015 → ≥2016) =====

--- 3M ---
[LightGBM] [Info] Number of positive: 73, number of negative: 17
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000037 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 187
[LightGBM] [Info] Number of data points in the train set: 90, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
AUC (Temporal Validation) = 0.816

--- 6M ---
[LightGBM] [Info] Number of positive: 51, number of negative: 39
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000032 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 187
[LightGBM] [Info] Number of data points in the train set: 90, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 ->