In [1]:
import os

# Location of the patient feature table read later by pd.read_csv.
# Set the PATIENT_FEATURES_CSV environment variable to point the notebook at
# the file on another machine; when it is unset, the original local path is
# used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "PATIENT_FEATURES_CSV",
    "/Users/shiyalin/Desktop/patient_features_final.csv",
)

import re
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    roc_auc_score, average_precision_score, accuracy_score, f1_score,
    precision_recall_curve, confusion_matrix
)
from xgboost import XGBClassifier

# ---------- Helpers ----------
def split_cols(X: pd.DataFrame):
    """Split the columns of ``X`` into categorical and numerical name lists.

    A column counts as categorical when its dtype is ``object``, a pandas
    ``category`` dtype, or a string dtype. Every other column (ints, floats,
    bools, ...) is reported as numerical.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame whose column dtypes are inspected; values are not read.

    Returns
    -------
    tuple[list, list]
        ``(cat_cols, num_cols)``, each in the original column order.
    """
    def _is_categorical(dtype) -> bool:
        # Checking only `dtype == "object"` misses `category` and string
        # dtypes, which would then be routed to the numerical branch.
        return (
            pd.api.types.is_object_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    cat_cols = [c for c, dtype in X.dtypes.items() if _is_categorical(dtype)]
    cat_lookup = set(cat_cols)
    num_cols = [c for c in X.columns if c not in cat_lookup]
    return cat_cols, num_cols

def make_xgb():
    """Create a fresh, unfitted XGBoost binary classifier.

    The hyper-parameters are fixed here so every caller gets the same model:
    400 shallow trees (depth 4) at learning rate 0.05, row and column
    subsampling of 0.8, ``min_child_weight=5`` and L2 penalty
    ``reg_lambda=2.0``. The seed is fixed at 42.
    """
    # Settings that shape the trees and how strongly they are regularized.
    tree_settings = {
        "n_estimators": 400,
        "learning_rate": 0.05,
        "max_depth": 4,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "min_child_weight": 5,
        "reg_lambda": 2.0,
    }
    # Objective, evaluation metric, and execution settings.
    run_settings = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "n_jobs": -1,
        "random_state": 42,
        "tree_method": "hist",
    }
    return XGBClassifier(**tree_settings, **run_settings)

def build_preprocessor(X: pd.DataFrame):
    """Assemble the unfitted column-wise preprocessing step for ``X``.

    Columns are partitioned with ``split_cols``. Numerical columns are
    median-imputed. Categorical columns are imputed with the most frequent
    value and then one-hot encoded; categories unseen at fit time are ignored
    and sparse output is requested.

    Returns
    -------
    ColumnTransformer
        Transformer with a ``"num"`` and a ``"cat"`` branch bound to the
        column names found in ``X``.
    """
    categorical, numerical = split_cols(X)

    # Try the `sparse_output` keyword first; if the installed scikit-learn
    # rejects it with a TypeError, fall back to the older `sparse` spelling.
    try:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    except TypeError:  # fallback for older sklearn
        encoder = OneHotEncoder(handle_unknown="ignore", sparse=True)

    numeric_branch = Pipeline([("imp", SimpleImputer(strategy="median"))])
    categorical_branch = Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", encoder),
    ])

    return ColumnTransformer([
        ("num", numeric_branch, numerical),
        ("cat", categorical_branch, categorical),
    ])

def evaluate_cv(pipeline: Pipeline, X: pd.DataFrame, y: np.ndarray, name: str, n_splits: int = 5):
    """Run stratified k-fold cross-validation and print per-fold and summary metrics.

    For each fold the pipeline is re-fitted on the training split and scored
    on the held-out split. AUROC, AUPRC, accuracy and F1 at a 0.5 threshold,
    the F1-maximizing threshold, and the confusion-matrix counts are printed
    per fold, followed by mean ± std across folds.

    Parameters
    ----------
    pipeline : Pipeline
        Must expose steps named ``"prep"`` (transformer) and ``"xgb"``
        (classifier with ``predict_proba``). It is fitted in place, so after
        the call it holds the fit from the last fold.
    X : pd.DataFrame
        Feature frame; rows are selected positionally with ``iloc``.
    y : array-like
        Binary 0/1 labels aligned with the rows of ``X``. Anything accepted
        by ``np.asarray`` works, including a pandas Series.
    name : str
        Tag used in every printed line.
    n_splits : int, default 5
        Number of stratified folds (shuffled, seed 42).

    Returns
    -------
    None
        Results are only printed.

    Notes
    -----
    ``best_thr`` / ``best_F1`` are chosen on the same held-out fold they are
    reported for, so they are optimistic compared with a threshold tuned on
    training data only.
    """
    # Fold indices are positional. On a pandas Series `y[tr_idx]` would be a
    # label-based lookup, so convert to a plain ndarray first.
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    rows = []
    for fold, (tr_idx, te_idx) in enumerate(skf.split(X, y), start=1):
        Xtr, Xte = X.iloc[tr_idx], X.iloc[te_idx]
        ytr, yte = y[tr_idx], y[te_idx]

        pipeline.fit(Xtr, ytr)

        # Avoid version issues with pipeline.predict_proba
        prep = pipeline.named_steps["prep"]
        xgb  = pipeline.named_steps["xgb"]
        Xte_t = prep.transform(Xte)
        y_prob = xgb.predict_proba(Xte_t)[:, 1]

        # Metrics at the default 0.5 decision threshold
        y_pred = (y_prob >= 0.5).astype(int)
        auroc = roc_auc_score(yte, y_prob)
        auprc = average_precision_score(yte, y_pred if False else y_prob)
        acc   = accuracy_score(yte, y_pred)
        f1    = f1_score(yte, y_pred)
        # Pin the label set so the matrix is always 2x2, even if a fold's
        # truth and predictions contain a single class; otherwise the
        # four-way unpack below would fail.
        tn, fp, fn, tp = confusion_matrix(yte, y_pred, labels=[0, 1]).ravel()

        # Find best F1 threshold; prec/rec have one more entry than thr, so
        # drop the last point. The epsilon avoids 0/0.
        prec, rec, thr = precision_recall_curve(yte, y_prob)
        f1s = 2 * prec[:-1] * rec[:-1] / (prec[:-1] + rec[:-1] + 1e-12)
        best_ix = int(np.nanargmax(f1s)) if len(f1s) else 0
        best_thr = float(thr[best_ix]) if len(thr) else 0.5
        best_f1  = float(f1s[best_ix]) if len(f1s) else f1

        rows.append((auroc, auprc, acc, f1, best_thr, best_f1, tn, fp, fn, tp))
        print(f"[{name}][Fold {fold}] AUROC={auroc:.3f} AUPRC={auprc:.3f} "
              f"ACC@0.5={acc:.3f} F1@0.5={f1:.3f} | best_thr={best_thr:.3f} best_F1={best_f1:.3f}")

    # Summary across folds: columns 0-5 are metrics, 6-9 are TN/FP/FN/TP.
    arr = np.array(rows, dtype=float)
    metrics = ["AUROC", "AUPRC", "ACC@0.5", "F1@0.5", "best_thr", "best_F1"]
    print("\n----- CV Summary (mean ± std) -----")
    for i, m in enumerate(metrics):
        print(f"{m}: {arr[:, i].mean():.3f} ± {arr[:, i].std():.3f}")
    tn_m, fp_m, fn_m, tp_m = arr[:, 6].mean(), arr[:, 7].mean(), arr[:, 8].mean(), arr[:, 9].mean()
    print(f"Confusion (avg over folds): TN={tn_m:.1f}, FP={fp_m:.1f}, FN={fn_m:.1f}, TP={tp_m:.1f}")
    print(f"==== {name} CV finished ====\n")

# ---------- 2) Load data ----------
df = pd.read_csv(DATA_PATH)
print(f"Loaded: {DATA_PATH} | shape={df.shape}")

# ---------- 3) Build binary label (80th percentile) ----------
# Rows whose encounter count reaches the integer threshold are labelled 1.
p80 = df["n_encounters"].quantile(0.80)
if abs(p80 - round(p80)) > 1e-9:
    # Fractional percentile: round up to the next whole encounter count.
    THR_INT = int(np.ceil(p80))
else:
    # Percentile is numerically an integer already: use it as-is.
    THR_INT = int(round(p80))
df["label_high_util"] = df["n_encounters"].ge(THR_INT).astype(int)

# ---------- 4) Select features: demographics + ICD prefixes ----------
# Keep only the demographic columns that are actually present in the file.
demographic_cols = [
    name
    for name in ("PATIENT_SEX", "PATIENT_RACE_ETHNICITY", "DECEASED_FLAG", "age")
    if name in df.columns
]
icd_cols = [name for name in df.columns if name.startswith("icd_prefix_")]
feature_cols = demographic_cols + icd_cols

assert len(feature_cols) > 0, "No features selected. Check column names."

X = df.loc[:, feature_cols].copy()
y = df["label_high_util"].to_numpy()

print("Number of selected features:", len(feature_cols))
print("Selected feature columns:")
for col in feature_cols:
    print(" -", col)

Loaded: /Users/shiyalin/Desktop/patient_features_final.csv | shape=(16369, 69)
Number of selected features: 54
Selected feature columns:
 - PATIENT_SEX
 - PATIENT_RACE_ETHNICITY
 - DECEASED_FLAG
 - age
 - icd_prefix_C50
 - icd_prefix_D50
 - icd_prefix_D64
 - icd_prefix_E03
 - icd_prefix_E11
 - icd_prefix_E66
 - icd_prefix_E78
 - icd_prefix_E83
 - icd_prefix_E87
 - icd_prefix_F10
 - icd_prefix_F33
 - icd_prefix_F41
 - icd_prefix_G47
 - icd_prefix_G89
 - icd_prefix_I10
 - icd_prefix_I25
 - icd_prefix_I48
 - icd_prefix_I50
 - icd_prefix_J18
 - icd_prefix_J44
 - icd_prefix_J45
 - icd_prefix_K21
 - icd_prefix_K59
 - icd_prefix_L03
 - icd_prefix_M25
 - icd_prefix_M54
 - icd_prefix_M79
 - icd_prefix_N17
 - icd_prefix_N18
 - icd_prefix_N25
 - icd_prefix_N39
 - icd_prefix_R05
 - icd_prefix_R06
 - icd_prefix_R07
 - icd_prefix_R10
 - icd_prefix_R11
 - icd_prefix_R19
 - icd_prefix_R53
 - icd_prefix_R69
 - icd_prefix_R79
 - icd_prefix_Z01
 - icd_prefix_Z09
 - icd_prefix_Z12
 - icd_prefix_Z51
 - icd

In [2]:
# ---------- 5) Pipeline ----------
# Preprocessing is bound to the columns of X; the classifier is a fresh XGBoost model.
pipe = Pipeline(steps=[
    ("prep", build_preprocessor(X)),
    ("xgb", make_xgb()),
])

# ---------- 6) Cross-validation ----------
# Prints per-fold metrics and a mean ± std summary; nothing is returned.
evaluate_cv(pipe, X, y, name="XGB_ICD+DEMOGRAPHICS", n_splits=5)

[XGB_ICD+DEMOGRAPHICS][Fold 1] AUROC=0.917 AUPRC=0.795 ACC@0.5=0.890 F1@0.5=0.701 | best_thr=0.365 best_F1=0.721
[XGB_ICD+DEMOGRAPHICS][Fold 2] AUROC=0.911 AUPRC=0.776 ACC@0.5=0.883 F1@0.5=0.675 | best_thr=0.291 best_F1=0.695
[XGB_ICD+DEMOGRAPHICS][Fold 3] AUROC=0.908 AUPRC=0.762 ACC@0.5=0.874 F1@0.5=0.655 | best_thr=0.354 best_F1=0.696
[XGB_ICD+DEMOGRAPHICS][Fold 4] AUROC=0.921 AUPRC=0.799 ACC@0.5=0.888 F1@0.5=0.689 | best_thr=0.262 best_F1=0.723
[XGB_ICD+DEMOGRAPHICS][Fold 5] AUROC=0.912 AUPRC=0.775 ACC@0.5=0.881 F1@0.5=0.665 | best_thr=0.301 best_F1=0.706

----- CV Summary (mean ± std) -----
AUROC: 0.914 ± 0.005
AUPRC: 0.782 ± 0.014
ACC@0.5: 0.883 ± 0.005
F1@0.5: 0.677 ± 0.017
best_thr: 0.315 ± 0.039
best_F1: 0.708 ± 0.012
Confusion (avg over folds): TN=2489.8, FP=106.4, FN=276.4, TP=401.2
==== XGB_ICD+DEMOGRAPHICS CV finished ====

