### Connect to DuckDB + load table

In [1]:
from pathlib import Path
import duckdb
import pandas as pd

# Walk upward from the current working directory until a folder that contains
# "Day-1" is found, so the notebook runs from any sub-folder of the project.
project_root = Path.cwd().resolve()
while not (project_root / "Day-1").exists():
    if project_root == project_root.parent:
        # Reached the filesystem root without finding the marker folder.
        raise FileNotFoundError("Could not find project root containing Day-1.")
    project_root = project_root.parent

db_path = project_root / "Day-1" / "data" / "warehouse" / "day1.duckdb"
# duckdb.connect() creates a brand-new empty database when the file is missing;
# the failure would then only show up as a missing-table error on the SELECT
# below. Fail early with a clear message instead.
if not db_path.is_file():
    raise FileNotFoundError(f"DuckDB warehouse not found: {db_path}")

# This notebook only reads from the warehouse, so open it read-only to rule out
# accidental writes to the shared Day-1 database.
con = duckdb.connect(str(db_path), read_only=True)
print("Connected to:", db_path)

# Load the whole gold feature table into a pandas DataFrame.
df = con.execute("SELECT * FROM gold_diabetes_features_v1").df()
print("df shape:", df.shape)


Connected to: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-1\data\warehouse\day1.duckdb
df shape: (101766, 20)


### Leakage-safe train/valid/test split (same as Day 4/5)

In [2]:
from sklearn.model_selection import GroupShuffleSplit

# Outer split: hold out TEST, grouping by person_id so that all rows of one
# patient land on the same side of the split.
y = df["label"].astype(int)
groups = df["person_id"]

gss1 = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
idx_trainval, idx_test = next(gss1.split(df, y, groups=groups))
df_trainval, df_test = df.iloc[idx_trainval].copy(), df.iloc[idx_test].copy()

# Inner split: carve VALID out of the remaining rows, again grouped by patient.
gss2 = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
trainval_labels = df_trainval["label"].astype(int)
trainval_groups = df_trainval["person_id"]
idx_train, idx_valid = next(
    gss2.split(df_trainval, trainval_labels, groups=trainval_groups)
)
df_train, df_valid = df_trainval.iloc[idx_train].copy(), df_trainval.iloc[idx_valid].copy()

# Row counts and positive-label rate for each split.
splits = (df_train, df_valid, df_test)
print("Train/Valid/Test:", *(part.shape[0] for part in splits))
print("Prevalence train/valid/test:",
      *(float(part["label"].mean()) for part in splits))

# Leakage check: no person_id may appear in more than one split (expect 0 everywhere).
train_people = set(df_train["person_id"])
valid_people = set(df_valid["person_id"])
test_people = set(df_test["person_id"])
print("Overlap train-valid:", len(train_people & valid_people))
print("Overlap train-test:", len(train_people & test_people))
print("Overlap valid-test:", len(valid_people & test_people))


Train/Valid/Test: 60988 20625 20153
Prevalence train/valid/test: 0.11218600380402702 0.11461818181818181 0.10673348881059892
Overlap train-valid: 0
Overlap train-test: 0
Overlap valid-test: 0


### Preprocessing (dense one-hot) + helpers

In [3]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Identifier and target columns are never fed to the model.
id_cols = ["encounter_id", "person_id", "label"]
feature_cols = [col for col in df.columns if col not in id_cols]

# Columns treated as numeric; every other feature column is treated as categorical.
numeric_cols = [
    "time_in_hospital",
    "num_lab_procedures",
    "num_procedures",
    "num_medications",
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
]
categorical_cols = [col for col in feature_cols if col not in numeric_cols]

# Feature frames and integer label arrays for each split.
X_train, X_valid, X_test = (
    part[feature_cols] for part in (df_train, df_valid, df_test)
)
y_train, y_valid, y_test = (
    part["label"].astype(int).to_numpy() for part in (df_train, df_valid, df_test)
)
# Patient ids of the TRAIN rows, used as groups for cross-fitting.
g_train = df_train["person_id"].to_numpy()

def make_dense_ohe():
    """Build a dense-output OneHotEncoder that ignores unseen categories.

    Tries the ``sparse_output`` keyword first and falls back to ``sparse`` when
    the constructor rejects it with ``TypeError``, so the helper works with
    either spelling of the argument.
    """
    shared_kwargs = {"handle_unknown": "ignore"}
    try:
        encoder = OneHotEncoder(sparse_output=False, **shared_kwargs)
    except TypeError:
        encoder = OneHotEncoder(sparse=False, **shared_kwargs)
    return encoder

# Numeric branch: fill missing values with the column median.
numeric_steps = Pipeline([("impute", SimpleImputer(strategy="median"))])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense matrix.
categorical_steps = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", make_dense_ohe()),
])

# Preprocessor used by the tree model: each branch is applied to its own column list.
prep_tree_dense = ColumnTransformer(
    transformers=[
        ("num", numeric_steps, numeric_cols),
        ("cat", categorical_steps, categorical_cols),
    ]
)

def clip01(p, eps=1e-15):
    """Return ``p`` as an array with every value limited to ``[eps, 1 - eps]``.

    Keeps probabilities strictly away from 0 and 1 so that logs and logits
    computed from them stay finite.
    """
    lower, upper = eps, 1 - eps
    return np.clip(np.asarray(p), lower, upper)


### Define base model + metrics (version-safe logloss)

In [4]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import average_precision_score, roc_auc_score, brier_score_loss, log_loss

def make_base_model():
    """Return a fresh, unfitted preprocessing + gradient-boosting pipeline.

    The preprocessor is an unfitted clone of the module-level
    ``prep_tree_dense`` template. ``Pipeline.fit`` fits its steps in place, so
    without the clone every pipeline built here would share one
    ColumnTransformer object, and fitting a later model would overwrite the
    preprocessor of any earlier fitted model that is still in use.

    Returns:
        sklearn Pipeline with steps ``"prep"`` and ``"clf"``.
    """
    # Local import keeps this cell self-contained.
    from sklearn.base import clone

    return Pipeline([
        ("prep", clone(prep_tree_dense)),
        ("clf", HistGradientBoostingClassifier(
            max_depth=6,
            learning_rate=0.05,
            max_iter=400,
            random_state=42
        ))
    ])

def metrics(y_true, p):
    """Summarise predicted probabilities ``p`` against binary labels ``y_true``.

    Returns a dict of plain floats with keys ``prevalence``, ``mean_p``,
    ``median_p``, ``pr_auc``, ``roc_auc``, ``brier`` and ``logloss``.
    Log loss is computed on probabilities clipped by ``clip01`` and with
    ``labels=[0, 1]`` passed explicitly.
    """
    labels = np.asarray(y_true)
    probs = np.asarray(p)

    # Descriptive statistics of the labels and of the raw probabilities.
    summary = {
        "prevalence": float(labels.mean()),
        "mean_p": float(probs.mean()),
        "median_p": float(np.median(probs)),
    }
    # Ranking quality.
    summary["pr_auc"] = float(average_precision_score(labels, probs))
    summary["roc_auc"] = float(roc_auc_score(labels, probs))
    # Probability quality.
    summary["brier"] = float(brier_score_loss(labels, probs))
    summary["logloss"] = float(log_loss(labels, clip01(probs), labels=[0, 1]))
    return summary

def ece(y_true, p, n_bins=10):
    """Expected calibration error over ``n_bins`` equal-width probability bins.

    For each non-empty bin the absolute gap between the observed positive rate
    and the mean predicted probability is weighted by the bin's share of the
    samples; the weighted gaps are summed.

    Bins are ``[edge_i, edge_{i+1})`` except the last one, which also includes
    ``p == 1.0``. Previously such predictions fell outside every bin and were
    silently dropped. Values outside ``[0, 1]`` are assigned to the nearest end bin.

    Args:
        y_true: array-like of 0/1 labels.
        p: array-like of predicted probabilities, same length as ``y_true``.
        n_bins: number of equal-width bins on ``[0, 1]``.

    Returns:
        The ECE as a float (``0.0`` for empty input).
    """
    y_true = np.asarray(y_true)
    p = np.asarray(p)
    n = len(p)
    if n == 0:
        return 0.0

    edges = np.linspace(0.0, 1.0, n_bins + 1)
    # Digitizing against the interior edges only yields indices 0..n_bins-1
    # directly, so p == 1.0 lands in the last bin instead of index n_bins.
    idx = np.digitize(p, edges[1:-1])

    ece_val = 0.0
    for b in range(n_bins):
        m = (idx == b)
        count = m.sum()
        if count == 0:
            continue
        conf = p[m].mean()
        acc = y_true[m].mean()
        ece_val += (count / n) * abs(acc - conf)
    return float(ece_val)


### Cross-fitted (out-of-fold) predictions on TRAIN (group-safe)

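This is the key Day 6 move. We split TRAIN into folds by patient; for each fold we fit the model on the other folds and predict on the held-out fold. Every TRAIN row therefore gets an out-of-fold probability from a model that never saw that patient, which makes these probabilities honest inputs for fitting the calibrators.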

In [6]:
from sklearn.model_selection import GroupKFold

K = 5
gkf = GroupKFold(n_splits=K)

# Start from NaN rather than 0 so that a row that never receives an out-of-fold
# prediction is caught below, instead of reaching the calibrators disguised as
# a confident negative.
p_oof = np.full(len(df_train), np.nan, dtype=float)

for fold, (tr_idx, cal_idx) in enumerate(gkf.split(X_train, y_train, groups=g_train), start=1):
    # Fit on the other folds, predict the held-out fold (grouped by patient).
    model = make_base_model()
    model.fit(X_train.iloc[tr_idx], y_train[tr_idx])
    p_oof[cal_idx] = model.predict_proba(X_train.iloc[cal_idx])[:, 1]
    print(f"Fold {fold}/{K} done. cal size={len(cal_idx)}")

assert not np.isnan(p_oof).any(), "Some TRAIN rows never received an out-of-fold prediction."
print("OOF preds ready. mean(p_oof)=", float(p_oof.mean()))


Fold 1/5 done. cal size=12198
Fold 2/5 done. cal size=12198
Fold 3/5 done. cal size=12198
Fold 4/5 done. cal size=12197
Fold 5/5 done. cal size=12197
OOF preds ready. mean(p_oof)= 0.11170352563190443


Sanity check (optional but helpful):

In [7]:
# Quality of the out-of-fold probabilities on TRAIN; a sanity check only.
oof_metrics = metrics(y_train, p_oof)
print("OOF metrics on TRAIN (not for reporting, just sanity):", oof_metrics)
oof_ece = ece(y_train, p_oof)
print("OOF ECE on TRAIN:", oof_ece)


OOF metrics on TRAIN (not for reporting, just sanity): {'prevalence': 0.11218600380402702, 'mean_p': 0.11170352563190443, 'median_p': 0.09638257213695628, 'pr_auc': 0.21460973259443822, 'roc_auc': 0.6670274406569707, 'brier': 0.09528749475684366, 'logloss': 0.33152540670157826}
OOF ECE on TRAIN: 0.0011669272167829485


### Fit calibrators on OOF predictions (Platt + Isotonic)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.isotonic import IsotonicRegression

# Platt (sigmoid) calibration: a logistic regression on the logit of the
# out-of-fold probabilities. Clip first so the logit stays finite.
p_oof_c = clip01(p_oof)
z_oof = np.log(p_oof_c / (1 - p_oof_c)).reshape(-1, 1)
platt = LogisticRegression(solver="lbfgs", C=1e6, max_iter=2000).fit(z_oof, y_train)

# Isotonic calibration: fitted directly on the raw out-of-fold probabilities;
# inputs outside the fitted range are clipped to it at transform time.
iso = IsotonicRegression(out_of_bounds="clip").fit(p_oof, y_train)

print("Calibrators fit on OOF predictions.")


Calibrators fit on OOF predictions.


### Fit final base model on full TRAIN, then calibrate VALID and TEST

In [9]:
# Refit the base model on all of TRAIN; the calibrators fitted on the OOF
# predictions are applied on top of this model's probabilities.
final_model = make_base_model()
final_model.fit(X_train, y_train)

p_valid_base, p_test_base = (
    final_model.predict_proba(X_part)[:, 1] for X_part in (X_valid, X_test)
)


def _to_logit_column(p):
    """Clip probabilities, take the logit and reshape to a single-column 2-D array."""
    p_c = clip01(p)
    return np.log(p_c / (1 - p_c)).reshape(-1, 1)


# Platt transform
z_valid = _to_logit_column(p_valid_base)
z_test = _to_logit_column(p_test_base)
p_valid_sig = platt.predict_proba(z_valid)[:, 1]
p_test_sig = platt.predict_proba(z_test)[:, 1]

# Isotonic transform
p_valid_iso = iso.transform(p_valid_base)
p_test_iso = iso.transform(p_test_base)

# Report metrics and ECE for every (split, calibration variant) pair.
report_rows = (
    ("VALID base:", y_valid, p_valid_base),
    ("VALID sigmoid:", y_valid, p_valid_sig),
    ("VALID isotonic:", y_valid, p_valid_iso),
    ("TEST base:", y_test, p_test_base),
    ("TEST sigmoid:", y_test, p_test_sig),
    ("TEST isotonic:", y_test, p_test_iso),
)
for row_label, y_split, p_split in report_rows:
    print(row_label, metrics(y_split, p_split), "ECE:", ece(y_split, p_split))


VALID base: {'prevalence': 0.11461818181818181, 'mean_p': 0.11277848922639576, 'median_p': 0.09866236945430058, 'pr_auc': 0.22467831773854963, 'roc_auc': 0.6677610282599987, 'brier': 0.09683001463055108, 'logloss': 0.33576765492464755} ECE: 0.004044470923893383
VALID sigmoid: {'prevalence': 0.11461818181818181, 'mean_p': 0.11327204954074022, 'median_p': 0.09911347204037121, 'pr_auc': 0.22467831773854963, 'roc_auc': 0.6677610282599987, 'brier': 0.09682699948900617, 'logloss': 0.33575903386252465} ECE: 0.0044861429718006446
VALID isotonic: {'prevalence': 0.11461818181818181, 'mean_p': 0.1135638733425948, 'median_p': 0.09396119468061914, 'pr_auc': 0.215795441078798, 'roc_auc': 0.667189460752905, 'brier': 0.09689709281504955, 'logloss': 0.3359497670062253} ECE: 0.0034775109251076308
TEST base: {'prevalence': 0.10673348881059892, 'mean_p': 0.1111784920382252, 'median_p': 0.0975009866650095, 'pr_auc': 0.2078325210559714, 'roc_auc': 0.6666140354981995, 'brier': 0.0913437944101995, 'logloss': 

### Pick calibration method using VALID Brier, then do top-K on TEST

In [10]:
def topk_summary(y_true, p, frac=0.10):
    """Summarise how many positives fall in the top ``frac`` highest-scored rows.

    Args:
        y_true: array-like of 0/1 labels.
        p: array-like of scores/probabilities, same length as ``y_true``.
            Converted to a float array, so lists and pandas Series work as well
            as NumPy arrays (a plain list previously failed on ``-p``).
        frac: fraction of rows to select; at least one row is always selected.

    Returns:
        dict with ``frac``, ``k`` (rows selected), ``captured`` (positives among
        them), ``precision_at_k`` and ``threshold`` (the ``1 - frac`` quantile
        of ``p``).
    """
    y_true = np.asarray(y_true)
    p = np.asarray(p, dtype=float)
    n = len(y_true)
    k = max(1, int(np.floor(frac * n)))
    # Indices of the k largest scores (sort descending by negating).
    idx = np.argsort(-p)[:k]
    return {
        "frac": float(frac),
        "k": int(k),
        "captured": int(y_true[idx].sum()),
        "precision_at_k": float(y_true[idx].mean()),
        "threshold": float(np.quantile(p, 1 - frac))
    }

# Each candidate: (name, VALID probabilities, TEST probabilities).
cand = [
    ("base", p_valid_base, p_test_base),
    ("sigmoid_platt_oof", p_valid_sig, p_test_sig),
    ("isotonic_oof", p_valid_iso, p_test_iso),
]


def _valid_brier(candidate):
    """Brier score of a candidate's VALID probabilities (lower is better)."""
    p_valid_cand = candidate[1]
    return metrics(y_valid, p_valid_cand)["brier"]


# choose by VALID brier; TEST is only used for the final report below
best_name, p_valid_best, p_test_best = min(cand, key=_valid_brier)

print("Best by VALID Brier:", best_name)
valid_report = metrics(y_valid, p_valid_best)
print("VALID metrics:", valid_report, "ECE:", ece(y_valid, p_valid_best))
test_report = metrics(y_test, p_test_best)
print("TEST metrics :", test_report, "ECE:", ece(y_test, p_test_best))

# Top-K capture on TEST at several selection fractions.
for frac in [0.01, 0.05, 0.10, 0.20]:
    topk = topk_summary(y_test, p_test_best, frac=frac)
    print("TEST", best_name, topk)


Best by VALID Brier: sigmoid_platt_oof
VALID metrics: {'prevalence': 0.11461818181818181, 'mean_p': 0.11327204954074022, 'median_p': 0.09911347204037121, 'pr_auc': 0.22467831773854963, 'roc_auc': 0.6677610282599987, 'brier': 0.09682699948900617, 'logloss': 0.33575903386252465} ECE: 0.0044861429718006446
TEST metrics : {'prevalence': 0.10673348881059892, 'mean_p': 0.11166645743436707, 'median_p': 0.09794703429186898, 'pr_auc': 0.2078325210559714, 'roc_auc': 0.6666140354981995, 'brier': 0.0913482386427387, 'logloss': 0.3211129468363118} ECE: 0.004971473772442335
TEST sigmoid_platt_oof {'frac': 0.01, 'k': 201, 'captured': 73, 'precision_at_k': 0.36318407960199006, 'threshold': 0.3494286102081742}
TEST sigmoid_platt_oof {'frac': 0.05, 'k': 1007, 'captured': 304, 'precision_at_k': 0.3018867924528302, 'threshold': 0.23179108027082457}
TEST sigmoid_platt_oof {'frac': 0.1, 'k': 2015, 'captured': 502, 'precision_at_k': 0.2491315136476427, 'threshold': 0.19038341536822131}
TEST sigmoid_platt_oof

### Save Day 6 artifacts

In [11]:
import json
from pathlib import Path

reports_dir = project_root / "Day-6" / "reports"
reports_dir.mkdir(parents=True, exist_ok=True)


def _split_report(y_split, p_base, p_sig, p_iso):
    """Metrics per calibration variant for one split, plus an ``"ece"`` sub-dict."""
    variants = {
        "base": p_base,
        "sigmoid_platt_oof": p_sig,
        "isotonic_oof": p_iso,
    }
    report = {name: metrics(y_split, probs) for name, probs in variants.items()}
    report["ece"] = {name: ece(y_split, probs) for name, probs in variants.items()}
    return report


# Everything written to the Day 6 report: settings, method note, and
# VALID/TEST results for all three variants.
out = {
    "K": K,
    "method": "Cross-fitted calibration: OOF predictions on TRAIN (GroupKFold by person_id), calibrators fit on OOF, applied to VALID/TEST probabilities from final model.",
    "valid": _split_report(y_valid, p_valid_base, p_valid_sig, p_valid_iso),
    "test": _split_report(y_test, p_test_base, p_test_sig, p_test_iso),
}

report_path = reports_dir / "DAY06_crossfit_calibration.json"
with open(report_path, "w", encoding="utf-8") as f:
    json.dump(out, f, indent=2)

print("Saved:", report_path)


Saved: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-6\reports\DAY06_crossfit_calibration.json


### Close DuckDB

In [12]:
# Close the DuckDB connection opened in the first cell; `con` must not be used after this.
con.close()
print("DuckDB closed.")


DuckDB closed.
