### Imports + paths

In [1]:
import json
import platform
from pathlib import Path

import duckdb
import joblib
import numpy as np
import pandas as pd
import sklearn
from sklearn.base import clone

# Locate the repository root: the closest directory, starting at the current
# working directory and walking upward, that contains a "Day-1" folder.
_start_dir = Path.cwd().resolve()
for project_root in (_start_dir, *_start_dir.parents):
    if (project_root / "Day-1").exists():
        break
else:
    # Reached the filesystem root without finding the marker folder.
    raise FileNotFoundError("Could not find project root containing Day-1.")

# Input: the DuckDB warehouse under Day-1.
db_path = project_root.joinpath("Day-1", "data", "warehouse", "day1.duckdb")

# Outputs: Day-9 artifact and report folders (created if absent).
day9_dir = project_root / "Day-9"
art_dir, rep_dir = day9_dir / "artifacts", day9_dir / "reports"
for _out_dir in (art_dir, rep_dir):
    _out_dir.mkdir(parents=True, exist_ok=True)

for _caption, _location in (("DB:", db_path), ("Artifacts:", art_dir), ("Reports:", rep_dir)):
    print(_caption, _location)


DB: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-1\data\warehouse\day1.duckdb
Artifacts: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-9\artifacts
Reports: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-9\reports


### Load the modeling table

In [2]:
# Fail fast when the warehouse file is missing: duckdb.connect() would
# otherwise create a new, empty database file at db_path and the query below
# would fail with a less helpful "table does not exist" error.
if not db_path.exists():
    raise FileNotFoundError(f"DuckDB warehouse not found: {db_path}")

con = duckdb.connect(str(db_path))
try:
    # Pull the whole modeling table into a pandas DataFrame.
    df = con.execute("SELECT * FROM gold_diabetes_features_v1").df()
finally:
    # Always release the connection, even if the query raises.
    con.close()

print("df shape:", df.shape)
print("prevalence:", float(df["label"].mean()))
df.head()


df shape: (101766, 20)
prevalence: 0.11159915885462728


Unnamed: 0,encounter_id,person_id,label,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,diabetesMed,change,insulin,A1Cresult
0,2278392,8222157,0,1,41,0,1,0,0,0,Caucasian,Female,[0-10),6,25,1,No,No,No,
1,149190,55629189,0,3,59,0,18,0,0,0,Caucasian,Female,[10-20),1,1,7,Yes,Ch,Up,
2,64410,86047875,0,2,11,5,13,2,0,1,AfricanAmerican,Female,[20-30),1,1,7,Yes,No,No,
3,500364,82442376,0,2,44,1,16,0,0,0,Caucasian,Male,[30-40),1,1,7,Yes,Ch,Up,
4,16680,42519267,0,1,51,0,8,0,0,0,Caucasian,Male,[40-50),1,1,7,Yes,Ch,Steady,


### Define “data contract”: IDs, features, and preprocessing

This matters for deployment: we explicitly define which columns are required and how each group of columns is preprocessed.

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupShuffleSplit, GroupKFold

# Columns excluded from the model inputs: the two identifiers and the target.
ID_COLS = ["encounter_id", "person_id", "label"]

# Inputs sent through the numeric branch of the preprocessor.
NUMERIC_COLS = [
    "time_in_hospital",
    "num_lab_procedures",
    "num_procedures",
    "num_medications",
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
]

# Model inputs: every column of the modeling table that is not in ID_COLS,
# kept in table order.
FEATURE_COLS = [col for col in df.columns if col not in ID_COLS]

# Inputs sent through the categorical branch: all features that are not numeric.
CATEGORICAL_COLS = [col for col in FEATURE_COLS if col not in NUMERIC_COLS]

def make_dense_ohe():
    """Return a dense-output OneHotEncoder that ignores unseen categories.

    The encoder is first built with the ``sparse_output`` keyword; if the
    installed OneHotEncoder rejects that keyword with a TypeError, the
    ``sparse`` keyword is used instead.
    """
    shared_kwargs = {"handle_unknown": "ignore"}
    try:
        encoder = OneHotEncoder(sparse_output=False, **shared_kwargs)
    except TypeError:
        encoder = OneHotEncoder(sparse=False, **shared_kwargs)
    return encoder

# Preprocessing contract:
#   - numeric columns:     median imputation
#   - categorical columns: most-frequent imputation, then dense one-hot encoding
PREP = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline([("impute", SimpleImputer(strategy="median"))]),
            NUMERIC_COLS,
        ),
        (
            "cat",
            Pipeline(
                [
                    ("impute", SimpleImputer(strategy="most_frequent")),
                    ("onehot", make_dense_ohe()),
                ]
            ),
            CATEGORICAL_COLS,
        ),
    ]
)

def make_base_model():
    """Build a fresh, unfitted preprocessing + gradient-boosting pipeline.

    Each call gets its own unfitted copy of the module-level ``PREP``
    preprocessor. Embedding ``PREP`` itself would make every pipeline created
    here share one preprocessor object, so fitting a later pipeline (e.g. the
    next CV fold or the final model) would silently refit the preprocessing
    inside every earlier one.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps ``"prep"`` (clone of ``PREP``) and ``"clf"``
        (HistGradientBoostingClassifier with a fixed ``random_state``).
    """
    return Pipeline([
        ("prep", clone(PREP)),
        ("clf", HistGradientBoostingClassifier(
            max_depth=6,
            learning_rate=0.05,
            max_iter=400,
            random_state=42
        ))
    ])

def clip01(p, eps=1e-15):
    """Return ``p`` as a float array with values limited to ``[eps, 1 - eps]``.

    Keeps probabilities strictly inside (0, 1) so that logs and logits of the
    result stay finite.
    """
    lower, upper = eps, 1 - eps
    return np.clip(np.asarray(p, dtype=float), lower, upper)


### Group split (patient leakage safe) + helper metrics

In [4]:
from sklearn.metrics import average_precision_score, roc_auc_score, brier_score_loss, log_loss

y = df["label"].astype(int)
groups = df["person_id"]

# First cut: hold out 20% of the person_id groups as TEST.
gss1 = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
idx_trainval, idx_test = next(gss1.split(df, y, groups=groups))
df_trainval = df.iloc[idx_trainval].copy()
df_test = df.iloc[idx_test].copy()

# Second cut: 25% of the remaining person_id groups become VALID.
gss2 = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
idx_train, idx_valid = next(gss2.split(df_trainval, df_trainval["label"].astype(int),
                                       groups=df_trainval["person_id"]))
df_train = df_trainval.iloc[idx_train].copy()
df_valid = df_trainval.iloc[idx_valid].copy()

print("Train/Valid/Test:", df_train.shape[0], df_valid.shape[0], df_test.shape[0])
print("Overlap train-valid:", len(set(df_train["person_id"]) & set(df_valid["person_id"])))
print("Overlap train-test:", len(set(df_train["person_id"]) & set(df_test["person_id"])))

# Enforce the leakage contract instead of only printing it: no patient may
# appear in more than one split, and the splits must cover every row once.
_ids_train = set(df_train["person_id"])
_ids_valid = set(df_valid["person_id"])
_ids_test = set(df_test["person_id"])
assert _ids_train.isdisjoint(_ids_valid), "patient leakage between train and valid"
assert _ids_train.isdisjoint(_ids_test), "patient leakage between train and test"
assert _ids_valid.isdisjoint(_ids_test), "patient leakage between valid and test"
assert len(df_train) + len(df_valid) + len(df_test) == len(df), "splits do not partition df"

X_train, y_train, g_train = df_train[FEATURE_COLS], df_train["label"].astype(int).to_numpy(), df_train["person_id"].to_numpy()
X_valid, y_valid = df_valid[FEATURE_COLS], df_valid["label"].astype(int).to_numpy()
X_test,  y_test  = df_test[FEATURE_COLS],  df_test["label"].astype(int).to_numpy()

def compute_metrics(y_true, p):
    """Summarise predicted probabilities ``p`` against binary labels ``y_true``.

    Returns a dict of plain floats, in this order: ``prevalence`` (mean label),
    ``mean_p``, ``median_p``, ``pr_auc`` (average precision), ``roc_auc``,
    ``brier`` and ``logloss``. Only the log loss uses clipped probabilities
    (via ``clip01``); every other metric is computed on ``p`` as given.
    """
    labels = np.asarray(y_true).astype(int)
    scores = np.asarray(p).astype(float)

    summary = {}
    summary["prevalence"] = float(labels.mean())
    summary["mean_p"] = float(scores.mean())
    summary["median_p"] = float(np.median(scores))
    summary["pr_auc"] = float(average_precision_score(labels, scores))
    summary["roc_auc"] = float(roc_auc_score(labels, scores))
    summary["brier"] = float(brier_score_loss(labels, scores))
    # Clip before the log loss so probabilities of exactly 0 or 1 stay finite.
    summary["logloss"] = float(log_loss(labels, clip01(scores), labels=[0, 1]))
    return summary

def topk(y_true, p, frac):
    """Summarise the positives found among the highest-scored fraction of rows.

    Parameters
    ----------
    y_true : array-like
        Binary labels (cast to int).
    p : array-like
        Scores, one per label (cast to float).
    frac : float
        Share of rows to select, in [0, 1]. ``k = floor(frac * n)`` rows are
        taken, but never fewer than one.

    Returns
    -------
    dict
        ``top_frac``, ``k``, ``captured`` (positives among the selected rows),
        ``precision_at_k`` and ``threshold`` (the ``1 - frac`` quantile of ``p``).

    Raises
    ------
    ValueError
        If the inputs are empty, have different shapes, or ``frac`` is outside
        [0, 1].
    """
    y_true = np.asarray(y_true).astype(int)
    p = np.asarray(p).astype(float)

    # Validate up front so bad input fails with a clear message instead of
    # producing NaN metrics or an opaque NumPy error.
    if y_true.shape != p.shape:
        raise ValueError(
            f"y_true and p must have the same shape, got {y_true.shape} and {p.shape}"
        )
    n = len(y_true)
    if n == 0:
        raise ValueError("topk requires at least one observation")
    if not 0 <= frac <= 1:
        raise ValueError(f"frac must be within [0, 1], got {frac!r}")

    k = max(1, int(np.floor(frac*n)))
    # Indices of the k highest scores (descending order).
    idx = np.argsort(-p)[:k]
    return {
        "top_frac": float(frac),
        "k": int(k),
        "captured": int(y_true[idx].sum()),
        "precision_at_k": float(y_true[idx].mean()),
        "threshold": float(np.quantile(p, 1-frac)),
    }


Train/Valid/Test: 60988 20625 20153
Overlap train-valid: 0
Overlap train-test: 0


### Train base model + build OOF Platt calibrator (the Day 6 method)

This is the “deployable” probability model: base_model + platt_calibrator.

In [5]:
def _to_logit(prob):
    """Clip probabilities with ``clip01`` and return their log-odds as a column vector."""
    prob = clip01(prob)
    return np.log(prob / (1 - prob)).reshape(-1, 1)


# Out-of-fold (OOF) probabilities on TRAIN, used to fit the calibrator.
# Folds are grouped by person_id (g_train).
K = 5
gkf = GroupKFold(n_splits=K)
p_oof = np.zeros(len(df_train), dtype=float)

fold_splits = gkf.split(X_train, y_train, groups=g_train)
for fold, (tr_idx, cal_idx) in enumerate(fold_splits, start=1):
    m = make_base_model()
    m.fit(X_train.iloc[tr_idx], y_train[tr_idx])
    # Each row is scored by the fold model that did not train on it.
    p_oof[cal_idx] = m.predict_proba(X_train.iloc[cal_idx])[:, 1]
    print(f"Fold {fold}/{K} done. cal size={len(cal_idx)}")

# Platt scaling: logistic regression on logit(p_oof).
p_oof_c = clip01(p_oof)
z_oof = _to_logit(p_oof_c)

platt = LogisticRegression(solver="lbfgs", C=1e6, max_iter=2000)
platt.fit(z_oof, y_train)

# Final base model, fitted on the full TRAIN split.
base_model = make_base_model()
base_model.fit(X_train, y_train)

# Raw scores on VALID/TEST, then the Platt calibrator on their logits.
p_valid_raw, p_test_raw = (
    base_model.predict_proba(features)[:, 1] for features in (X_valid, X_test)
)
z_valid, z_test = _to_logit(p_valid_raw), _to_logit(p_test_raw)
p_valid_hat, p_test_hat = (
    platt.predict_proba(logits)[:, 1] for logits in (z_valid, z_test)
)

for caption, labels, probs in (
    ("VALID (calibrated):", y_valid, p_valid_hat),
    ("TEST  (calibrated):", y_test, p_test_hat),
):
    print(caption, compute_metrics(labels, probs))

for frac in [0.01, 0.05, 0.10, 0.20]:
    print("TEST", topk(y_test, p_test_hat, frac))


Fold 1/5 done. cal size=12198
Fold 2/5 done. cal size=12198
Fold 3/5 done. cal size=12198
Fold 4/5 done. cal size=12197
Fold 5/5 done. cal size=12197
VALID (calibrated): {'prevalence': 0.11461818181818181, 'mean_p': 0.11327204954074022, 'median_p': 0.09911347204037121, 'pr_auc': 0.22467831773854963, 'roc_auc': 0.6677610282599987, 'brier': 0.09682699948900617, 'logloss': 0.33575903386252465}
TEST  (calibrated): {'prevalence': 0.10673348881059892, 'mean_p': 0.11166645743436707, 'median_p': 0.09794703429186898, 'pr_auc': 0.2078325210559714, 'roc_auc': 0.6666140354981995, 'brier': 0.0913482386427387, 'logloss': 0.3211129468363118}
TEST {'top_frac': 0.01, 'k': 201, 'captured': 73, 'precision_at_k': 0.36318407960199006, 'threshold': 0.3494286102081742}
TEST {'top_frac': 0.05, 'k': 1007, 'captured': 304, 'precision_at_k': 0.3018867924528302, 'threshold': 0.23179108027082457}
TEST {'top_frac': 0.1, 'k': 2015, 'captured': 502, 'precision_at_k': 0.2491315136476427, 'threshold': 0.190383415368221

### Save artifacts (this is what makes it “deployable”)

We save four artifacts: the fitted base model, the Platt calibrator, a metadata JSON (procedure and metrics), and a JSON file describing the column contract.

In [6]:
model_path = art_dir / "readmit_base_model.joblib"
platt_path = art_dir / "readmit_platt_calibrator.joblib"
meta_path  = art_dir / "readmit_metadata.json"
cols_path  = art_dir / "readmit_feature_cols.json"

# Persist the fitted base pipeline and the Platt calibrator.
joblib.dump(base_model, model_path)
joblib.dump(platt, platt_path)

metadata = {
    "project": "diabetes_readmission",
    "label": "readmitted <30 (y_readmit_30)",
    "split": "GroupShuffleSplit by person_id (train/valid/test), calibrator fit on OOF predictions within train (GroupKFold K=5)",
    "model": "HistGradientBoostingClassifier(max_depth=6, learning_rate=0.05, max_iter=400)",
    "calibration": "Platt scaling (logistic regression on logit(p_raw))",
    "valid_metrics": compute_metrics(y_valid, p_valid_hat),
    "test_metrics": compute_metrics(y_test, p_test_hat),
    "topk_test": [topk(y_test, p_test_hat, f) for f in [0.01, 0.05, 0.10, 0.20]],
    "created_by": "Day 9 pipeline",
    # The joblib files are pickles of scikit-learn objects; record the
    # environment that produced them so they can be reloaded under matching
    # library versions.
    "versions": {
        "python": platform.python_version(),
        "scikit-learn": sklearn.__version__,
        "numpy": np.__version__,
        "pandas": pd.__version__,
        "joblib": joblib.__version__,
    },
}

meta_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")

# Column contract: which columns are IDs/target, which are features, and how
# the features are grouped for preprocessing.
cols_path.write_text(json.dumps({
    "id_cols": ID_COLS,
    "feature_cols": FEATURE_COLS,
    "numeric_cols": NUMERIC_COLS,
    "categorical_cols": CATEGORICAL_COLS
}, indent=2), encoding="utf-8")

print("Saved model:", model_path)
print("Saved platt:", platt_path)
print("Saved metadata:", meta_path)
print("Saved columns:", cols_path)


Saved model: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-9\artifacts\readmit_base_model.joblib
Saved platt: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-9\artifacts\readmit_platt_calibrator.joblib
Saved metadata: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-9\artifacts\readmit_metadata.json
Saved columns: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-9\artifacts\readmit_feature_cols.json


### “Cold start” test: reload artifacts and score again

This is a professional sanity check: if this passes, tomorrow’s deployment step becomes straightforward.

In [7]:
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts
# that this pipeline (or another trusted source) wrote.
base_model2 = joblib.load(model_path)
platt2 = joblib.load(platt_path)

# Score TEST again using only the reloaded objects.
p_test_raw2 = base_model2.predict_proba(X_test)[:, 1]
z_test2 = np.log(clip01(p_test_raw2) / (1 - clip01(p_test_raw2))).reshape(-1, 1)
p_test_hat2 = platt2.predict_proba(z_test2)[:, 1]

# Make the sanity check an actual check: the save/load round trip must
# reproduce the in-memory scores, not merely yield similar-looking metrics.
np.testing.assert_allclose(p_test_raw2, p_test_raw, rtol=0, atol=1e-12)
np.testing.assert_allclose(p_test_hat2, p_test_hat, rtol=0, atol=1e-12)

print("Reloaded TEST metrics:", compute_metrics(y_test, p_test_hat2))


Reloaded TEST metrics: {'prevalence': 0.10673348881059892, 'mean_p': 0.11166645743436707, 'median_p': 0.09794703429186898, 'pr_auc': 0.2078325210559714, 'roc_auc': 0.6666140354981995, 'brier': 0.0913482386427387, 'logloss': 0.3211129468363118}


### Create a scored test table artifact (for BI / ops)

In [8]:
scored_path = rep_dir / "DAY09_scored_test.csv"

# Identifiers and label for each TEST encounter, plus raw and calibrated
# scores, ordered from highest to lowest calibrated probability.
scored_test = (
    df_test.loc[:, ["encounter_id", "person_id", "label"]]
    .assign(p_raw=p_test_raw, p_hat=p_test_hat)
    .sort_values(by="p_hat", ascending=False)
)
scored_test.to_csv(scored_path, index=False)

print("Saved:", scored_path)
scored_test.head()


Saved: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-9\reports\DAY09_scored_test.csv


Unnamed: 0,encounter_id,person_id,label,p_raw,p_hat
67387,189144708,42941232,1,0.610624,0.612003
38644,120136542,23838849,1,0.556518,0.557934
87183,277879686,88227540,1,0.542616,0.544036
67883,190944528,57751650,0,0.517605,0.519027
42842,132138702,76743099,0,0.511674,0.513096


### Write a Day 9 model card

In [9]:
# Assemble the model card as a list of markdown fragments, then write it once.
tm = metadata["test_metrics"]

card = [
    "# Day 9 — Readmission model artifacts (deployable locally)\n",
    "## What exists after Day 9\n",
    "- Trained base model + Platt calibrator saved as joblib artifacts\n",
    "- Metadata JSON with metrics and procedure\n",
    "- Feature/column contract JSON\n",
    "- Example scored test CSV\n",
    "\n## Test metrics (calibrated)\n",
    f"- Prevalence: {tm['prevalence']:.6f}\n",
    f"- PR-AUC: {tm['pr_auc']:.6f}\n",
    f"- ROC-AUC: {tm['roc_auc']:.6f}\n",
    f"- Brier: {tm['brier']:.6f}\n",
    f"- Log loss: {tm['logloss']:.6f}\n",
    "\n## Capacity targeting (TEST)\n",
]

for row in metadata["topk_test"]:
    # ':g' formats the percentage without truncating: int(0.29 * 100) would
    # print 28 because of floating-point error, and sub-1% fractions would
    # print 0.
    card.append(f"- Top {row['top_frac'] * 100:g}%: k={row['k']} captured={row['captured']} "
                f"precision@k={row['precision_at_k']:.6f} threshold={row['threshold']:.6f}\n")

md_path = rep_dir / "DAY09_model_card.md"
md_path.write_text("".join(card), encoding="utf-8")
print("Saved:", md_path)


Saved: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-9\reports\DAY09_model_card.md
