### Connect to DuckDB (same as before)

In [1]:
from pathlib import Path
import duckdb

project_root = Path.cwd().resolve()
while not (project_root / "Day-1").exists():
    if project_root == project_root.parent:
        raise FileNotFoundError("Could not find project root containing Day-1.")
    project_root = project_root.parent

db_path = project_root / "Day-1" / "data" / "warehouse" / "day1.duckdb"
con = duckdb.connect(str(db_path))
print("Connected to:", db_path)
print("Tables:", [t[0] for t in con.execute("SHOW TABLES").fetchall()])


Connected to: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-1\data\warehouse\day1.duckdb
Tables: ['bronze_diabetes', 'gold_diabetes_base', 'gold_diabetes_features_v1', 'silver_diabetes', 'silver_diabetes_typed']


### Load the modeling table

In [2]:
import pandas as pd

df = con.execute("SELECT * FROM gold_diabetes_features_v1").df()
print(df.shape)
df.head()

(101766, 20)


Unnamed: 0,encounter_id,person_id,label,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,diabetesMed,change,insulin,A1Cresult
0,2278392,8222157,0,1,41,0,1,0,0,0,Caucasian,Female,[0-10),6,25,1,No,No,No,
1,149190,55629189,0,3,59,0,18,0,0,0,Caucasian,Female,[10-20),1,1,7,Yes,Ch,Up,
2,64410,86047875,0,2,11,5,13,2,0,1,AfricanAmerican,Female,[20-30),1,1,7,Yes,No,No,
3,500364,82442376,0,2,44,1,16,0,0,0,Caucasian,Male,[30-40),1,1,7,Yes,Ch,Up,
4,16680,42519267,0,1,51,0,8,0,0,0,Caucasian,Male,[40-50),1,1,7,Yes,Ch,Steady,


### Train/validation/test split (leakage-safe by patient)

We’ll do: test = 20%, then split the remaining into train/valid so overall is roughly 60/20/20.

In [3]:
from sklearn.model_selection import GroupShuffleSplit

y = df["label"].astype(int)
groups = df["person_id"]

# 1) Hold out TEST (20%)
gss1 = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
idx_trainval, idx_test = next(gss1.split(df, y, groups=groups))

df_trainval = df.iloc[idx_trainval].copy()
df_test = df.iloc[idx_test].copy()

# 2) Split TRAIN vs VALID inside trainval (valid is 25% of trainval -> 20% of total)
y_tv = df_trainval["label"].astype(int)
g_tv = df_trainval["person_id"]

gss2 = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
idx_train, idx_valid = next(gss2.split(df_trainval, y_tv, groups=g_tv))

df_train = df_trainval.iloc[idx_train].copy()
df_valid = df_trainval.iloc[idx_valid].copy()

print("Train rows:", df_train.shape[0], "Valid rows:", df_valid.shape[0], "Test rows:", df_test.shape[0])
print("Prevalence train/valid/test:",
      float(df_train["label"].mean()),
      float(df_valid["label"].mean()),
      float(df_test["label"].mean()))

# Leakage checks
overlap_tv = set(df_train["person_id"]).intersection(set(df_valid["person_id"]))
overlap_tt = set(df_train["person_id"]).intersection(set(df_test["person_id"]))
overlap_vt = set(df_valid["person_id"]).intersection(set(df_test["person_id"]))
print("Overlap train-valid:", len(overlap_tv))
print("Overlap train-test:", len(overlap_tt))
print("Overlap valid-test:", len(overlap_vt))


Train rows: 60988 Valid rows: 20625 Test rows: 20153
Prevalence train/valid/test: 0.11218600380402702 0.11461818181818181 0.10673348881059892
Overlap train-valid: 0
Overlap train-test: 0
Overlap valid-test: 0


### Define features + preprocessors

We’ll use one preprocessor for linear models (includes scaling) and one for trees (no scaling).

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

id_cols = ["encounter_id", "person_id", "label"]
feature_cols = [c for c in df.columns if c not in id_cols]

numeric_cols = [
    "time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications",
    "number_outpatient", "number_emergency", "number_inpatient"
]
categorical_cols = [c for c in feature_cols if c not in numeric_cols]

X_train = df_train[feature_cols]
y_train = df_train["label"].astype(int)

X_valid = df_valid[feature_cols]
y_valid = df_valid["label"].astype(int)

X_test = df_test[feature_cols]
y_test = df_test["label"].astype(int)

# Linear preprocessor: impute + scale numeric; impute + one-hot categorical
prep_linear = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("impute", SimpleImputer(strategy="median")),
                          ("scale", StandardScaler())]), numeric_cols),
        ("cat", Pipeline([("impute", SimpleImputer(strategy="most_frequent")),
                          ("onehot", OneHotEncoder(handle_unknown="ignore"))]), categorical_cols),
    ]
)

# Tree preprocessor: impute numeric (no scaling); impute + one-hot categorical
prep_tree = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("impute", SimpleImputer(strategy="median"))]), numeric_cols),
        ("cat", Pipeline([("impute", SimpleImputer(strategy="most_frequent")),
                          ("onehot", OneHotEncoder(handle_unknown="ignore"))]), categorical_cols),
    ]
)

# ---- Day 4 fix: HistGradientBoosting needs DENSE (not sparse) input ----
def make_dense_ohe():
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)  # newer sklearn
    except TypeError:
        return OneHotEncoder(handle_unknown="ignore", sparse=False)         # older sklearn

prep_tree_dense = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("impute", SimpleImputer(strategy="median"))]), numeric_cols),
        ("cat", Pipeline([("impute", SimpleImputer(strategy="most_frequent")),
                          ("onehot", make_dense_ohe())]), categorical_cols),
    ]
)



### Define models (baseline + tree models)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

models = {
    "logreg_unweighted": Pipeline([
        ("prep", prep_linear),
        ("clf", LogisticRegression(max_iter=2000, solver="lbfgs"))
    ]),
    "decision_tree": Pipeline([
        ("prep", prep_tree),
        ("clf", DecisionTreeClassifier(
            max_depth=6,
            min_samples_leaf=200,
            random_state=42
        ))
    ]),
    "random_forest": Pipeline([
        ("prep", prep_tree),
        ("clf", RandomForestClassifier(
            n_estimators=400,
            max_depth=None,
            min_samples_leaf=50,
            n_jobs=-1,
            random_state=42
        ))
    ]),
    "hist_gb": Pipeline([
        ("prep", prep_tree_dense),   # <-- this is the fix
        ("clf", HistGradientBoostingClassifier(
            max_depth=6,
            learning_rate=0.05,
            max_iter=400,
            random_state=42
        ))
    ])
}


### Evaluation functions (PR-AUC, ROC-AUC, Brier, top-K)

In [10]:
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score, roc_auc_score, brier_score_loss

def eval_probs(y_true, p):
    y_true = np.asarray(y_true)
    return {
        "prevalence": float(y_true.mean()),
        "mean_p": float(np.mean(p)),
        "pr_auc": float(average_precision_score(y_true, p)),
        "roc_auc": float(roc_auc_score(y_true, p)),
        "brier": float(brier_score_loss(y_true, p)),
    }

def topk(y_true, p, fracs=(0.01, 0.05, 0.10, 0.20)):
    y_true = np.asarray(y_true)
    n = len(y_true)
    order = np.argsort(-p)
    rows = []
    for frac in fracs:
        k = max(1, int(np.floor(frac*n)))
        idx = order[:k]
        rows.append({
            "top_frac": frac,
            "k": k,
            "captured": int(np.sum(y_true[idx])),
            "precision_at_k": float(np.mean(y_true[idx])),
            "threshold": float(np.quantile(p, 1-frac))
        })
    return pd.DataFrame(rows)


### Train on TRAIN, select on VALID, report on TEST

In [11]:
leader = []

for name, pipe in models.items():
    pipe.fit(X_train, y_train)

    p_valid = pipe.predict_proba(X_valid)[:, 1]
    p_test  = pipe.predict_proba(X_test)[:, 1]

    m_valid = eval_probs(y_valid, p_valid)
    m_test  = eval_probs(y_test, p_test)

    tk_valid = topk(y_valid, p_valid)
    tk_test  = topk(y_test, p_test)

    leader.append({
        "model": name,
        "valid_pr_auc": m_valid["pr_auc"],
        "valid_brier": m_valid["brier"],
        "valid_top10_precision": float(tk_valid.loc[tk_valid["top_frac"]==0.10, "precision_at_k"]),
        "test_pr_auc": m_test["pr_auc"],
        "test_brier": m_test["brier"],
        "test_top10_precision": float(tk_test.loc[tk_test["top_frac"]==0.10, "precision_at_k"]),
    })

leaderboard = pd.DataFrame(leader).sort_values("valid_pr_auc", ascending=False)
leaderboard


  "valid_top10_precision": float(tk_valid.loc[tk_valid["top_frac"]==0.10, "precision_at_k"]),
  "test_top10_precision": float(tk_test.loc[tk_test["top_frac"]==0.10, "precision_at_k"]),
  "valid_top10_precision": float(tk_valid.loc[tk_valid["top_frac"]==0.10, "precision_at_k"]),
  "test_top10_precision": float(tk_test.loc[tk_test["top_frac"]==0.10, "precision_at_k"]),
  "valid_top10_precision": float(tk_valid.loc[tk_valid["top_frac"]==0.10, "precision_at_k"]),
  "test_top10_precision": float(tk_test.loc[tk_test["top_frac"]==0.10, "precision_at_k"]),
  "valid_top10_precision": float(tk_valid.loc[tk_valid["top_frac"]==0.10, "precision_at_k"]),
  "test_top10_precision": float(tk_test.loc[tk_test["top_frac"]==0.10, "precision_at_k"]),


Unnamed: 0,model,valid_pr_auc,valid_brier,valid_top10_precision,test_pr_auc,test_brier,test_top10_precision
3,hist_gb,0.224678,0.09683,0.259942,0.207833,0.091344,0.249132
2,random_forest,0.224002,0.097494,0.251697,0.204242,0.09186,0.247643
0,logreg_unweighted,0.222778,0.097112,0.257032,0.200228,0.091835,0.254591
1,decision_tree,0.197429,0.097766,0.245393,0.18144,0.092266,0.241191


### Save Day-4 artifacts

In [12]:
from pathlib import Path
import json

reports_dir = project_root / "Day-4" / "reports"
reports_dir.mkdir(parents=True, exist_ok=True)

leaderboard.to_csv(reports_dir / "DAY04_leaderboard.csv", index=False)

with open(reports_dir / "DAY04_leaderboard.json", "w", encoding="utf-8") as f:
    json.dump(leader, f, indent=2)

print("Saved:", reports_dir / "DAY04_leaderboard.csv")
print("Saved:", reports_dir / "DAY04_leaderboard.json")


Saved: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-4\reports\DAY04_leaderboard.csv
Saved: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-4\reports\DAY04_leaderboard.json


### Save winner model

This lets us reload the best model later without retraining.

In [13]:
import joblib

best_name = leaderboard.iloc[0]["model"]
best_model = models[best_name]
best_model.fit(X_train, y_train)   # ensure it's fit

joblib.dump(best_model, reports_dir / "DAY04_best_model.joblib")
print("Best model:", best_name)
print("Saved:", reports_dir / "DAY04_best_model.joblib")


Best model: hist_gb
Saved: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-4\reports\DAY04_best_model.joblib


### Make and save top-200 predictions on the TEST set (winner model)

In [14]:
import numpy as np
import pandas as pd

p_test_best = best_model.predict_proba(X_test)[:, 1]

top = df_test[["encounter_id", "person_id", "label"]].copy()
top["p_hat"] = p_test_best
top = top.sort_values("p_hat", ascending=False).head(200)

top.to_csv(reports_dir / "DAY04_top200_test_predictions.csv", index=False)
print("Saved:", reports_dir / "DAY04_top200_test_predictions.csv")


Saved: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-4\reports\DAY04_top200_test_predictions.csv


Close DB:

In [15]:
con.close()
print("DuckDB closed.")


DuckDB closed.
