### Load data from DuckDB

In [1]:
from pathlib import Path
import duckdb
import pandas as pd
import numpy as np

project_root = Path.cwd().resolve()
while not (project_root / "Day-1").exists():
    if project_root == project_root.parent:
        raise FileNotFoundError("Could not find project root containing Day-1.")
    project_root = project_root.parent

db_path = project_root / "Day-1" / "data" / "warehouse" / "day1.duckdb"
con = duckdb.connect(str(db_path))
print("Connected to:", db_path)

df = con.execute("SELECT * FROM gold_diabetes_features_v1").df()
print("df shape:", df.shape)
df.head()


Connected to: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-1\data\warehouse\day1.duckdb
df shape: (101766, 20)


Unnamed: 0,encounter_id,person_id,label,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,diabetesMed,change,insulin,A1Cresult
0,2278392,8222157,0,1,41,0,1,0,0,0,Caucasian,Female,[0-10),6,25,1,No,No,No,
1,149190,55629189,0,3,59,0,18,0,0,0,Caucasian,Female,[10-20),1,1,7,Yes,Ch,Up,
2,64410,86047875,0,2,11,5,13,2,0,1,AfricanAmerican,Female,[20-30),1,1,7,Yes,No,No,
3,500364,82442376,0,2,44,1,16,0,0,0,Caucasian,Male,[30-40),1,1,7,Yes,Ch,Up,
4,16680,42519267,0,1,51,0,8,0,0,0,Caucasian,Male,[40-50),1,1,7,Yes,Ch,Steady,


### Leakage-safe split (same as Day 4–6)

In [2]:
from sklearn.model_selection import GroupShuffleSplit

y = df["label"].astype(int)
groups = df["person_id"]

gss1 = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
idx_trainval, idx_test = next(gss1.split(df, y, groups=groups))

df_trainval = df.iloc[idx_trainval].copy()
df_test = df.iloc[idx_test].copy()

gss2 = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
idx_train, idx_valid = next(gss2.split(df_trainval, df_trainval["label"].astype(int),
                                       groups=df_trainval["person_id"]))

df_train = df_trainval.iloc[idx_train].copy()
df_valid = df_trainval.iloc[idx_valid].copy()

print("Train/Valid/Test:", df_train.shape[0], df_valid.shape[0], df_test.shape[0])
print("Prevalence train/valid/test:",
      float(df_train["label"].mean()),
      float(df_valid["label"].mean()),
      float(df_test["label"].mean()))

print("Overlap train-valid:", len(set(df_train["person_id"]) & set(df_valid["person_id"])))
print("Overlap train-test:", len(set(df_train["person_id"]) & set(df_test["person_id"])))
print("Overlap valid-test:", len(set(df_valid["person_id"]) & set(df_test["person_id"])))


Train/Valid/Test: 60988 20625 20153
Prevalence train/valid/test: 0.11218600380402702 0.11461818181818181 0.10673348881059892
Overlap train-valid: 0
Overlap train-test: 0
Overlap valid-test: 0


### Preprocess + model + Day-6 style cross-fitted Platt calibration

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression

id_cols = ["encounter_id", "person_id", "label"]
feature_cols = [c for c in df.columns if c not in id_cols]

numeric_cols = [
    "time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications",
    "number_outpatient", "number_emergency", "number_inpatient"
]
categorical_cols = [c for c in feature_cols if c not in numeric_cols]

X_train = df_train[feature_cols]
y_train = df_train["label"].astype(int).to_numpy()
g_train = df_train["person_id"].to_numpy()

X_valid = df_valid[feature_cols]
y_valid = df_valid["label"].astype(int).to_numpy()

X_test = df_test[feature_cols]
y_test = df_test["label"].astype(int).to_numpy()

def make_dense_ohe():
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown="ignore", sparse=False)

prep_tree_dense = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("impute", SimpleImputer(strategy="median"))]), numeric_cols),
        ("cat", Pipeline([("impute", SimpleImputer(strategy="most_frequent")),
                          ("onehot", make_dense_ohe())]), categorical_cols),
    ]
)

def make_base_model():
    return Pipeline([
        ("prep", prep_tree_dense),
        ("clf", HistGradientBoostingClassifier(
            max_depth=6,
            learning_rate=0.05,
            max_iter=400,
            random_state=42
        ))
    ])

def clip01(p, eps=1e-15):
    p = np.asarray(p)
    return np.clip(p, eps, 1 - eps)

# --- OOF predictions on TRAIN (grouped) ---
K = 5
gkf = GroupKFold(n_splits=K)
p_oof = np.zeros(len(df_train), dtype=float)

for fold, (tr_idx, cal_idx) in enumerate(gkf.split(X_train, y_train, groups=g_train), start=1):
    m = make_base_model()
    m.fit(X_train.iloc[tr_idx], y_train[tr_idx])
    p_oof[cal_idx] = m.predict_proba(X_train.iloc[cal_idx])[:, 1]
    print(f"Fold {fold}/{K} done. cal size={len(cal_idx)}")

# --- Platt scaling (sigmoid) on OOF predictions ---
p_oof_c = clip01(p_oof)
z_oof = np.log(p_oof_c / (1 - p_oof_c)).reshape(-1, 1)

platt = LogisticRegression(solver="lbfgs", C=1e6, max_iter=2000)
platt.fit(z_oof, y_train)

# --- Fit final model on full TRAIN ---
final_model = make_base_model()
final_model.fit(X_train, y_train)

p_test_raw = final_model.predict_proba(X_test)[:, 1]
z_test = np.log(clip01(p_test_raw) / (1 - clip01(p_test_raw))).reshape(-1, 1)
p_test_hat = platt.predict_proba(z_test)[:, 1]

print("Done. Have p_test_raw and p_test_hat.")


Fold 1/5 done. cal size=12198
Fold 2/5 done. cal size=12198
Fold 3/5 done. cal size=12198
Fold 4/5 done. cal size=12197
Fold 5/5 done. cal size=12197
Done. Have p_test_raw and p_test_hat.


### Create a “scored table” + save to CSV

In [4]:
scored_test = df_test[["encounter_id", "person_id", "label"]].copy()
scored_test["p_raw"] = p_test_raw
scored_test["p_hat"] = p_test_hat  # calibrated probability
scored_test = scored_test.sort_values("p_hat", ascending=False)

reports_dir = project_root / "Day-7" / "reports"
reports_dir.mkdir(parents=True, exist_ok=True)

scored_path = reports_dir / "DAY07_scored_test.csv"
scored_test.to_csv(scored_path, index=False)

print("Saved:", scored_path)
scored_test.head()


Saved: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-7\reports\DAY07_scored_test.csv


Unnamed: 0,encounter_id,person_id,label,p_raw,p_hat
67387,189144708,42941232,1,0.610624,0.612003
38644,120136542,23838849,1,0.556518,0.557934
87183,277879686,88227540,1,0.542616,0.544036
67883,190944528,57751650,0,0.517605,0.519027
42842,132138702,76743099,0,0.511674,0.513096


### Write the scored table into DuckDB (so it behaves like a warehouse “gold” output)

In [5]:
# Write to DuckDB as a new table (replace each time you run Day 7)
con.execute("CREATE OR REPLACE TABLE gold_diabetes_scored_day7 AS SELECT * FROM scored_test")

print("DuckDB table created: gold_diabetes_scored_day7")
print("Rows:", con.execute("SELECT COUNT(*) FROM gold_diabetes_scored_day7").fetchone()[0])


DuckDB table created: gold_diabetes_scored_day7
Rows: 20153


### Make the 4 evaluation plots and save them (PR, ROC, Calibration, Lift)

In [6]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, roc_curve, average_precision_score, roc_auc_score

y = scored_test["label"].to_numpy()
p = scored_test["p_hat"].to_numpy()

# --- PR curve ---
prec, rec, _ = precision_recall_curve(y, p)
ap = average_precision_score(y, p)

plt.figure()
plt.plot(rec, prec)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"Precision–Recall Curve (AP={ap:.3f})")
plt.tight_layout()
plt.savefig(reports_dir / "DAY07_pr_curve.png", dpi=200)
plt.close()

# --- ROC curve ---
fpr, tpr, _ = roc_curve(y, p)
auc = roc_auc_score(y, p)

plt.figure()
plt.plot(fpr, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title(f"ROC Curve (AUC={auc:.3f})")
plt.tight_layout()
plt.savefig(reports_dir / "DAY07_roc_curve.png", dpi=200)
plt.close()

# --- Calibration (reliability) plot using deciles ---
df_cal = scored_test.copy()
df_cal["bin"] = pd.qcut(df_cal["p_hat"], q=10, duplicates="drop")
cal = df_cal.groupby("bin").agg(mean_p=("p_hat", "mean"),
                                obs_rate=("label", "mean"),
                                n=("label", "size")).reset_index()

plt.figure()
plt.plot(cal["mean_p"], cal["obs_rate"], marker="o")
plt.plot([0,1],[0,1], linestyle="--")
plt.xlabel("Mean predicted probability")
plt.ylabel("Observed event rate")
plt.title("Calibration Plot (Deciles)")
plt.tight_layout()
plt.savefig(reports_dir / "DAY07_calibration.png", dpi=200)
plt.close()

# --- Lift / Gains: cumulative captured positives vs population fraction ---
df_gain = scored_test.copy()
df_gain["cum_pos"] = df_gain["label"].cumsum()
total_pos = df_gain["label"].sum()
df_gain["cum_recall"] = df_gain["cum_pos"] / (total_pos if total_pos > 0 else 1)
df_gain["pop_frac"] = (np.arange(len(df_gain)) + 1) / len(df_gain)

plt.figure()
plt.plot(df_gain["pop_frac"], df_gain["cum_recall"])
plt.plot([0,1],[0,1], linestyle="--")
plt.xlabel("Fraction of population targeted")
plt.ylabel("Fraction of positives captured (recall)")
plt.title("Cumulative Gains (Lift) Curve")
plt.tight_layout()
plt.savefig(reports_dir / "DAY07_gains_curve.png", dpi=200)
plt.close()

print("Saved plots into:", reports_dir)


  cal = df_cal.groupby("bin").agg(mean_p=("p_hat", "mean"),


Saved plots into: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-7\reports


### Top-K table (what leadership actually understands)

In [7]:
def topk_summary(y_true, p, frac):
    y_true = np.asarray(y_true)
    p = np.asarray(p)
    n = len(y_true)
    k = max(1, int(np.floor(frac*n)))
    order = np.argsort(-p)
    idx = order[:k]
    return {
        "top_frac": frac,
        "k": k,
        "captured": int(y_true[idx].sum()),
        "precision_at_k": float(y_true[idx].mean()),
        "threshold": float(np.quantile(p, 1-frac))
    }

rows = [topk_summary(y, p, f) for f in [0.01, 0.05, 0.10, 0.20]]
topk_df = pd.DataFrame(rows)
topk_df


Unnamed: 0,top_frac,k,captured,precision_at_k,threshold
0,0.01,201,73,0.363184,0.349429
1,0.05,1007,304,0.301887,0.231791
2,0.1,2015,502,0.249132,0.190383
3,0.2,4030,829,0.205707,0.143511


Save it:

In [8]:
topk_df.to_csv(reports_dir / "DAY07_topk_summary.csv", index=False)
print("Saved:", reports_dir / "DAY07_topk_summary.csv")

Saved: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-7\reports\DAY07_topk_summary.csv


### Write DAY07.md (a mini model card/report)

In [9]:
from sklearn.metrics import brier_score_loss, log_loss

p_clip = np.clip(p, 1e-15, 1-1e-15)

report = f"""# Day 7 — Scoring output + Model report

## What I produced
- A scored test table with calibrated probabilities (p_hat)
- A DuckDB gold-style scored table: gold_diabetes_scored_day7
- Four standard evaluation plots: PR, ROC, Calibration, Gains (Lift)
- A top-K summary table (capacity targeting)

## Test-set metrics (calibrated probabilities)
Prevalence: {y.mean():.6f}
PR-AUC (Average Precision): {average_precision_score(y, p):.6f}
ROC-AUC: {roc_auc_score(y, p):.6f}
Brier score: {brier_score_loss(y, p):.6f}
Log loss: {log_loss(y, p_clip, labels=[0,1]):.6f}

## Files
- DAY07_scored_test.csv
- DAY07_topk_summary.csv
- DAY07_pr_curve.png
- DAY07_roc_curve.png
- DAY07_calibration.png
- DAY07_gains_curve.png
"""

md_path = reports_dir / "DAY07.md"
md_path.write_text(report, encoding="utf-8")
print("Saved:", md_path)


Saved: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-7\reports\DAY07.md


### Close DuckDB

In [10]:
con.close()
print("DuckDB closed.")


DuckDB closed.
