### Setup + connect to DuckDB (robust paths)

In [1]:
from pathlib import Path
import duckdb
import pandas as pd
import numpy as np
import json

def find_repo_root(start=None):
    """Walk upward from ``start`` until a directory containing ``Day-1`` is found.

    Parameters
    ----------
    start : str | os.PathLike | None
        Where to begin the search. A falsy value (the default ``None``)
        means the current working directory.

    Returns
    -------
    pathlib.Path
        The resolved ``start`` directory, or its nearest ancestor, that
        contains an entry named ``Day-1``.

    Raises
    ------
    FileNotFoundError
        If the filesystem root is reached without finding ``Day-1``.
    """
    # Coerce through Path so a plain string works too; calling .resolve()
    # directly on a str raises AttributeError.
    origin = (Path(start) if start else Path.cwd()).resolve()
    # `parents` runs from the immediate parent up to the filesystem root.
    for candidate in (origin, *origin.parents):
        if (candidate / "Day-1").exists():
            return candidate
    raise FileNotFoundError("Could not find repo root (expected a Day-1 folder).")

# Locate the repository, then the first warehouse file that exists on disk.
repo = find_repo_root()
print("Repo root:", repo)

# Candidates are tried in list order.
db_candidates = [
    repo / "Day-13" / "data" / "warehouse" / "day13_noshow.duckdb",
    repo / "Day-11" / "data" / "warehouse" / "day11_noshow.duckdb",
]
db_path = None
for candidate in db_candidates:
    if candidate.exists():
        db_path = candidate
        break
print("DB path:", db_path)
if db_path is None:
    raise FileNotFoundError("Could not find Day-13 or Day-11 noshow DuckDB.")

# Open the warehouse and list its tables as a quick sanity check.
con = duckdb.connect(str(db_path))
table_rows = con.execute("SHOW TABLES").fetchall()
print("Tables:", [row[0] for row in table_rows])


Repo root: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science
DB path: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-11\data\warehouse\day11_noshow.duckdb
Tables: ['bronze_appointments', 'gold_appointments_base', 'gold_appointments_features_v1', 'gold_appointments_features_v1_patient_split', 'silver_appointments', 'split_patient_v1']


### Load the feature contract (from Day 13)

In [2]:
# Pick the first feature-contract file that exists (tried in list order).
contract_candidates = [
    repo / "Day-13" / "artifacts" / "noshow_feature_contract_v1.json",
    repo / "Day-13" / "artifacts" / "noshow_feature_contract_v2.json",
]
contract_path = None
for candidate in contract_candidates:
    if candidate.exists():
        contract_path = candidate
        break
print("Contract:", contract_path)
if contract_path is None:
    raise FileNotFoundError("Could not find noshow feature contract in Day-13/artifacts.")

with open(contract_path, encoding="utf-8") as contract_file:
    contract = json.load(contract_file)

# Unpack the contract into the names the rest of the notebook uses.
id_cols, label_col, treat_col = (
    contract[key] for key in ("id_cols", "label_col", "treatment_col")
)
feature_cols, numeric_cols, categorical_cols = (
    contract[key] for key in ("feature_cols", "numeric_cols", "categorical_cols")
)

print("n_features:", len(feature_cols))
print("numeric:", len(numeric_cols), "categorical:", len(categorical_cols))
print("label_col:", label_col, "| treatment_col:", treat_col)


Contract: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-13\artifacts\noshow_feature_contract_v1.json
n_features: 19
numeric: 16 categorical: 3
label_col: label | treatment_col: sms_received


### Load gold + splits into pandas (train/valid/test)

You created the view on Day 14: gold_appointments_features_v1_patient_split.

In [3]:
view_name = "gold_appointments_features_v1_patient_split"

# Column lists come from the feature contract; pull ids, label, treatment,
# split assignment and every feature in a single query.
id_sql = ", ".join(id_cols)
feature_sql = ", ".join(feature_cols)
split_query = f"""
SELECT {id_sql}, {label_col}, {treat_col}, split,
       {feature_sql}
FROM {view_name}
"""
df = con.execute(split_query).df()

# Row count, split sizes and overall label rate as a quick sanity check.
print("Rows:", len(df))
print(df["split"].value_counts())
print("Prevalence overall:", df[label_col].mean())


Rows: 110516
split
train    77368
valid    16649
test     16499
Name: count, dtype: int64
Prevalence overall: 0.20188027073003004


### Build X/y per split (and confirm no leakage columns)

We do not include `sms_received` in the predictive feature set because it is the treatment: tomorrow you’ll be deciding who gets an SMS, so the risk score must not depend on whether one was already sent.

In [4]:
# Leakage guards run first, so a bad contract fails before any feature
# matrix is built: neither the treatment nor the label may be a predictor.
assert treat_col not in feature_cols, "treatment_col is in feature_cols — remove it from the contract."
assert label_col not in feature_cols, "label_col is in feature_cols — remove it from the contract."

# Every row must belong to one of the three known splits; the filters below
# would otherwise drop rows with an unexpected or missing split value silently.
bad_split = ~df["split"].isin(["train", "valid", "test"])
assert not bad_split.any(), f"{int(bad_split.sum())} rows have a split value outside train/valid/test."

df_train = df[df["split"]=="train"].copy()
df_valid = df[df["split"]=="valid"].copy()
df_test  = df[df["split"]=="test"].copy()

# X holds only the contract's feature columns; y is the label cast to int.
X_train = df_train[feature_cols]
y_train = df_train[label_col].astype(int)

X_valid = df_valid[feature_cols]
y_valid = df_valid[label_col].astype(int)

X_test  = df_test[feature_cols]
y_test  = df_test[label_col].astype(int)

print("Train rows:", len(X_train), "prev:", y_train.mean())
print("Valid rows:", len(X_valid), "prev:", y_valid.mean())
print("Test rows :", len(X_test),  "prev:", y_test.mean())


Train rows: 77368 prev: 0.20301675111157066
Valid rows: 16649 prev: 0.20349570544777465
Test rows : 16499 prev: 0.19492090429723014


### Preprocessing (sparse for logistic, dense for HistGB)

HistGradientBoosting needs dense input, so we’ll define a dense preprocessor for tree models.

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Linear model preprocessing (sparse output allowed).
# Numeric: median-impute, then scale without centering.
_linear_numeric = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler(with_mean=False)),  # OK for sparse pipelines
])
# Categorical: mode-impute, then one-hot; categories unseen at fit time are ignored.
_linear_categorical = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),  # sparse by default
])
prep_linear = ColumnTransformer(
    transformers=[
        ("num", _linear_numeric, numeric_cols),
        ("cat", _linear_categorical, categorical_cols),
    ],
    remainder="drop",
)

# Tree model preprocessing (dense output).
# Numeric: median-impute only; no scaling step.
_tree_numeric = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
])
# Categorical: mode-impute, then a dense float32 one-hot (float32 helps memory).
_tree_categorical = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.float32)),
])
prep_tree_dense = ColumnTransformer(
    transformers=[
        ("num", _tree_numeric, numeric_cols),
        ("cat", _tree_categorical, categorical_cols),
    ],
    remainder="drop",
)


### Models (baseline leaderboard)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

from sklearn.base import clone

# Candidate models for the baseline leaderboard, keyed by short name.
# Each pipeline gets its own unfitted copy of the preprocessor. Sharing one
# ColumnTransformer instance would make fitting any pipeline also refit the
# preprocessing step inside the others.
models = {
    "logreg": Pipeline([
        ("prep", clone(prep_linear)),
        ("clf", LogisticRegression(max_iter=3000, solver="lbfgs"))
    ]),
    "tree": Pipeline([
        ("prep", clone(prep_tree_dense)),
        ("clf", DecisionTreeClassifier(max_depth=6, min_samples_leaf=200, random_state=42))
    ]),
    "rf": Pipeline([
        ("prep", clone(prep_tree_dense)),
        ("clf", RandomForestClassifier(
            n_estimators=400, min_samples_leaf=50, n_jobs=-1, random_state=42
        ))
    ]),
    "hist_gb": Pipeline([
        ("prep", clone(prep_tree_dense)),
        ("clf", HistGradientBoostingClassifier(
            max_depth=6, learning_rate=0.05, max_iter=400, random_state=42
        ))
    ]),
}


### Metrics + Top-K targeting function (safe logloss)

In [7]:
from sklearn.metrics import average_precision_score, roc_auc_score, brier_score_loss, log_loss

def compute_metrics(y_true, p):
    """Summarise predicted probabilities against binary labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    p : array-like
        Predicted probability of the positive class, aligned with ``y_true``.

    Returns
    -------
    dict
        Plain-float values for ``prevalence``, ``mean_p``, ``median_p``,
        ``pr_auc``, ``roc_auc``, ``brier`` and ``logloss``.
    """
    labels = np.asarray(y_true)
    probs = np.asarray(p)
    # Only log loss sees the clipped copy, so exact 0/1 predictions cannot
    # produce an infinite loss; every other metric uses the raw values.
    probs_for_logloss = np.clip(probs, 1e-15, 1-1e-15)
    raw = {
        "prevalence": labels.mean(),
        "mean_p": probs.mean(),
        "median_p": np.median(probs),
        "pr_auc": average_precision_score(labels, probs),
        "roc_auc": roc_auc_score(labels, probs),
        "brier": brier_score_loss(labels, probs),
        "logloss": log_loss(labels, probs_for_logloss),
    }
    # Cast to builtin floats so the result is JSON-serialisable.
    return {metric: float(value) for metric, value in raw.items()}

def topk_metrics(y_true, p, frac):
    """Precision and capture count when targeting the top ``frac`` of rows by score.

    Parameters
    ----------
    y_true : array-like
        Binary labels (1 = positive).
    p : array-like
        Scores aligned with ``y_true``; higher scores are targeted first.
    frac : float
        Fraction of rows to target. ``k = int(frac * n)`` is clamped to
        ``[1, n]``, so at least one row and at most every row is selected.

    Returns
    -------
    dict
        ``top_frac`` (echo of ``frac``), ``k`` (rows targeted), ``captured``
        (positives among them), ``precision_at_k`` (``captured / k``) and
        ``threshold`` (score of the k-th ranked row).

    Raises
    ------
    ValueError
        If the inputs are empty or have different lengths.
    """
    y_true = np.asarray(y_true)
    p = np.asarray(p)
    n = len(y_true)
    if n == 0:
        raise ValueError("topk_metrics requires at least one row.")
    if len(p) != n:
        raise ValueError(f"y_true and p must have the same length (got {n} and {len(p)}).")
    # Clamp to n: without it, frac > 1 gives k > n, which inflates the
    # precision denominator and indexes past the start of the sorted scores.
    k = min(n, max(1, int(frac * n)))
    idx = np.argsort(-p)[:k]
    captured = int(y_true[idx].sum())
    prec = float(captured / k)
    # idx is in descending-score order, so its last entry is the k-th highest
    # score; no second full sort is needed.
    thr = float(p[idx[-1]])
    return {"top_frac": frac, "k": k, "captured": captured, "precision_at_k": prec, "threshold": thr}


### Fit all models, compare on VALID (TEST columns are reported for reference only — model selection uses VALID)

In [8]:
top_fracs = [0.01, 0.05, 0.10, 0.20]
leader = []


def _evaluate_split(fitted_pipe, X_part, y_part):
    """Return (metrics dict, precision in the top 10%) for one data split."""
    p_part = fitted_pipe.predict_proba(X_part)[:, 1]
    # one operational metric to compare quickly
    top10_precision = topk_metrics(y_part, p_part, 0.10)["precision_at_k"]
    return compute_metrics(y_part, p_part), top10_precision


# Fit every candidate on TRAIN and score it on VALID and TEST.
for name, pipe in models.items():
    pipe.fit(X_train, y_train)

    m_valid, t10_valid = _evaluate_split(pipe, X_valid, y_valid)
    m_test, t10_test = _evaluate_split(pipe, X_test, y_test)

    row = {"model": name}
    row["valid_pr_auc"] = m_valid["pr_auc"]
    row["valid_brier"] = m_valid["brier"]
    row["valid_logloss"] = m_valid["logloss"]
    row["valid_top10_precision"] = t10_valid
    row["test_pr_auc"] = m_test["pr_auc"]
    row["test_brier"] = m_test["brier"]
    row["test_logloss"] = m_test["logloss"]
    row["test_top10_precision"] = t10_test
    leader.append(row)

# Best model first: lowest VALID Brier score, ties broken by highest VALID PR-AUC.
leader_df = pd.DataFrame(leader).sort_values(
    by=["valid_brier", "valid_pr_auc"],
    ascending=[True, False],
)
leader_df


Unnamed: 0,model,valid_pr_auc,valid_brier,valid_logloss,valid_top10_precision,test_pr_auc,test_brier,test_logloss,test_top10_precision
3,hist_gb,0.371375,0.143703,0.439805,0.41887,0.359799,0.139529,0.429035,0.410552
2,rf,0.363924,0.144681,0.4432,0.405048,0.352447,0.140471,0.432553,0.398423
1,tree,0.335357,0.145812,0.44621,0.36899,0.324296,0.141487,0.4349,0.351122
0,logreg,0.341447,0.146467,0.449605,0.378005,0.335375,0.141968,0.438471,0.378411


### Calibrate the best model with Platt scaling on VALID

We avoid CalibratedClassifierCV(cv="prefit") because your sklearn version rejects it. This Platt approach works everywhere.

In [9]:
import joblib
from sklearn.linear_model import LogisticRegression

# The leaderboard is sorted best-first, so its first row names the winner.
best_name = leader_df.iloc[0]["model"]
best_model = models[best_name]

# Refit best model on TRAIN only (keeps VALID clean for calibration)
best_model.fit(X_train, y_train)
p_valid_raw, p_test_raw = (
    best_model.predict_proba(X_part)[:, 1] for X_part in (X_valid, X_test)
)

# Platt scaling: a one-feature logistic regression that maps the raw
# probability to a calibrated one, fitted on VALID.
valid_column = p_valid_raw.reshape(-1, 1)
test_column = p_test_raw.reshape(-1, 1)
platt = LogisticRegression(max_iter=2000, solver="lbfgs").fit(valid_column, y_valid)

p_valid_hat = platt.predict_proba(valid_column)[:, 1]
p_test_hat = platt.predict_proba(test_column)[:, 1]

print("BEST MODEL:", best_name)
for split_label, y_part, p_part in (
    ("VALID (calibrated):", y_valid, p_valid_hat),
    ("TEST  (calibrated):", y_test, p_test_hat),
):
    print(split_label, compute_metrics(y_part, p_part))

# Top-K targeting report on TEST using the calibrated probabilities.
for frac in top_fracs:
    print("TEST", topk_metrics(y_test, p_test_hat, frac))


BEST MODEL: hist_gb
VALID (calibrated): {'prevalence': 0.20349570544777465, 'mean_p': 0.20356071860360694, 'median_p': 0.17965294343169308, 'pr_auc': 0.37137528441261786, 'roc_auc': 0.7368793740279505, 'brier': 0.1452222568330195, 'logloss': 0.44661341284334144}
TEST  (calibrated): {'prevalence': 0.19492090429723014, 'mean_p': 0.20335601817356758, 'median_p': 0.1800428200244986, 'pr_auc': 0.3597989973811414, 'roc_auc': 0.7397114639480457, 'brier': 0.14084112620677178, 'logloss': 0.4358437775024725}
TEST {'top_frac': 0.01, 'k': 164, 'captured': 85, 'precision_at_k': 0.5182926829268293, 'threshold': 0.6005846754320254}
TEST {'top_frac': 0.05, 'k': 824, 'captured': 362, 'precision_at_k': 0.4393203883495146, 'threshold': 0.4507367716001566}
TEST {'top_frac': 0.1, 'k': 1649, 'captured': 677, 'precision_at_k': 0.4105518496058217, 'threshold': 0.39323639605387173}
TEST {'top_frac': 0.2, 'k': 3299, 'captured': 1220, 'precision_at_k': 0.36980903304031526, 'threshold': 0.31780418585262915}


### Save artifacts for Day 16–20 (model + calibrator + contract + metadata)

In [10]:
from datetime import datetime

# Output folders under Day-15, created if they do not exist yet.
art_dir = repo / "Day-15" / "artifacts"
rep_dir = repo / "Day-15" / "reports"
for out_dir in (art_dir, rep_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

base_model_path = art_dir / "noshow_base_model.joblib"
platt_path = art_dir / "noshow_platt_calibrator.joblib"
contract_copy_path = art_dir / "noshow_feature_contract.json"
metrics_path = rep_dir / "day15_metrics.json"

# Persist the fitted pipeline and its calibrator as two separate files.
for fitted_obj, target_path in ((best_model, base_model_path), (platt, platt_path)):
    joblib.dump(fitted_obj, target_path)

# Store a copy of the contract that was used next to the model files.
contract_copy_path.write_text(json.dumps(contract, indent=2), encoding="utf-8")

# Run metadata plus the calibrated metrics, written as one JSON report.
summary = dict(
    timestamp_local=datetime.now().isoformat(timespec="seconds"),
    db_path=str(db_path),
    view_used=view_name,
    best_model=best_name,
    leaderboard_valid_sorted=leader_df.to_dict(orient="records"),
    valid_calibrated_metrics=compute_metrics(y_valid, p_valid_hat),
    test_calibrated_metrics=compute_metrics(y_test, p_test_hat),
    test_topk_calibrated=[topk_metrics(y_test, p_test_hat, frac) for frac in top_fracs],
)
metrics_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")

for saved_label, saved_path in (
    ("Saved base model ->", base_model_path),
    ("Saved platt      ->", platt_path),
    ("Saved contract   ->", contract_copy_path),
    ("Saved metrics    ->", metrics_path),
):
    print(saved_label, saved_path)


Saved base model -> C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-15\artifacts\noshow_base_model.joblib
Saved platt      -> C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-15\artifacts\noshow_platt_calibrator.joblib
Saved contract   -> C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-15\artifacts\noshow_feature_contract.json
Saved metrics    -> C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-15\reports\day15_metrics.json


### Create Day-15/reports/DAY15.md (so Day 15 isn’t empty)

In [11]:
# Write a short Markdown summary of the day next to the metrics report.
md_path = repo / "Day-15" / "reports" / "DAY15.md"
# Artifact paths are rendered relative to the repo root. as_posix() forces
# forward slashes so the generated document is identical on every OS;
# on Windows a bare relative path would render with backslashes.
md_path.write_text(
f"""# Day 15 — Predictive no-show model + calibration

Today I trained leakage-safe predictive baselines using the Day 14 patient split (no patient overlap across train/valid/test).

I compared multiple models on VALID using probability quality metrics (Brier score / log loss) and ranking metrics (PR-AUC, ROC-AUC, top-K precision). I selected the best model by VALID Brier score and calibrated its probabilities using Platt scaling fit on the VALID set.

Artifacts saved for later (Day 16–20):
- `{base_model_path.relative_to(repo).as_posix()}`
- `{platt_path.relative_to(repo).as_posix()}`
- `{contract_copy_path.relative_to(repo).as_posix()}`
- `{metrics_path.relative_to(repo).as_posix()}`

Next: Day 16 begins causal work (propensity + ATE) using SMS_received as treatment.
""",
encoding="utf-8")
print("Wrote:", md_path)


Wrote: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-15\reports\DAY15.md
