### Load scored predictions from Day 7

Day 7 already produced a scored test table. That is exactly what the bootstrap needs: one row per encounter with the true label and the predicted probability, plus `person_id` so we can resample by patient.

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# Search the working directory and each of its ancestors for the folder
# that contains "Day-1"; that folder is treated as the project root.
start_dir = Path.cwd().resolve()
project_root = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if (candidate / "Day-1").exists()
    ),
    None,
)
if project_root is None:
    raise FileNotFoundError("Could not find project root containing Day-1.")

# The scored test table is read from Day 7's reports folder.
scored_path = project_root.joinpath("Day-7", "reports", "DAY07_scored_test.csv")
scored = pd.read_csv(scored_path)

print("Loaded:", scored_path)
print("Shape:", scored.shape)
scored.head()


Loaded: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-7\reports\DAY07_scored_test.csv
Shape: (20153, 5)


Unnamed: 0,encounter_id,person_id,label,p_raw,p_hat
0,189144708,42941232,1,0.610624,0.612003
1,120136542,23838849,1,0.556518,0.557934
2,277879686,88227540,1,0.542616,0.544036
3,190944528,57751650,0,0.517605,0.519027
4,132138702,76743099,0,0.511674,0.513096


### Sanity checks

In [2]:
# Columns that the rest of the notebook reads from the scored table.
required_cols = {"encounter_id", "person_id", "label", "p_hat"}
assert required_cols <= set(scored.columns)

# Quick size/prevalence summary: rows are encounters, person_id identifies people.
n_encounters = len(scored)
print("Prevalence:", scored["label"].mean())
print("People:", scored["person_id"].nunique())
print("Encounters:", n_encounters)


Prevalence: 0.10673348881059892
People: 14304
Encounters: 20153


### Define the metrics we will bootstrap

In [3]:
from sklearn.metrics import (
    average_precision_score,
    roc_auc_score,
    brier_score_loss,
    log_loss
)

def clip01(p, eps=1e-15):
    """Clamp values into the closed interval [eps, 1 - eps].

    Parameters
    ----------
    p : array-like
        Values to clamp; converted with ``np.asarray``.
    eps : float, default 1e-15
        Distance kept from both 0 and 1.

    Returns
    -------
    numpy.ndarray
        The clamped values.
    """
    lower, upper = eps, 1 - eps
    return np.clip(np.asarray(p), lower, upper)

def compute_metrics(y, p):
    """Compute the scalar evaluation metrics for one set of labels and scores.

    Parameters
    ----------
    y : array-like
        Labels; cast to ``int``.
    p : array-like
        Predicted probabilities; cast to ``float``.

    Returns
    -------
    dict
        Plain-float values under the keys ``prevalence``, ``pr_auc``,
        ``roc_auc``, ``brier``, ``logloss``, ``mean_p`` and ``median_p``.
    """
    labels = np.asarray(y).astype(int)
    probs = np.asarray(p).astype(float)

    # Only log loss receives the clipped probabilities; every other metric
    # is computed on the probabilities as given.
    clipped = clip01(probs)

    return {
        "prevalence": float(labels.mean()),
        "pr_auc": float(average_precision_score(labels, probs)),
        "roc_auc": float(roc_auc_score(labels, probs)),
        "brier": float(brier_score_loss(labels, probs)),
        "logloss": float(log_loss(labels, clipped, labels=[0, 1])),
        "mean_p": float(probs.mean()),
        "median_p": float(np.median(probs)),
    }

def precision_at_top_frac(y, p, frac):
    """Precision among the highest-scored fraction of rows.

    Parameters
    ----------
    y : array-like
        Labels; cast to ``int``.
    p : array-like
        Scores; cast to ``float``. Rows are ranked by descending score.
    frac : float
        Fraction of rows to select.

    Returns
    -------
    tuple of (float, int, int)
        ``(precision, captured, k)``: the mean label among the selected
        rows, the sum of their labels, and the selection size ``k``.
    """
    labels = np.asarray(y).astype(int)
    scores = np.asarray(p).astype(float)

    # k is floor(frac * n), but never less than one row.
    k = max(1, int(np.floor(frac * len(labels))))

    # Negating the scores turns the ascending argsort into a descending rank.
    top_labels = labels[np.argsort(-scores)[:k]]
    return float(top_labels.mean()), int(top_labels.sum()), int(k)

# Point estimates on the full scored table (what you already report)
y0, p0 = (scored[col].to_numpy() for col in ("label", "p_hat"))

point = compute_metrics(y0, p0)
print("Point metrics:", point)

# Precision / captured positives / cutoff size at each top fraction.
for frac in (0.01, 0.05, 0.10, 0.20):
    prec, captured, k = precision_at_top_frac(y0, p0, frac)
    print(f"Top {int(frac*100)}%: precision={prec:.6f} captured={captured} k={k}")


Point metrics: {'prevalence': 0.10673348881059892, 'pr_auc': 0.2078325210559714, 'roc_auc': 0.6666140354981995, 'brier': 0.09134823864273871, 'logloss': 0.3211129468363118, 'mean_p': 0.11166645743436704, 'median_p': 0.0979470342918689}
Top 1%: precision=0.363184 captured=73 k=201
Top 5%: precision=0.301887 captured=304 k=1007
Top 10%: precision=0.249132 captured=502 k=2015
Top 20%: precision=0.205707 captured=829 k=4030


### Patient-level (cluster) bootstrap

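Key idea: sample patients with replacement, then include all of each sampled patient's encounters. Encounters from the same patient are not independent, so resampling individual rows would understate the uncertainty.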

In [4]:
def cluster_bootstrap_indices(df, group_col, rng):
    """Draw one cluster-bootstrap sample and return the selected row labels.

    Groups are sampled with replacement (as many draws as there are distinct
    groups), and every row of each sampled group is included once per draw.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``group_col``.
    group_col : str
        Column whose values define the clusters.
    rng : numpy.random.Generator
        Random generator used for the group draw.

    Returns
    -------
    numpy.ndarray
        Index labels of ``df`` (suitable for ``df.loc``), ordered by sampled
        group and, within a group, by row order in ``df``. Empty when ``df``
        has no groups.
    """
    groups = df[group_col].unique()
    sampled_groups = rng.choice(groups, size=len(groups), replace=True)

    # Build the group -> row-positions map in a single pass over the frame.
    # Masking the whole column once per sampled group costs
    # (number of groups) x (number of rows) comparisons per replicate.
    rows_by_group = df.groupby(group_col, sort=False).indices
    index_labels = df.index.to_numpy()
    no_rows = np.empty(0, dtype=np.intp)

    # Collect rows for sampled groups (with multiplicity). A sampled value
    # with no entry in the map (e.g. a missing group key) contributes no rows.
    parts = [index_labels[rows_by_group.get(g, no_rows)] for g in sampled_groups]
    if not parts:
        # np.concatenate rejects an empty list; an empty frame yields no rows.
        return index_labels[:0]
    return np.concatenate(parts)

def bootstrap_cluster(df, group_col="person_id", B=500, seed=42):
    """Run ``B`` cluster-bootstrap replicates and collect metrics per replicate.

    Parameters
    ----------
    df : pandas.DataFrame
        Scored table with ``label`` and ``p_hat`` columns plus ``group_col``.
    group_col : str, default "person_id"
        Column defining the resampling clusters.
    B : int, default 500
        Number of bootstrap replicates.
    seed : int, default 42
        Seed for the random generator shared by all replicates.

    Returns
    -------
    pandas.DataFrame
        One row per replicate: the ``compute_metrics`` keys plus
        ``prec_top_*``, ``captured_top_*`` and ``k_top_*`` columns for the
        top 1%, 5%, 10% and 20%.
    """
    rng = np.random.default_rng(seed)
    top_fracs = (0.01, 0.05, 0.10, 0.20)

    draws = []
    for rep in range(B):
        sample_idx = cluster_bootstrap_indices(df, group_col, rng)
        sample = df.loc[sample_idx, ["label", "p_hat"]]
        y_b = sample["label"].to_numpy()
        p_b = sample["p_hat"].to_numpy()

        metrics = compute_metrics(y_b, p_b)

        # Top-K targeting metrics, suffixed with the percentage.
        for frac in top_fracs:
            pct = int(frac * 100)
            prec, captured, k = precision_at_top_frac(y_b, p_b, frac)
            metrics.update(
                {
                    f"prec_top_{pct}": prec,
                    f"captured_top_{pct}": captured,
                    f"k_top_{pct}": k,
                }
            )

        draws.append(metrics)

        # Progress line every 50 replicates (1-based count).
        done = rep + 1
        if done % 50 == 0:
            print(f"Bootstrap {done}/{B} complete")

    return pd.DataFrame(draws)

# Replicate count: 500 is fast but useful; later you can increase to 1000.
B = 500
boot_df = bootstrap_cluster(df=scored, group_col="person_id", B=B, seed=42)
boot_df.head()


Bootstrap 50/500 complete
Bootstrap 100/500 complete
Bootstrap 150/500 complete
Bootstrap 200/500 complete
Bootstrap 250/500 complete
Bootstrap 300/500 complete
Bootstrap 350/500 complete
Bootstrap 400/500 complete
Bootstrap 450/500 complete
Bootstrap 500/500 complete


Unnamed: 0,prevalence,pr_auc,roc_auc,brier,logloss,mean_p,median_p,prec_top_1,captured_top_1,k_top_1,prec_top_5,captured_top_5,k_top_5,prec_top_10,captured_top_10,k_top_10,prec_top_20,captured_top_20,k_top_20
0,0.109698,0.205065,0.657946,0.093957,0.328779,0.110773,0.097907,0.325,65,200,0.289421,290,1002,0.250374,502,2005,0.205934,826,4011
1,0.105495,0.201604,0.659651,0.09075,0.320034,0.112248,0.09829,0.356436,72,202,0.283168,286,1010,0.249505,504,2020,0.205693,831,4040
2,0.109479,0.213595,0.661901,0.093324,0.326743,0.111903,0.096937,0.365,73,200,0.323705,325,1004,0.257968,518,2008,0.205875,827,4017
3,0.109775,0.218883,0.668131,0.093305,0.326445,0.11205,0.098396,0.412935,83,201,0.317097,319,1006,0.25236,508,2013,0.213115,858,4026
4,0.104594,0.209806,0.669531,0.089702,0.316644,0.112263,0.098298,0.358209,72,201,0.282306,284,1006,0.242921,489,2013,0.203129,818,4027


### Convert bootstrap draws into 95% intervals

In [7]:
def ci_from_boot(series, alpha=0.05):
    """Percentile interval from a series of bootstrap draws.

    Parameters
    ----------
    series : pandas.Series
        Bootstrap draws of one metric.
    alpha : float, default 0.05
        Total tail mass; the bounds are the ``alpha/2`` and ``1 - alpha/2``
        quantiles.

    Returns
    -------
    tuple of (float, float)
        ``(lower, upper)`` bounds.
    """
    tail = alpha / 2
    bounds = [series.quantile(q) for q in (tail, 1 - tail)]
    return float(bounds[0]), float(bounds[1])

# Point estimates keyed by the same column names used in boot_df, so every
# metric is looked up by name. An unknown metric name raises KeyError
# rather than silently picking up another metric's value.
point_estimates = {
    name: point[name] for name in ("pr_auc", "roc_auc", "brier", "logloss")
}
for frac in (0.01, 0.05, 0.10, 0.20):
    top_prec, top_captured, _ = precision_at_top_frac(y0, p0, frac)
    pct = int(frac * 100)  # same suffix rule as the bootstrap columns
    point_estimates[f"prec_top_{pct}"] = top_prec
    point_estimates[f"captured_top_{pct}"] = top_captured

# Metrics to summarise, in report order.
summary_cols = [
    "pr_auc", "roc_auc", "brier", "logloss",
    "prec_top_1", "prec_top_5", "prec_top_10", "prec_top_20",
    "captured_top_1", "captured_top_5", "captured_top_10", "captured_top_20",
]

# One row per metric: full-sample point estimate plus the 95% percentile
# interval taken from the bootstrap draws.
summary_rows = []
for col in summary_cols:
    lo, hi = ci_from_boot(boot_df[col])
    summary_rows.append({
        "metric": col,
        "point_estimate": float(point_estimates[col]),
        "ci95_low": lo,
        "ci95_high": hi
    })

ci_df = pd.DataFrame(summary_rows)
ci_df


Unnamed: 0,metric,point_estimate,ci95_low,ci95_high
0,pr_auc,0.207833,0.191132,0.226985
1,roc_auc,0.666614,0.653555,0.680908
2,brier,0.091348,0.087999,0.094434
3,logloss,0.321113,0.311778,0.329817
4,prec_top_1,0.363184,0.297732,0.446174
5,prec_top_5,0.301887,0.26775,0.337954
6,prec_top_10,0.249132,0.226828,0.273963
7,prec_top_20,0.205707,0.190381,0.221538
8,captured_top_1,73.0,60.0,89.525
9,captured_top_5,304.0,269.475,341.525


### Save artifacts for Day 8

In [8]:
# Day 8 outputs go under <project_root>/Day-8/reports (created if missing).
reports_dir = project_root.joinpath("Day-8", "reports")
reports_dir.mkdir(parents=True, exist_ok=True)

boot_path = reports_dir / "DAY08_cluster_bootstrap_draws.csv"
ci_path = reports_dir / "DAY08_cluster_bootstrap_ci95.csv"

# Write both tables without the row index.
for frame, target in ((boot_df, boot_path), (ci_df, ci_path)):
    frame.to_csv(target, index=False)

print("Saved bootstrap draws:", boot_path)
print("Saved CI summary:", ci_path)


Saved bootstrap draws: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-8\reports\DAY08_cluster_bootstrap_draws.csv
Saved CI summary: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-8\reports\DAY08_cluster_bootstrap_ci95.csv


### Write DAY08.md (simple reporting)

In [9]:
# Report lines; they are joined with newlines when the file is written.
md = [
    "# Day 8 — Patient-level bootstrap uncertainty\n",
    "Bootstrap type: cluster bootstrap by person_id\n",
    f"Bootstrap replications B = {B}\n",
    "\n## Point estimates (test set, calibrated p_hat)\n",
    f"- Prevalence: {point['prevalence']:.6f}\n",
    f"- PR-AUC: {point['pr_auc']:.6f}\n",
    f"- ROC-AUC: {point['roc_auc']:.6f}\n",
    f"- Brier: {point['brier']:.6f}\n",
    f"- Log loss: {point['logloss']:.6f}\n",
    "\n## 95% bootstrap CIs (clustered by person_id)\n",
    # The table is aligned with spaces, which Markdown collapses in normal
    # text; a fenced block keeps the columns readable when rendered.
    "```text",
    ci_df.to_string(index=False),
    "```",
]

md_path = reports_dir / "DAY08.md"
md_path.write_text("\n".join(md), encoding="utf-8")
print("Saved:", md_path)


Saved: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-8\reports\DAY08.md
