### Connect to the correct DuckDB (robust path finder)

In [1]:
from pathlib import Path
import duckdb
import pandas as pd
import json

def find_repo_root(start=None, marker="Day-1"):
    """Walk upward from ``start`` until a directory containing ``marker`` is found.

    Parameters
    ----------
    start : str | os.PathLike | None
        Directory to begin the search from. ``None`` (or an empty value)
        means the current working directory.
    marker : str
        Name of the file or folder whose presence identifies the repo root.
        Defaults to ``"Day-1"``.

    Returns
    -------
    pathlib.Path
        The resolved path of the nearest directory (``start`` itself or one
        of its ancestors) that contains ``marker``.

    Raises
    ------
    FileNotFoundError
        If the filesystem root is reached without finding ``marker``.
    """
    # Path(...) lets callers pass a plain string as well as a Path object.
    origin = (Path(start) if start else Path.cwd()).resolve()

    # Check the starting directory first, then each ancestor up to the root.
    for candidate in (origin, *origin.parents):
        if (candidate / marker).exists():
            return candidate

    raise FileNotFoundError(f"Could not find repo root (expected a {marker} folder).")

# Locate the repository once; every path below is built relative to it.
repo = find_repo_root()
print("Repo root:", repo)

# Candidate warehouse files in priority order: Day-13 first, Day-11 as the fallback.
db_candidates = [
    repo / "Day-13" / "data" / "warehouse" / "day13_noshow.duckdb",
    repo / "Day-11" / "data" / "warehouse" / "day11_noshow.duckdb",
]

# Take the first candidate that exists on disk (None if neither does).
db_path = None
for candidate in db_candidates:
    if candidate.exists():
        db_path = candidate
        break
print("DB path:", db_path)

if db_path is None:
    raise FileNotFoundError("Could not find Day-13 or Day-11 noshow DuckDB.")

# Open the database and list its tables as a quick sanity check.
con = duckdb.connect(str(db_path))
table_names = [row[0] for row in con.execute("SHOW TABLES").fetchall()]
print("Tables:", table_names)


Repo root: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science
DB path: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-11\data\warehouse\day11_noshow.duckdb
Tables: ['bronze_appointments', 'gold_appointments_base', 'gold_appointments_features_v1', 'silver_appointments']


### Confirm the gold table + its columns (so we don’t query missing fields)

In [2]:
gold_table = "gold_appointments_features_v1"

# Column metadata for the gold table; later cells consult `cols` before querying a field.
cols = con.execute(f"PRAGMA table_info('{gold_table}')").df()
display(cols[["name", "type"]])

# Report the table's width and height.
n_rows = con.execute(f"SELECT COUNT(*) FROM {gold_table}").fetchone()[0]
print("n_cols:", len(cols), "n_rows:", n_rows)


Unnamed: 0,name,type
0,appointment_id,BIGINT
1,person_id,BIGINT
2,label,INTEGER
3,sms_received,INTEGER
4,age,DOUBLE
5,gender,VARCHAR
6,neighbourhood,VARCHAR
7,scholarship,INTEGER
8,hipertension,INTEGER
9,diabetes,INTEGER


n_cols: 25 n_rows: 110516


### Patient-level split (no person overlap) and store it in DuckDB

This is the “leakage-safe” split: each patient (`person_id`) appears in exactly one of train/valid/test, so no patient's appointments are shared across splits.

In [3]:
# Deterministic split keyed on person_id: `u` is derived only from a hash of the
# patient id, so every appointment of a given patient receives the same split label.
create_split_table_sql = f"""
CREATE OR REPLACE TABLE split_patient_v1 AS
SELECT
  appointment_id,
  person_id,
  CASE
    WHEN u < 0.70 THEN 'train'
    WHEN u < 0.85 THEN 'valid'
    ELSE 'test'
  END AS split
FROM (
  SELECT
    appointment_id,
    person_id,
    (abs(hash(CAST(person_id AS VARCHAR))) % 1000000) / 1000000.0 AS u
  FROM {gold_table}
);
"""

# Convenience view: the gold rows with their split label attached.
create_split_view_sql = f"""
CREATE OR REPLACE VIEW gold_appointments_features_v1_patient_split AS
SELECT g.*, s.split
FROM {gold_table} g
JOIN split_patient_v1 s
USING (appointment_id, person_id);
"""

# The table must exist before the view that joins against it.
for statement in (create_split_table_sql, create_split_view_sql):
    con.execute(statement)

print("Created: split_patient_v1 (table) and gold_appointments_features_v1_patient_split (view)")


Created: split_patient_v1 (table) and gold_appointments_features_v1_patient_split (view)


### Split quality checks (overlap, counts, prevalence)

In [4]:
# 1) patient overlap should be 0
# Count patients whose appointments landed in more than one split.
overlap = con.execute("""
SELECT COUNT(*) AS n_people_in_multiple_splits
FROM (
  SELECT person_id, COUNT(DISTINCT split) AS k
  FROM split_patient_v1
  GROUP BY person_id
) t
WHERE k > 1;
""").df()
display(overlap)

# Fail loudly instead of relying on someone eyeballing the table above:
# a non-zero count means the split leaks patients between train/valid/test.
n_overlap = int(overlap["n_people_in_multiple_splits"].iloc[0])
assert n_overlap == 0, (
    f"{n_overlap} patient(s) appear in more than one split; the split is not leakage-safe."
)

# 2) rows / patients / prevalence per split
# (plain string: the query has no placeholders, so no f-prefix is needed)
by_split = con.execute("""
SELECT
  split,
  COUNT(*) AS rows,
  COUNT(DISTINCT person_id) AS people,
  AVG(label) AS no_show_prevalence
FROM gold_appointments_features_v1_patient_split
GROUP BY split
ORDER BY split;
""").df()
display(by_split)


Unnamed: 0,n_people_in_multiple_splits
0,0


Unnamed: 0,split,rows,people,no_show_prevalence
0,test,16499,9410,0.194921
1,train,77368,43579,0.203017
2,valid,16649,9307,0.203496


### Time leakage guardrail (verify “prior” feature is really prior)

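Your `prior_appt_count` feature must never look into the future. A quick sanity check: it should never be negative, and its minimum should be 0 (each patient's earliest appointment should have no prior ones). Note that this is a necessary condition, not a proof: it catches obviously broken counts, but it cannot by itself show that only earlier appointments were counted.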

In [5]:
# Range check on prior_appt_count, run only when the column exists in the gold table.
if "prior_appt_count" in cols["name"].tolist():
    # Query the same table that `cols` was read from (gold_table), so the
    # column-existence test and the query cannot drift apart.
    chk = con.execute(f"""
    SELECT
      MIN(prior_appt_count) AS min_prior,
      MAX(prior_appt_count) AS max_prior,
      AVG(prior_appt_count) AS mean_prior
    FROM {gold_table};
    """).df()
    display(chk)

    # A count of prior appointments can never be negative. MIN is NULL/NaN when
    # the table is empty or the column is all-NULL, so that case is tolerated.
    min_prior = chk["min_prior"].iloc[0]
    assert pd.isna(min_prior) or min_prior >= 0, (
        f"prior_appt_count has a negative minimum ({min_prior})."
    )
else:
    print("No prior_appt_count column found — skipping this check.")


Unnamed: 0,min_prior,max_prior,mean_prior
0,0,87,1.270223


### Save a small split artifact (so Day 15+ can reuse it)

In [6]:
# Write a small JSON summary of the split so later notebooks can locate and reuse it.
out_dir = repo / "Day-14" / "artifacts"
out_dir.mkdir(parents=True, exist_ok=True)

summary = {
    # Absolute path on the machine that ran this notebook (kept for existing readers).
    "db_path": str(db_path),
    # Repo-relative, POSIX-style path so the artifact also works from another
    # checkout or OS; db_path is always built from `repo`, so relative_to cannot fail.
    "db_path_relative": db_path.relative_to(repo).as_posix(),
    "gold_table": gold_table,
    "split_table": "split_patient_v1",
    "split_view": "gold_appointments_features_v1_patient_split",
    "split_counts": by_split.to_dict(orient="records"),
    "patient_overlap_count": int(overlap["n_people_in_multiple_splits"].iloc[0]),
    "split_rule": "patient-hash: u<0.70 train, u<0.85 valid, else test",
}

path = out_dir / "day14_split_patient_v1.json"
path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
print("Wrote:", path)


Wrote: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-14\artifacts\day14_split_patient_v1.json
