### Connect to DuckDB (Day-11 warehouse)

Assumption: notebook is in Day-12/notebooks/

In [2]:
import json
from pathlib import Path

import duckdb
import pandas as pd

def find_repo_root(start: Path | None = None) -> Path:
    """
    Walk upward until we find the 30-days repo root by checking for Day-1 folder.
    """
    p = (start or Path.cwd()).resolve()
    while True:
        if (p / "Day-1").exists() and (p / "Day-11").exists():
            return p
        if p == p.parent:
            raise FileNotFoundError("Could not locate repo root. Expected folders: Day-1 and Day-11.")
        p = p.parent

# Resolve the Day-11 warehouse relative to the repo root so the notebook does
# not depend on a machine-specific absolute path.
repo_root = find_repo_root()
db_path = repo_root / "Day-11" / "data" / "warehouse" / "day11_noshow.duckdb"

print("CWD      :", Path.cwd())
print("Repo root :", repo_root)
print("DB path  :", db_path)
print("DB exists:", db_path.exists())

# duckdb.connect() creates a new, empty database when the file is missing.
# Fail loudly instead, so a wrong path is not mistaken for an empty warehouse.
if not db_path.is_file():
    raise FileNotFoundError(
        f"DuckDB warehouse not found: {db_path}. Build the Day-11 warehouse first."
    )

# This notebook only reads from the warehouse, so open it read-only to rule
# out accidental writes to the Day-11 artefact.
con = duckdb.connect(str(db_path), read_only=True)
print("Connected ✅")
print("Tables:", [t[0] for t in con.execute("SHOW TABLES").fetchall()])


CWD      : C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-12\notebooks
Repo root : C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science
DB path  : C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-11\data\warehouse\day11_noshow.duckdb
DB exists: True
Connected ✅
Tables: ['bronze_appointments', 'gold_appointments_base', 'silver_appointments']


### The “expert discovery” checklist (what you produce today)
### 1) Scoring-time vs leakage (write this down)

Decision point for this project (we will use going forward):

We decide whether to send SMS at scheduling time, so allowed predictors are things known at scheduling.

sms_received is NOT a feature in the risk model (it’s the treatment / policy lever).

Anything that happens after the appointment cannot be used. (We don’t have those fields anyway, but we enforce this mindset.)

Create a tiny “allowed columns” list for the risk model:

#### Define allowed features for risk model (v0)

In [4]:
# Column roles for the v0 risk model.
id_cols = ["appointment_id", "person_id"]  # identifiers, not predictors
label_col = "label"                        # outcome column
treatment_col = "sms_received"             # policy lever, deliberately NOT a feature

# Predictors the risk model is allowed to use (things known at scheduling time).
allowed_features_v0 = [
    "age",
    "gender",
    "neighbourhood",
    "scholarship",
    "hipertension",
    "diabetes",
    "alcoholism",
    "handcap",
    "lead_time_days",
]

allowed_features_v0


['age',
 'gender',
 'neighbourhood',
 'scholarship',
 'hipertension',
 'diabetes',
 'alcoholism',
 'handcap',
 'lead_time_days']

### 2) Missingness profile (expert version)

Even if missingness is low, we document it and test whether missingness correlates with the outcome.

#### Missingness table

In [5]:
# Pull the columns used throughout this notebook from the gold table.
df = con.execute("""
SELECT appointment_id, person_id, lead_time_days, age, gender, neighbourhood,
       scholarship, hipertension, diabetes, alcoholism, handcap,
       sms_received, label
FROM gold_appointments_base
""").df()

# Build the null mask once and derive both statistics from it.
# (`pd` comes from the import cell at the top of the notebook.)
null_mask = df.isna()
missing = pd.DataFrame({
    "missing_count": null_mask.sum(),
    "missing_rate": null_mask.mean(),
}).sort_values("missing_rate", ascending=False)

missing


Unnamed: 0,missing_count,missing_rate
appointment_id,0,0.0
person_id,0,0.0
lead_time_days,0,0.0
age,0,0.0
gender,0,0.0
neighbourhood,0,0.0
scholarship,0,0.0
hipertension,0,0.0
diabetes,0,0.0
alcoholism,0,0.0


In [6]:
import pandas as pd

def missingness_label_shift(frame, features, label="label"):
    """
    Compare outcome prevalence between rows where a feature is missing and
    rows where it is observed.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing every column in ``features`` plus ``label``.
    features : list of str
        Columns whose missingness is profiled.
    label : str
        Name of the outcome column whose mean is the prevalence.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns ``column``, ``missing_count``,
        ``missing_rate``, ``prev_if_missing``, ``prev_if_observed`` and
        ``diff_missing_minus_observed``, sorted by ``missing_rate`` descending.
        The sort is stable, so ties keep the order of ``features``.
    """
    overall_prev = float(frame[label].mean())  # computed once, reused for complete columns
    records = []
    for col in features:
        is_missing = frame[col].isna()
        n_missing = int(is_missing.sum())
        if n_missing == 0:
            # Nothing missing: no "missing" group exists, and the observed
            # group is the whole frame.
            prev_missing = float("nan")
            prev_observed = overall_prev
        else:
            prev_missing = float(frame.loc[is_missing, label].mean())
            prev_observed = float(frame.loc[~is_missing, label].mean())
        records.append({
            "column": col,
            "missing_count": n_missing,
            "missing_rate": float(is_missing.mean()),
            "prev_if_missing": prev_missing,
            "prev_if_observed": prev_observed,
            # NaN propagates, so this stays NaN when there is no missing group.
            "diff_missing_minus_observed": prev_missing - prev_observed,
        })

    ordered_columns = [
        "column", "missing_count", "missing_rate",
        "prev_if_missing", "prev_if_observed", "diff_missing_minus_observed",
    ]
    return (
        pd.DataFrame(records, columns=ordered_columns)
        .sort_values("missing_rate", ascending=False, kind="mergesort")
    )


# Reuses `allowed_features_v0` and `label_col` from the "allowed features" cell
# instead of re-declaring the list here, so the two cannot drift apart.
miss_effect = missingness_label_shift(df, allowed_features_v0, label=label_col)

if miss_effect["missing_count"].eq(0).all():
    print("No missingness in selected features ✅ (skipping missingness-outcome shift analysis)")

miss_effect


No missingness in selected features ✅ (skipping missingness-outcome shift analysis)


Unnamed: 0,column,missing_count,missing_rate,prev_if_missing,prev_if_observed,diff_missing_minus_observed
0,age,0,0.0,,0.20188,
1,gender,0,0.0,,0.20188,
2,neighbourhood,0,0.0,,0.20188,
3,scholarship,0,0.0,,0.20188,
4,hipertension,0,0.0,,0.20188,
5,diabetes,0,0.0,,0.20188,
6,alcoholism,0,0.0,,0.20188,
7,handcap,0,0.0,,0.20188,
8,lead_time_days,0,0.0,,0.20188,


Save to Day-12 reports:

In [7]:
# Destination folder for the Day-12 report tables; created if it does not exist.
out_dir = repo_root / "Day-12" / "reports" / "tables"
out_dir.mkdir(parents=True, exist_ok=True)

# (table, file name, write index?) -- `missing` is indexed by column name, so
# its index is kept; `miss_effect` carries the name in a regular column.
for table, file_name, keep_index in (
    (missing, "missingness.csv", True),
    (miss_effect, "missingness_label_shift.csv", False),
):
    table.to_csv(out_dir / file_name, index=keep_index)

print("Wrote:", out_dir)


Wrote: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-12\reports\tables


### 3) Outcome discovery (EDA that matters)

We focus on: prevalence, lead time, age, neighbourhood, and group differences.

#### Basic prevalence slices

In [8]:
def rate_by(col, frame=None):
    """
    No-show rate (mean of ``label``) for each level of ``col``, highest first.

    Parameters
    ----------
    col : str
        Column to group by.
    frame : pandas.DataFrame, optional
        Data to summarise; must contain ``col`` and ``label``. Defaults to the
        notebook-level ``df`` so existing ``rate_by("gender")`` calls still work.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``col`` and ``no_show_rate``, sorted by rate descending.
    """
    source = df if frame is None else frame
    rates = source.groupby(col)["label"].mean().sort_values(ascending=False)
    return rates.reset_index(name="no_show_rate")

# One prevalence table per grouping column, unpacked in the same order.
rate_gender, rate_sms, rate_sch = (
    rate_by(column) for column in ("gender", "sms_received", "scholarship")
)

rate_gender, rate_sms, rate_sch


(  gender  no_show_rate
 0      F      0.203088
 1      M      0.199638,
    sms_received  no_show_rate
 0             1      0.275753
 1             0      0.166949,
    scholarship  no_show_rate
 0            1      0.237363
 1            0      0.198013)

#### Lead time buckets (super important)

In [9]:
# Bucket edges in days. The first edge sits just below 0 so that lead time 0
# gets its own bucket.
lead_bins = [-0.1, 0, 2, 7, 14, 30, 60, 180]
df["lead_bucket"] = pd.cut(df["lead_time_days"], bins=lead_bins, include_lowest=True)

# Values outside the edges become NaN and groupby drops them silently; surface
# that instead of letting rows vanish from the table.
n_unbucketed = int(df["lead_bucket"].isna().sum())
if n_unbucketed:
    print(f"Warning: {n_unbucketed} rows fall outside the lead-time bins and are excluded from lead_tbl")

# observed=False keeps every bucket in the output and states the behaviour
# explicitly, which avoids pandas' FutureWarning about the changing default for
# categorical groupers.
lead_tbl = (df.groupby("lead_bucket", observed=False)["label"].agg(["count","mean"])
              .reset_index()
              .rename(columns={"mean":"no_show_rate"}))

lead_tbl


  lead_tbl = (df.groupby("lead_bucket")["label"].agg(["count","mean"])


Unnamed: 0,lead_bucket,count,no_show_rate
0,"(-0.101, 0.0]",38561,0.046472
1,"(0.0, 2.0]",11938,0.227425
2,"(2.0, 7.0]",20245,0.249691
3,"(7.0, 14.0]",12025,0.304699
4,"(14.0, 30.0]",17370,0.325907
5,"(30.0, 60.0]",8282,0.341463
6,"(60.0, 180.0]",2095,0.284487


#### Neighbourhood (high-cardinality, but we can still inspect top counts)

In [10]:
# Number of most frequent neighbourhoods to inspect.
top_k = 15

# Appointment counts for the most frequent neighbourhoods.
top_nbhd = (
    df["neighbourhood"]
    .value_counts()
    .head(top_k)
    .rename_axis("neighbourhood")
    .reset_index(name="n")
)

# Volume and no-show rate per neighbourhood, keeping the largest by volume.
nbhd_stats = (
    df.groupby("neighbourhood")["label"]
    .agg(n="count", no_show_rate="mean")
    .reset_index()
)
nbhd_rate = nbhd_stats.sort_values("n", ascending=False).head(top_k)

top_nbhd, nbhd_rate


(        neighbourhood     n
 0      JARDIM CAMBURI  7717
 1         MARIA ORTIZ  5805
 2         RESISTÊNCIA  4430
 3     JARDIM DA PENHA  3877
 4             ITARARÉ  3514
 5              CENTRO  3334
 6          TABUAZEIRO  3131
 7        SANTA MARTHA  3131
 8   JESUS DE NAZARETH  2853
 9              BONFIM  2773
 10      SANTO ANTÔNIO  2744
 11        SANTO ANDRÉ  2571
 12          CARATOÍRA  2565
 13             JABOUR  2509
 14          SÃO PEDRO  2448,
         neighbourhood     n  no_show_rate
 38     JARDIM CAMBURI  7717      0.189841
 43        MARIA ORTIZ  5805      0.209991
 59        RESISTÊNCIA  4430      0.204289
 39    JARDIM DA PENHA  3877      0.162755
 36            ITARARÉ  3514      0.262664
 10             CENTRO  3334      0.210858
 78         TABUAZEIRO  3131      0.182689
 66       SANTA MARTHA  3131      0.158416
 40  JESUS DE NAZARETH  2853      0.243954
 8              BONFIM  2773      0.198341
 69      SANTO ANTÔNIO  2744      0.175656
 68        SANTO AN

Save those tables:

In [11]:
# Persist every EDA table built above. `rate_sch` (scholarship) was computed
# with the other prevalence slices but was never written out; it is saved here
# with its siblings.
tables_to_save = {
    "rate_by_gender.csv": rate_gender,
    "rate_by_sms.csv": rate_sms,
    "rate_by_scholarship.csv": rate_sch,
    "rate_by_lead_bucket.csv": lead_tbl,
    "top_neighbourhood_rates.csv": nbhd_rate,
}
for file_name, table in tables_to_save.items():
    table.to_csv(out_dir / file_name, index=False)


### 4) Treatment assignment is not random (confounding check)

We don’t estimate causal effects today, but we prove to ourselves SMS isn’t randomly assigned.

#### Compare covariates by sms_received

In [12]:
# Named-aggregation spec: output column -> (source column, reducer).
sms_profile_spec = {
    "n": ("appointment_id", "count"),
    "no_show_rate": ("label", "mean"),
    "mean_age": ("age", "mean"),
    "mean_lead": ("lead_time_days", "mean"),
    "scholarship_rate": ("scholarship", "mean"),
    "hipertension_rate": ("hipertension", "mean"),
    "diabetes_rate": ("diabetes", "mean"),
}

# One row per sms_received value, so the two groups can be compared side by side.
by_sms = df.groupby("sms_received").agg(**sms_profile_spec).reset_index()

by_sms


Unnamed: 0,sms_received,n,no_show_rate,mean_age,mean_lead,scholarship_rate,hipertension_rate,diabetes_rate
0,0,75035,0.166949,36.884534,6.007716,0.098034,0.198987,0.074459
1,1,35481,0.275753,37.511626,19.016826,0.098785,0.193597,0.066402


Save it:

In [13]:
# Persist the SMS covariate comparison alongside the other Day-12 report tables.
covariates_path = out_dir / "covariates_by_sms.csv"
by_sms.to_csv(covariates_path, index=False)


### 5) Formalize Day-12 data-quality checks (these become “tests” later)

In [14]:
# Data-quality counters for the gold table. Every value is a plain int so the
# dict can be serialised to JSON as-is.
age_values = df["age"]
checks = {
    "rows_gold": len(df),
    "dup_appointment_id": int(df["appointment_id"].duplicated().sum()),
    "label_nulls": int(df["label"].isna().sum()),
    "sms_nulls": int(df["sms_received"].isna().sum()),
    "lead_negative": int((df["lead_time_days"] < 0).sum()),
    # Ages below 0 or above 110 are counted as out of range.
    "age_out_of_range": int(((age_values < 0) | (age_values > 110)).sum()),
}

checks


{'rows_gold': 110516,
 'dup_appointment_id': 0,
 'label_nulls': 0,
 'sms_nulls': 0,
 'lead_negative': 0,
 'age_out_of_range': 0}

Write it to JSON:

In [15]:
import json
# Serialise the checks as indented JSON under Day-12/reports.
dq_path = repo_root.joinpath("Day-12", "reports", "dq_checks.json")
checks_json = json.dumps(checks, indent=2)
dq_path.write_text(checks_json, encoding="utf-8")
print("Wrote:", dq_path)


Wrote: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-12\reports\dq_checks.json


### Close the DB (final cell)

In [16]:
# Release the DuckDB connection opened at the top of the notebook. Any cell
# that queries `con` must run before this one.
con.close()
print("Day 12 discovery complete ✅")


Day 12 discovery complete ✅
