### Connect to DuckDB + ingest Bronze (raw)

In [1]:
from pathlib import Path
import duckdb

repo_root = Path("..").resolve().parent  # from Day-11/notebooks -> repo root
day11_root = repo_root / "Day-11"

raw_csv = day11_root / "data" / "raw" / "appointments.csv"
db_path = day11_root / "data" / "warehouse" / "day11_noshow.duckdb"
db_path.parent.mkdir(parents=True, exist_ok=True)

print("Repo root   :", repo_root)
print("Day-11 root :", day11_root)
print("CSV         :", raw_csv)
print("DB          :", db_path)
print("CSV exists? :", raw_csv.exists())

con = duckdb.connect(str(db_path))

con.execute("""
CREATE OR REPLACE TABLE bronze_appointments AS
SELECT *, current_timestamp AS ingest_ts
FROM read_csv_auto(?, ALL_VARCHAR=1)
""", [str(raw_csv)])

print("bronze rows:", con.execute("SELECT COUNT(*) FROM bronze_appointments").fetchone()[0])
print("bronze cols:", [r[1] for r in con.execute("PRAGMA table_info('bronze_appointments')").fetchall()])


Repo root   : C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science
Day-11 root : C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-11
CSV         : C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-11\data\raw\appointments.csv
DB          : C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-11\data\warehouse\day11_noshow.duckdb
CSV exists? : True
bronze rows: 110527
bronze cols: ['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay', 'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show', 'ingest_ts']


### Pull Bronze into pandas and standardize column names

In [2]:
import pandas as pd
import numpy as np

df = con.execute("SELECT * FROM bronze_appointments").df()

df.columns = (
    pd.Index(df.columns)
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("-", "_", regex=False)
)

print("standardized cols:", df.columns.tolist())


standardized cols: ['patientid', 'appointmentid', 'gender', 'scheduledday', 'appointmentday', 'age', 'neighbourhood', 'scholarship', 'hipertension', 'diabetes', 'alcoholism', 'handcap', 'sms_received', 'no_show', 'ingest_ts']


### Canonical rename + type coercions + label + DATE logic

In [3]:
# Canonicalize column names to what we will use in the whole 10-day project
rename_map = {
    "patientid": "person_id",
    "appointmentid": "appointment_id",
}
df = df.rename(columns={k:v for k,v in rename_map.items() if k in df.columns})

# Required columns check (fail fast)
required = ["person_id","appointment_id","gender","scheduledday","appointmentday",
            "age","neighbourhood","scholarship","hipertension","diabetes",
            "alcoholism","handcap","sms_received","no_show"]
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}")

# Coerce IDs
df["person_id"] = pd.to_numeric(df["person_id"], errors="coerce")
df["appointment_id"] = pd.to_numeric(df["appointment_id"], errors="coerce")

# Parse timestamps
df["scheduled_ts"]   = pd.to_datetime(df["scheduledday"], errors="coerce")
df["appointment_ts"] = pd.to_datetime(df["appointmentday"], errors="coerce")

# DATE (calendar day) version — this fixes same-day timestamp vs midnight issues
df["scheduled_date"]   = df["scheduled_ts"].dt.date
df["appointment_date"] = df["appointment_ts"].dt.date

# Lead time in days
df["lead_time_days"] = (
    pd.to_datetime(df["appointment_date"]) - pd.to_datetime(df["scheduled_date"])
).dt.days

# Flag true date inversions (should be near 0 after DATE logic)
df["date_inversion_flag"] = (df["lead_time_days"] < 0).astype(int)

# Coerce other types
df["age"] = pd.to_numeric(df["age"], errors="coerce")
df["sms_received"] = pd.to_numeric(df["sms_received"], errors="coerce")

# Label: 1 = no-show (no_show == "Yes"), 0 = show
df["label"] = (df["no_show"].astype(str).str.lower() == "yes").astype(int)

# Handle bad ages (6 rows in your earlier run): filter them out
before = len(df)
df = df[(df["age"].isna()) | ((df["age"] >= 0) & (df["age"] <= 110))].copy()
after = len(df)

print("Rows removed for age out of range:", before - after)
print("DATE-level inversions:", int(df["date_inversion_flag"].sum()))
print("lead_time_days summary:")
print(df["lead_time_days"].describe())
print("no-show prevalence:", df["label"].mean())


Rows removed for age out of range: 6
DATE-level inversions: 5
lead_time_days summary:
count    110521.000000
mean         10.183721
std          15.255082
min          -6.000000
25%           0.000000
50%           4.000000
75%          15.000000
max         179.000000
Name: lead_time_days, dtype: float64
no-show prevalence: 0.2019163778829363


### Silver table (cleaned, standardized)

In [4]:
con.register("silver_df", df)

con.execute("""
CREATE OR REPLACE TABLE silver_appointments AS
SELECT * FROM silver_df
""")

print("silver rows:", con.execute("SELECT COUNT(*) FROM silver_appointments").fetchone()[0])


silver rows: 110521


### Gold base table (canonical modeling table for the project)

In [5]:
con.execute("""
CREATE OR REPLACE TABLE gold_appointments_base AS
SELECT
  CAST(appointment_id AS BIGINT) AS appointment_id,
  CAST(person_id AS BIGINT) AS person_id,
  CAST(scheduledday AS TIMESTAMP) AS scheduled_ts,
  CAST(appointmentday AS TIMESTAMP) AS appointment_ts,
  CAST(scheduled_date AS DATE) AS scheduled_date,
  CAST(appointment_date AS DATE) AS appointment_date,
  CAST(lead_time_days AS INTEGER) AS lead_time_days,
  CAST(date_inversion_flag AS INTEGER) AS date_inversion_flag,
  CAST(age AS DOUBLE) AS age,
  CAST(sms_received AS INTEGER) AS sms_received,
  CAST(label AS INTEGER) AS label
FROM silver_appointments
WHERE appointment_id IS NOT NULL
""")

print("gold rows:", con.execute("SELECT COUNT(*) FROM gold_appointments_base").fetchone()[0])
print("no-show prevalence:", con.execute("SELECT AVG(CAST(label AS DOUBLE)) FROM gold_appointments_base").fetchone()[0])


gold rows: 110521
no-show prevalence: 0.2019163778829363


### Day 11 Data Quality checks (must be clean)

In [6]:
bronze = con.execute("SELECT COUNT(*) FROM bronze_appointments").fetchone()[0]
silver = con.execute("SELECT COUNT(*) FROM silver_appointments").fetchone()[0]
gold   = con.execute("SELECT COUNT(*) FROM gold_appointments_base").fetchone()[0]

dup = con.execute("""
SELECT COUNT(*) - COUNT(DISTINCT appointment_id)
FROM gold_appointments_base
""").fetchone()[0]

label_nulls = con.execute("""
SELECT SUM(CASE WHEN label IS NULL THEN 1 ELSE 0 END)
FROM gold_appointments_base
""").fetchone()[0]

date_inversions = con.execute("""
SELECT SUM(date_inversion_flag)
FROM gold_appointments_base
""").fetchone()[0]

age_bad = con.execute("""
SELECT COUNT(*)
FROM gold_appointments_base
WHERE age IS NOT NULL AND (age < 0 OR age > 110)
""").fetchone()[0]

print("bronze:", bronze)
print("silver:", silver)
print("gold  :", gold)
print("duplicates appointment_id:", dup)
print("label nulls:", label_nulls)
print("DATE-level inversions:", date_inversions)
print("age out of range:", age_bad)


bronze: 110527
silver: 110521
gold  : 110521
duplicates appointment_id: 0
label nulls: 0
DATE-level inversions: 5
age out of range: 0


### First, inspect what columns are actually in gold_appointments_base

In [8]:
import duckdb
from pathlib import Path

db_path = Path("..").resolve() / "data" / "warehouse" / "day11_noshow.duckdb"
con = duckdb.connect(str(db_path))

cols = con.execute("PRAGMA table_info('gold_appointments_base')").fetchall()
print([c[1] for c in cols])


['appointment_id', 'person_id', 'scheduled_ts', 'appointment_ts', 'scheduled_date', 'appointment_date', 'lead_time_days', 'date_inversion_flag', 'age', 'sms_received', 'label']


If you don’t see gender and neighbourhood in that printed list, then we rebuild gold properly 

### rebuild gold correctly from silver

In [9]:
con.execute("""
CREATE OR REPLACE TABLE gold_appointments_base AS
SELECT
  CAST(appointment_id AS BIGINT) AS appointment_id,
  CAST(person_id AS BIGINT) AS person_id,

  CAST(scheduled_ts AS TIMESTAMP) AS scheduled_ts,
  CAST(appointment_ts AS TIMESTAMP) AS appointment_ts,

  CAST(scheduled_date AS DATE) AS scheduled_date,
  CAST(appointment_date AS DATE) AS appointment_date,

  CAST(lead_time_days AS INTEGER) AS lead_time_days,
  CAST(CASE WHEN lead_time_days < 0 THEN 1 ELSE 0 END AS INTEGER) AS date_inversion_flag,

  CAST(age AS DOUBLE) AS age,
  CAST(gender AS VARCHAR) AS gender,
  CAST(neighbourhood AS VARCHAR) AS neighbourhood,

  CAST(scholarship AS INTEGER) AS scholarship,
  CAST(hipertension AS INTEGER) AS hipertension,
  CAST(diabetes AS INTEGER) AS diabetes,
  CAST(alcoholism AS INTEGER) AS alcoholism,
  CAST(handcap AS INTEGER) AS handcap,

  CAST(sms_received AS INTEGER) AS sms_received,
  CAST(label AS INTEGER) AS label
FROM silver_appointments
WHERE appointment_id IS NOT NULL
""")

print("gold cols now:",
      [r[1] for r in con.execute("PRAGMA table_info('gold_appointments_base')").fetchall()])


gold cols now: ['appointment_id', 'person_id', 'scheduled_ts', 'appointment_ts', 'scheduled_date', 'appointment_date', 'lead_time_days', 'date_inversion_flag', 'age', 'gender', 'neighbourhood', 'scholarship', 'hipertension', 'diabetes', 'alcoholism', 'handcap', 'sms_received', 'label']


### Inspect the 5 DATE-level inversions

In [10]:
inv = con.execute("""
SELECT *
FROM gold_appointments_base
WHERE date_inversion_flag = 1
ORDER BY lead_time_days ASC
""").df()

inv


Unnamed: 0,appointment_id,person_id,scheduled_ts,appointment_ts,scheduled_date,appointment_date,lead_time_days,date_inversion_flag,age,gender,neighbourhood,scholarship,hipertension,diabetes,alcoholism,handcap,sms_received,label
0,5686628,998231581612122,2016-05-11 09:49:20,2016-05-04 20:00:00,2016-05-11,2016-05-05,-6,1,81.0,F,SANTO ANTÔNIO,0,0,0,0,0,0,1
1,5679978,7839272661752,2016-05-10 06:51:53,2016-05-08 20:00:00,2016-05-10,2016-05-09,-1,1,38.0,M,RESISTÊNCIA,0,0,0,0,1,0,1
2,5715660,7896293967868,2016-05-18 10:50:41,2016-05-16 20:00:00,2016-05-18,2016-05-17,-1,1,19.0,F,SANTO ANTÔNIO,0,0,0,0,1,0,1
3,5664962,24252258389979,2016-05-05 09:43:58,2016-05-03 20:00:00,2016-05-05,2016-05-04,-1,1,22.0,F,CONSOLAÇÃO,0,0,0,0,0,0,1
4,5655637,3787481966821,2016-05-04 02:50:57,2016-05-02 20:00:00,2016-05-04,2016-05-03,-1,1,7.0,M,TABUAZEIRO,0,0,0,0,0,0,1


### Drop inversions + rebuild tables + re-check

In [11]:
con.execute("""
CREATE OR REPLACE TABLE silver_appointments AS
SELECT *
FROM silver_appointments
WHERE lead_time_days IS NULL OR lead_time_days >= 0
""")

con.execute("""
CREATE OR REPLACE TABLE gold_appointments_base AS
SELECT
  CAST(appointment_id AS BIGINT) AS appointment_id,
  CAST(person_id AS BIGINT) AS person_id,
  CAST(scheduled_ts AS TIMESTAMP) AS scheduled_ts,
  CAST(appointment_ts AS TIMESTAMP) AS appointment_ts,
  CAST(scheduled_date AS DATE) AS scheduled_date,
  CAST(appointment_date AS DATE) AS appointment_date,
  CAST(lead_time_days AS INTEGER) AS lead_time_days,
  CAST(CASE WHEN lead_time_days < 0 THEN 1 ELSE 0 END AS INTEGER) AS date_inversion_flag,
  CAST(age AS DOUBLE) AS age,
  CAST(gender AS VARCHAR) AS gender,
  CAST(neighbourhood AS VARCHAR) AS neighbourhood,
  CAST(scholarship AS INTEGER) AS scholarship,
  CAST(hipertension AS INTEGER) AS hipertension,
  CAST(diabetes AS INTEGER) AS diabetes,
  CAST(alcoholism AS INTEGER) AS alcoholism,
  CAST(handcap AS INTEGER) AS handcap,
  CAST(sms_received AS INTEGER) AS sms_received,
  CAST(label AS INTEGER) AS label
FROM silver_appointments
WHERE appointment_id IS NOT NULL
""")

bronze = con.execute("SELECT COUNT(*) FROM bronze_appointments").fetchone()[0]
silver = con.execute("SELECT COUNT(*) FROM silver_appointments").fetchone()[0]
gold   = con.execute("SELECT COUNT(*) FROM gold_appointments_base").fetchone()[0]
inv2   = con.execute("SELECT SUM(date_inversion_flag) FROM gold_appointments_base").fetchone()[0]

print("bronze:", bronze)
print("silver:", silver)
print("gold  :", gold)
print("DATE-level inversions:", inv2)


bronze: 110527
silver: 110516
gold  : 110516
DATE-level inversions: 0


### Write the “feature contract” JSON (what we expect at scoring time)

In [12]:
import json

contract = {
  "id_cols": ["appointment_id", "person_id"],
  "label_col": "label",
  "treatment_col": "sms_received",
  "feature_cols": [
    "age",
    "gender",
    "neighbourhood",
    "scholarship",
    "hipertension",
    "diabetes",
    "alcoholism",
    "handcap",
    "lead_time_days"
  ],
  "numeric_cols": ["age", "lead_time_days"],
  "categorical_cols": ["gender", "neighbourhood"],
  "binary_cols": ["scholarship","hipertension","diabetes","alcoholism","handcap"]
}

contract_path = day11_root / "artifacts" / "noshow_feature_contract.json"
contract_path.parent.mkdir(parents=True, exist_ok=True)
contract_path.write_text(json.dumps(contract, indent=2), encoding="utf-8")

print("Wrote contract:", contract_path)


Wrote contract: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-11\artifacts\noshow_feature_contract.json


### Create a smoke input template (deployment mindset)

In [13]:
smoke = pd.DataFrame({
    "appointment_id": [1, 2, 3],
    "person_id": [101, 102, 103],
    "age": [25, 60, 41],
    "gender": ["F", "M", "F"],
    "neighbourhood": ["JARDIM DA PENHA", "CENTRO", "SANTA MARTHA"],
    "scholarship": [0, 1, 0],
    "hipertension": [0, 1, 0],
    "diabetes": [0, 0, 1],
    "alcoholism": [0, 0, 0],
    "handcap": [0, 0, 0],
    "lead_time_days": [10, 3, 0]
})

smoke_path = day11_root / "data" / "input_example.csv"
smoke.to_csv(smoke_path, index=False)

print("Wrote smoke input:", smoke_path)
smoke


Wrote smoke input: C:\Users\sarfo\Dropbox\Courses\Data Science\30-days-of-data-science\Day-11\data\input_example.csv


Unnamed: 0,appointment_id,person_id,age,gender,neighbourhood,scholarship,hipertension,diabetes,alcoholism,handcap,lead_time_days
0,1,101,25,F,JARDIM DA PENHA,0,0,0,0,0,10
1,2,102,60,M,CENTRO,1,1,0,0,0,3
2,3,103,41,F,SANTA MARTHA,0,0,1,0,0,0


### Close DB

In [14]:
con.close()