In [1]:
import os
import numpy as np
import pandas as pd

RAW_PATH = "../data/raw/KaggleV2-May-2016.csv"
assert os.path.exists(RAW_PATH), f"Not found: {RAW_PATH}"

df = pd.read_csv(RAW_PATH)
print("shape:", df.shape)
df.head()

shape: (110527, 14)


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [2]:
df["ScheduledDT"] = pd.to_datetime(df["ScheduledDay"].astype(str).str.replace("Z","", regex=False), errors="coerce")
df["AppointmentDT"] = pd.to_datetime(df["AppointmentDay"].astype(str).str.replace("Z","", regex=False), errors="coerce")

df["LeadTime"] = (df["AppointmentDT"] - df["ScheduledDT"]).dt.days

df = df.dropna(subset=["LeadTime"])
df = df[df["LeadTime"] >= 0]
df.loc[df["LeadTime"] > 60, "LeadTime"] = 60

df.loc[df["Age"] < 0, "Age"] = np.nan
df.loc[df["Age"] > 100, "Age"] = 100
df = df.dropna(subset=["Age"])

df["NoShow"] = (df["No-show"] == "Yes").astype(int)

print(df[["ScheduledDT","AppointmentDT", "LeadTime", "Age", "NoShow"]].head())

           ScheduledDT AppointmentDT  LeadTime   Age  NoShow
5  2016-04-27 08:36:51    2016-04-29         1  76.0       0
6  2016-04-27 15:05:12    2016-04-29         1  23.0       1
7  2016-04-27 15:39:58    2016-04-29         1  39.0       1
9  2016-04-27 12:48:25    2016-04-29         1  19.0       0
10 2016-04-27 14:58:11    2016-04-29         1  30.0       0


In [3]:
df["GenderNum"] = df["Gender"].map({"F":0, "M":1})

df["AppointmentWeekday"] = df["AppointmentDT"].dt.day_name()
df = pd.get_dummies(df, columns=["AppointmentWeekday"], drop_first=True)

weekday_cols = [c for c in df.columns if c.startswith("AppointmentWeekday_")]
feature_cols = ["Age", "GenderNum", "LeadTime", "SMS_received"] + weekday_cols

x = df[feature_cols].copy()
y = df["NoShow"].copy()

print("x shape:", x.shape, "| y mean(NoShow rate):", y.mean().round(4))
x.head()


X shape: (71959, 9) | y mean(NoShow rate): 0.2852


Unnamed: 0,Age,GenderNum,LeadTime,SMS_received,AppointmentWeekday_Monday,AppointmentWeekday_Saturday,AppointmentWeekday_Thursday,AppointmentWeekday_Tuesday,AppointmentWeekday_Wednesday
5,76.0,0,1,0,False,False,False,False,False
6,23.0,0,1,0,False,False,False,False,False
7,39.0,0,1,0,False,False,False,False,False
9,19.0,0,1,0,False,False,False,False,False
10,30.0,0,1,0,False,False,False,False,False


In [7]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size = 0.2,
    random_state = 42,
    stratify = y
)

print("x_train:", x_train.shape, "x_test:", x_test.shape)
print("y_train pos rate:", y_train.mean().round(4), " | y_test pos rate:", y_test.mean().round(4))

x_train: (57567, 9) x_test: (14392, 9)
y_train pos rate: 0.2852  | y_test pos rate: 0.2852


In [8]:
os.makedirs("../data/processed", exist_ok=True)

train_df = x_train.copy()
train_df["NoShow"] = y_train.values

test_df = x_test.copy()
test_df["NoShow"] = y_test.values

train_path = "../data/processed/train_model_prep.csv"
test_path = "../data/processed/test_model_prep.csv"

train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

print("Saved:", train_path, "and", test_path)

Saved: ../data/processed/train_model_prep.csv and ../data/processed/test_model_prep.csv
