In [1]:
# ============================
# Week 5-6: Preprocessing Setup
# ============================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("../data/raw/diabetic_data.csv")

# Step 1: Drop ID-like columns
df = df.drop(["encounter_id", "patient_nbr"], axis=1)

# Step 2: Replace "?" with NaN
df = df.replace("?", np.nan)

# Quick check missing values
print("Missing values per column:\n", df.isnull().sum())

# Step 3: Define target and features
X = df.drop("readmitted", axis=1)
y = df["readmitted"]

# Simplify target variable: "NO" vs "YES" (combine <30 and >30 as "YES")
y = y.replace({">30": "YES", "<30": "YES", "NO": "NO"})

# Step 4: Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 5: Separate categorical & numerical columns
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
numeric_cols = [col for col in X_train.columns if col not in categorical_cols]

print("Categorical features:", len(categorical_cols))
print("Numeric features:", len(numeric_cols))

# Step 6: Preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)

print("✅ Preprocessing pipeline ready!")

Missing values per column:
 race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide           