# 2) Data Preprocessing — Robust Pipeline


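We translate raw data into **model-ready features** using a reproducible pipeline: numeric columns are scaled, categorical columns are one-hot encoded, and the transformer is fit on the training split only and then saved to disk. Reusing that saved transformer ensures **consistency** between training and deployment.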


In [1]:

# Shared setup: stdlib helpers, warning policy, numeric stack, global seed
import json
import warnings

# Silence every warning category for the rest of the session. The filter is
# installed before the numeric imports so warnings raised while importing
# them are hidden as well.
warnings.filterwarnings(action="ignore")

import numpy as np
import pandas as pd

# Single seed for the notebook: it seeds NumPy's legacy global RNG here and is
# reused by later cells wherever an API accepts a `random_state`.
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)

In [6]:

from pathlib import Path

import joblib
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler

# Input dataset and the artifacts this cell writes.
DATA_PATH = "./data/processed.csv"
PREPROC_PATH = "./models/preprocessor.joblib"
FEATURE_META_PATH = "./models/feature_meta.json"
SPLIT_PATH = "./outputs/split_indices.json"

# Load the dataset and derive the binary target: 1 when the "y" column reads
# "yes" (case-insensitive), 0 otherwise. Every other column is a feature.
df = pd.read_csv(DATA_PATH)
y = (df["y"].astype(str).str.lower() == "yes").astype(int)
X = df.drop(columns=["y"])

# Stratified 80/20 hold-out split so both parts keep the same class balance;
# RANDOM_STATE makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Route columns by dtype. `is_numeric_dtype` accepts int/float/bool columns and
# rejects text regardless of whether pandas stores it as `object` or as its
# dedicated string dtype; comparing against 'object' alone would send
# string-dtype columns to the scaler and crash the fit.
numeric_features = [
    c for c in X_train.columns if pd.api.types.is_numeric_dtype(X_train[c])
]
_numeric_lookup = set(numeric_features)
categorical_features = [c for c in X_train.columns if c not in _numeric_lookup]

preprocess = ColumnTransformer(
    transformers=[
        # Scale only (no centering) for the numeric columns.
        ("num", RobustScaler(with_centering=False), numeric_features),
        # Categories unseen during fit are ignored instead of raising at
        # transform time.
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), categorical_features),
    ]
)

# Fit on train only
preprocess.fit(X_train)

# Make sure every output directory exists so a fresh checkout can run this cell
# without a FileNotFoundError from joblib.dump / open(..., "w").
for _artifact_path in (PREPROC_PATH, SPLIT_PATH, FEATURE_META_PATH):
    Path(_artifact_path).parent.mkdir(parents=True, exist_ok=True)

# Save preprocessor
joblib.dump(preprocess, PREPROC_PATH)

# Save split indices & feature metadata
with open(SPLIT_PATH, "w") as f:
    json.dump({"train_idx": X_train.index.tolist(), "valid_idx": X_valid.index.tolist()}, f)

with open(FEATURE_META_PATH, "w") as f:
    json.dump({
        "numeric_features": numeric_features,
        "categorical_features": categorical_features
    }, f, indent=2)

print("Saved preprocessor ->", PREPROC_PATH)
print("Saved split indices ->", SPLIT_PATH)
print("Saved feature metadata ->", FEATURE_META_PATH)

# Smoke tests: both splits must produce the same number of output features and
# keep one output row per input row.
Xt = preprocess.transform(X_train)
Xv = preprocess.transform(X_valid)
assert Xt.shape[1] == Xv.shape[1], "Train/valid feature dims mismatch."
assert Xt.shape[0] == len(X_train)
assert Xv.shape[0] == len(X_valid)
print("Preprocessing tests passed. Shapes:", Xt.shape, Xv.shape)


Saved preprocessor -> ./models/preprocessor.joblib
Saved split indices -> ./outputs/split_indices.json
Saved feature metadata -> ./models/feature_meta.json
Preprocessing tests passed. Shapes: (32950, 63) (8238, 63)
