# Export the dataset

This step prepares the data for modelling by:
  - Imputing numerical variables with the median
  - Imputing categorical variables with the mode, then one-hot encoding them
  - Aligning the train/test columns


In [1]:
# Imports & configuration
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Project layout. The project root is taken to be the parent of the
# current working directory, resolved to an absolute path.
PROJECT_ROOT = Path("..").resolve()
RAW_DIR = PROJECT_ROOT.joinpath("data", "raw")
CLEAN_DIR = PROJECT_ROOT.joinpath("data", "clean")
MODEL_DIR = PROJECT_ROOT.joinpath("model")

# Create the output folders up front; folders that already exist are fine.
for _out_dir in (CLEAN_DIR, MODEL_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations.
for _label, _location in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("CLEAN_DIR:", CLEAN_DIR),
    ("MODEL_DIR:", MODEL_DIR),
):
    print(_label, _location)

PROJECT_ROOT: /Users/domnjue/Desktop/Data-Science/projects/home_credit_default_risk
CLEAN_DIR: /Users/domnjue/Desktop/Data-Science/projects/home_credit_default_risk/data/clean
MODEL_DIR: /Users/domnjue/Desktop/Data-Science/projects/home_credit_default_risk/model


### 1. Separate target and IDs

In [2]:
# Read the merged application + bureau + behavior tables stored under CLEAN_DIR
train_merged = pd.read_csv(CLEAN_DIR.joinpath("train_merged.csv"))
test_merged = pd.read_csv(CLEAN_DIR.joinpath("test_merged.csv"))

print("Loaded train_merged:", train_merged.shape)
print("Loaded test_merged:", test_merged.shape)

# Safety measure: turn +inf / -inf into NaN in both splits
train_merged, test_merged = (
    merged.replace(to_replace=[np.inf, -np.inf], value=np.nan)
    for merged in (train_merged, test_merged)
)

def clip_extreme_values(df: pd.DataFrame, max_abs: float = 1e12) -> pd.DataFrame:
    """Return a copy of ``df`` whose numeric columns are bounded in magnitude.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is copied first and never modified.
    max_abs : float, default 1e12
        Largest absolute value allowed. Numeric entries outside
        ``[-max_abs, max_abs]`` are replaced by the nearest bound.

    Returns
    -------
    pd.DataFrame
        The copy, with every column selected by ``np.number`` clipped and
        all other columns carried over as they were.
    """
    bounded = df.copy()
    # Only columns with a numeric dtype take part in the clipping.
    numeric_names = bounded.select_dtypes(include=[np.number]).columns
    clipped_block = bounded[numeric_names].clip(lower=-max_abs, upper=max_abs)
    bounded[numeric_names] = clipped_block
    return bounded

# Bound extreme magnitudes in both splits with clip_extreme_values
train_merged, test_merged = map(clip_extreme_values, (train_merged, test_merged))

Loaded train_merged: (307511, 181)
Loaded test_merged: (48744, 180)


### 2. Identify categorical vs numeric

In [3]:
# Split the merged frames into label, identifiers and raw features.
# Column roles: the label column and the identifier column.
TARGET_COL = "TARGET"
ID_COL = "SK_ID_CURR"

# Raw feature frames: the identifier stays in for now, the label is
# removed from train (the test frame is simply copied).
X_train_raw = train_merged.drop(TARGET_COL, axis=1)
X_test_raw = test_merged.copy()

# Label vector, cast to integer
y = train_merged[TARGET_COL].astype(int)

# Identifiers kept as plain arrays so they can be re-attached later
id_train = train_merged[ID_COL].values
id_test = test_merged[ID_COL].values

print("X_train_raw:", X_train_raw.shape)
print("X_test_raw:", X_test_raw.shape)

X_train_raw: (307511, 180)
X_test_raw: (48744, 180)


### 3. Build the preprocessing pipeline

In [4]:
# Every column except the identifier is a candidate feature
feature_cols = [name for name in X_train_raw.columns if name != ID_COL]

# Split the candidates by dtype. Columns whose dtype is neither
# object/category nor numeric (e.g. bool, datetime) land in neither list.
cat_cols = list(
    X_train_raw[feature_cols].select_dtypes(include=["object", "category"]).columns
)
num_cols = list(
    X_train_raw[feature_cols].select_dtypes(include=[np.number]).columns
)

for _label, _cols in (
    ("Total feature cols:", feature_cols),
    ("Number of categorical cols:", cat_cols),
    ("Number of numeric cols:", num_cols),
):
    print(_label, len(_cols))


Total feature cols: 179
Number of categorical cols: 14
Number of numeric cols: 165


### 4. Fit on train only, then transform both train and test sets

In [5]:
# Preprocessing recipe: impute everything, one-hot encode the categoricals.

# Numeric columns: fill gaps with the column median
numeric_transformer = SimpleImputer(strategy="median")

# Categorical columns: fill gaps with the most frequent level, then expand
# into dense indicator columns. Levels not seen during fit are ignored at
# transform time instead of raising.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)

# Route each column list to its transformer; any column that appears in
# neither list is dropped.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ],
    remainder="drop",
)

print(preprocessor)


ColumnTransformer(transformers=[('num', SimpleImputer(strategy='median'),
                                 ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
                                  'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
                                  'AMT_CREDIT', 'AMT_ANNUITY',
                                  'AMT_GOODS_PRICE',
                                  'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH',
                                  'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
                                  'DAYS_ID_PUBLISH', 'OWN_CAR_AGE',
                                  'FLAG_MOBIL', 'FLAG_EMP_PHONE',
                                  'FLAG_WORK_PHONE', 'FLAG_C...
                                                 ('encoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 ['NAME_CONTRACT_TYPE', 'CODE_GENDER',
             

### 5. Build feature names

In [6]:
# Learn the imputation values and category levels from the training split
# only, then apply the fitted preprocessor to the test split as well.
X_train_arr = preprocessor.fit_transform(X_train_raw.loc[:, feature_cols])
X_test_arr = preprocessor.transform(X_test_raw.loc[:, feature_cols])

print("Transformed train shape:", X_train_arr.shape)
print("Transformed test shape:", X_test_arr.shape)


Transformed train shape: (307511, 301)
Transformed test shape: (48744, 301)


### 6. Wrap back into DataFrames

In [7]:
# Extract final feature names

# Names are derived from the *fitted* transformers rather than from the raw
# column lists. SimpleImputer discards columns that were entirely missing
# at fit time (default keep_empty_features=False), so the raw lists could
# otherwise fall out of step with the columns of the transformed array.
num_imputer = preprocessor.named_transformers_["num"]
cat_pipeline = preprocessor.named_transformers_["cat"]

# Numeric block: names of the columns that survived imputation
# (an empty numeric selection yields no names).
num_feature_names = (
    num_imputer.get_feature_names_out(num_cols).tolist() if num_cols else []
)

# Categorical block: the pipeline threads the names through the imputer and
# then the encoder, giving one name per one-hot column.
ohe = cat_pipeline.named_steps["encoder"]
ohe_feature_names = cat_pipeline.get_feature_names_out(cat_cols).tolist()

# Same order as the ColumnTransformer output: numeric block, then one-hot block
final_feature_names = num_feature_names + ohe_feature_names

print("Number of final features:", len(final_feature_names))
print(
    "Feature count matches train array?",
    len(final_feature_names) == X_train_arr.shape[1],
)


Number of final features: 301
Feature count matches train array? True


### 7. Export final modeling datasets

In [8]:
# Wrap the transformed arrays back into labelled DataFrames
# (X_train_model / X_test_model), reusing the row index of the raw frames.

# Train: identifier first, features in the middle, label last
X_train_model = pd.DataFrame(
    X_train_arr, index=X_train_raw.index, columns=final_feature_names
)
X_train_model.insert(loc=0, column=ID_COL, value=id_train)
X_train_model[TARGET_COL] = y.values

# Test: identifier first, then features (no label column)
X_test_model = pd.DataFrame(
    X_test_arr, index=X_test_raw.index, columns=final_feature_names
)
X_test_model.insert(loc=0, column=ID_COL, value=id_test)

print("X_train_model shape:", X_train_model.shape)
print("X_test_model shape:", X_test_model.shape)

# Share of rows that still contain at least one NaN. Despite the "%" in the
# label, the printed figure is a fraction in [0, 1].
for _label, _frame in (
    ("Train NaN % rows w/ any NaN:", X_train_model),
    ("Test NaN % rows w/ any NaN:", X_test_model),
):
    print(_label, np.mean(_frame.isna().any(axis=1)))


X_train_model shape: (307511, 303)
X_test_model shape: (48744, 302)
Train NaN % rows w/ any NaN: 0.0
Test NaN % rows w/ any NaN: 0.0


In [9]:
# Persist the model matrices (CSV) and the preprocessing artifacts (joblib)
model_train_path = CLEAN_DIR.joinpath("model_train.csv")
model_test_path = CLEAN_DIR.joinpath("model_test.csv")

# Row index is not written; the identifier column is already part of the frame
X_train_model.to_csv(model_train_path, index=False)
X_test_model.to_csv(model_test_path, index=False)

print(f"Saved {model_train_path}", f"Saved {model_test_path}", sep="\n")

# Feature names, saved for deployment
feature_names_path = MODEL_DIR.joinpath("feature_names_raw_pipeline.pkl")
joblib.dump(value=final_feature_names, filename=feature_names_path)
print(f"Saved feature names to {feature_names_path}")

# The fitted sklearn preprocessor, saved so the API can reuse identical logic
preprocessor_path = MODEL_DIR.joinpath("preprocessor.pkl")
joblib.dump(value=preprocessor, filename=preprocessor_path)
print(f"Saved preprocessor to {preprocessor_path}")


Saved /Users/domnjue/Desktop/Data-Science/projects/home_credit_default_risk/data/clean/model_train.csv
Saved /Users/domnjue/Desktop/Data-Science/projects/home_credit_default_risk/data/clean/model_test.csv
Saved feature names to /Users/domnjue/Desktop/Data-Science/projects/home_credit_default_risk/model/feature_names_raw_pipeline.pkl
Saved preprocessor to /Users/domnjue/Desktop/Data-Science/projects/home_credit_default_risk/model/preprocessor.pkl
