# Export the dataset

This step preprocesses the data for modelling by:
  - Imputing missing numerical values with the median
  - Imputing missing categorical values with the mode, then one-hot encoding them
  - Aligning the train/test columns


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# One directory serves as both the source of the merged inputs and the
# destination of the model-ready exports; create it if it is missing.
CLEAN_DIR = Path("../data/clean")
CLEAN_DIR.mkdir(parents=True, exist_ok=True)

train_merged = pd.read_csv(CLEAN_DIR.joinpath("train_merged.csv"))
test_merged = pd.read_csv(CLEAN_DIR.joinpath("test_merged.csv"))

print("Loaded train_merged:", train_merged.shape)
print("Loaded test_merged:", test_merged.shape)

# Treat +/- infinity as missing so the imputers further down can fill it in.
train_merged = train_merged.replace(to_replace=[np.inf, -np.inf], value=np.nan)
test_merged = test_merged.replace(to_replace=[np.inf, -np.inf], value=np.nan)

def clip_extreme_values(df, max_abs=1e12):
    """Return a copy of ``df`` with numeric columns clipped to ``[-max_abs, max_abs]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to bound. It is left untouched; the clipping is applied to a copy
        so that calling this helper never alters the caller's data as a side effect.
    max_abs : float, default 1e12
        Largest absolute value allowed in any numeric column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``. Only columns selected by
        ``select_dtypes(include=[np.number])`` are clipped; all other columns
        are carried over unchanged.
    """
    clipped = df.copy()
    num_cols_local = clipped.select_dtypes(include=[np.number]).columns
    clipped[num_cols_local] = clipped[num_cols_local].clip(lower=-max_abs, upper=max_abs)
    return clipped

# Bound the numeric columns of both splits using the helper's default limit.
train_merged, test_merged = (
    clip_extreme_values(split) for split in (train_merged, test_merged)
)


Loaded train_merged: (307511, 181)
Loaded test_merged: (48744, 180)


### 1. Separate target and IDs

In [2]:
TARGET_COL = "TARGET"
ID_COL = "SK_ID_CURR"

# The label column is taken from the training split only.
y = train_merged.loc[:, TARGET_COL]

# Identifiers are set aside as plain arrays so they can be re-attached
# after the feature matrices have been transformed.
id_train = train_merged.loc[:, ID_COL].values
id_test = test_merged.loc[:, ID_COL].values

# Raw feature frames: train without the label, test as an independent copy.
X_train_raw = train_merged.drop(TARGET_COL, axis=1)
X_test_raw = test_merged.copy(deep=True)


### 2. Identify categorical vs numeric

In [3]:
# Everything except the identifier is a candidate model feature.
feature_cols = [c for c in X_train_raw.columns if c != ID_COL]

cat_cols = X_train_raw[feature_cols].select_dtypes(include=["object","category"]).columns.tolist()
num_cols = X_train_raw[feature_cols].select_dtypes(include=[np.number]).columns.tolist()

# Columns of any other dtype (e.g. bool, datetime) match neither selector.
# Because only cat_cols and num_cols are handed to the ColumnTransformer,
# which is built with remainder="drop", such columns would vanish from the
# exported data without any notice -- so surface them here.
unassigned_cols = [c for c in feature_cols if c not in cat_cols and c not in num_cols]

print("Number of categorical cols:", len(cat_cols))
print("Number of numeric cols:", len(num_cols))
if unassigned_cols:
    print("WARNING: columns in neither group (will be dropped):", unassigned_cols)



Number of categorical cols: 14
Number of numeric cols: 165


### 3. Build the preprocessing pipeline

In [4]:
# Numeric features: fill missing values with the column median.
numeric_transformer = SimpleImputer(strategy="median")

# Categorical features: fill missing values with the most frequent level, then
# expand into dense indicator columns. Levels not seen during fitting are
# ignored by the encoder instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Route each column group to its transformer; any column not listed is dropped.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ],
    remainder="drop",
)


### 4. Fit on train only, then transform both train and test sets

In [5]:
# Imputation statistics and category levels are learned from the training
# features only; the test features are transformed with those fitted values.
X_train_arr = preprocessor.fit_transform(X_train_raw.loc[:, feature_cols])
X_test_arr = preprocessor.transform(X_test_raw.loc[:, feature_cols])

print("Transformed train shape:", X_train_arr.shape)
print("Transformed test shape:", X_test_arr.shape)

Transformed train shape: (307511, 301)
Transformed test shape: (48744, 301)


### 5. Build feature names

In [6]:
# Ask the fitted transformers for their output names rather than assuming every
# input column survived: SimpleImputer discards columns that were entirely
# missing at fit time, which would leave a hand-built name list out of step
# with the transformed array.
num_imputer = preprocessor.named_transformers_["num"]
cat_pipeline = preprocessor.named_transformers_["cat"]
ohe = cat_pipeline.named_steps["encoder"]  # fitted encoder, kept under its original name

# With no numeric columns there is no fitted imputer to query.
num_feature_names = num_imputer.get_feature_names_out(num_cols).tolist() if num_cols else []
# The pipeline chains the imputer's surviving columns into the encoder's names.
ohe_feature_names = cat_pipeline.get_feature_names_out(cat_cols).tolist()

# Same order as the ColumnTransformer output: numeric block, then one-hot block.
final_feature_names = num_feature_names + ohe_feature_names

print("Feature count matches train array?",
      len(final_feature_names) == X_train_arr.shape[1])

# Stop here on a mismatch instead of failing later while building the DataFrames.
assert len(final_feature_names) == X_train_arr.shape[1], (
    f"{len(final_feature_names)} feature names for {X_train_arr.shape[1]} transformed columns"
)


Feature count matches train array? True


### 6. Wrap back into DataFrames

In [7]:
# Rebuild labelled frames from the transformed arrays, keeping the raw frames'
# row index, then re-attach the identifier (first column) and, for train, the target.
X_train_model = pd.DataFrame(X_train_arr, columns=final_feature_names, index=X_train_raw.index)
X_train_model.insert(0, ID_COL, id_train)
X_train_model[TARGET_COL] = y.values

X_test_model = pd.DataFrame(X_test_arr, columns=final_feature_names, index=X_test_raw.index)
X_test_model.insert(0, ID_COL, id_test)

print("X_train_model shape:", X_train_model.shape)
print("X_test_model shape:", X_test_model.shape)

# The labels say "%", so scale the share of rows containing any NaN to a
# percentage (0-100) instead of printing a 0-1 fraction.
print("Train NaN % rows w/ any NaN:", 100 * X_train_model.isna().any(axis=1).mean())
print("Test NaN % rows w/ any NaN:", 100 * X_test_model.isna().any(axis=1).mean())

X_train_model shape: (307511, 303)
X_test_model shape: (48744, 302)
Train NaN % rows w/ any NaN: 0.0
Test NaN % rows w/ any NaN: 0.0


### 7. Export final modeling datasets

In [8]:
# Destination files sit alongside the merged inputs in CLEAN_DIR.
model_train_path = CLEAN_DIR.joinpath("model_train.csv")
model_test_path = CLEAN_DIR.joinpath("model_test.csv")

# index=False: the row index is not written to the CSV files.
X_train_model.to_csv(path_or_buf=model_train_path, index=False)
X_test_model.to_csv(path_or_buf=model_test_path, index=False)

print(f"Saved {model_train_path}")
print(f"Saved {model_test_path}")


Saved ../data/clean/model_train.csv
Saved ../data/clean/model_test.csv
