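**Encoding** — numeric-only preprocessing (median imputation); this dataset has no categorical columns to encode.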

In [50]:
import pandas as pd, numpy as np, pathlib, joblib
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

OUT = pathlib.Path("results/outputs")
TARGET = "Default"

# Load the outlier-capped splits from OUT.
train = pd.read_csv(OUT/"train_outliers_capped.csv")
val   = pd.read_csv(OUT/"val_outliers_capped.csv")
test  = pd.read_csv(OUT/"test_outliers_capped.csv")

# Split every frame into features (X_*) and the integer-cast label (y_*).
X_train, y_train = train.drop(columns=[TARGET]), train[TARGET].astype(int)
X_val,   y_val   = val.drop(columns=[TARGET]),   val[TARGET].astype(int)
X_test,  y_test  = test.drop(columns=[TARGET]),  test[TARGET].astype(int)

# Column typing is decided on the training split only, then reused for val/test.
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X_train.columns if c not in num_cols]
print(f"Numeric: {len(num_cols)} | Categorical: {len(cat_cols)}")
if cat_cols:
    # remainder="drop" below discards these columns; report it instead of losing them silently.
    print(f"Note: non-numeric columns dropped by the numeric-only preprocessor: {cat_cols}")

# NOTE(review): the saved training file's header includes CustomerID among the
# features; it looks like a row identifier rather than a predictor — confirm
# whether it should be excluded before modelling.

# numeric-only preprocessor: median-impute numeric columns, drop everything else
pre = ColumnTransformer(
    transformers=[("num", Pipeline([("impute", SimpleImputer(strategy="median"))]), num_cols)],
    remainder="drop",
    verbose_feature_names_out=False
)

# Fit on train only; val/test are transformed with the train-derived medians.
Xtr = pre.fit_transform(X_train)
Xv  = pre.transform(X_val)
Xt  = pre.transform(X_test)

# Take the output names from the fitted transformer rather than assuming they
# equal num_cols: SimpleImputer discards columns that were entirely missing at
# fit time, which would make the array narrower than num_cols and break the
# DataFrame construction below.
feat_names = pre.get_feature_names_out().tolist()

# Write each encoded split with the target re-attached as the last column.
# Loop variables are underscore-prefixed so they do not clobber notebook globals.
for _split, _X_enc, _y in (("train", Xtr, y_train), ("val", Xv, y_val), ("test", Xt, y_test)):
    (
        pd.DataFrame(_X_enc, columns=feat_names)
        .assign(**{TARGET: _y.values})
        .to_csv(OUT/f"{_split}_encoded.csv", index=False)
    )

# NOTE(review): the file name says "cats" although this preprocessor is
# numeric-only; the path is kept unchanged in case other cells load it.
joblib.dump(pre, OUT/"preprocessor_cats.joblib")
print("Saved numeric-only encoded files.")


Numeric: 7 | Categorical: 0
Saved numeric-only encoded files.


In [51]:
import pandas as pd, pathlib
OUT = pathlib.Path("results/outputs")

# Sanity check: for each encoded split, print its shape and whether the
# label column is present in the file.
encoded_files = ("train_encoded.csv", "val_encoded.csv", "test_encoded.csv")
for f in encoded_files:
    df = pd.read_csv(OUT / f)
    has_target = "Default" in df.columns
    print(f, df.shape, "| target present? ->", has_target)

# peek features: re-read the training split and show its first rows
train_preview = pd.read_csv(OUT / "train_encoded.csv")
train_preview.head()


train_encoded.csv (862, 8) | target present? -> True
val_encoded.csv (150, 8) | target present? -> True
test_encoded.csv (150, 8) | target present? -> True


Unnamed: 0,CustomerID,Age,AnnualIncome,LoanAmount,CreditScore,LoanTerm,ExistingDebt,Default
0,944.0,61.0,89659.0,29759.0,801.0,36.0,21856.0,0
1,200.0,61.0,49569.0,45087.0,452.0,12.0,12558.0,1
2,778.0,38.0,97010.0,19387.0,444.0,36.0,11979.0,1
3,366.0,22.0,83605.0,11322.0,447.0,24.0,7447.0,1
4,392.0,28.0,95022.0,25231.0,330.0,12.0,2835.0,1
