**Feature selection**

In [25]:
import numpy as np
import pandas as pd

def engineer_v2(df_train, df_val, df_test):
    Xtr, Xv, Xt = df_train.copy(), df_val.copy(), df_test.copy()

    # --- 0) Ensure no 'category' dtype columns block assignments ---
    for X in (Xtr, Xv, Xt):
        for c in X.select_dtypes(include=['category']).columns:
            X[c] = X[c].astype(str)  # drop categorical dtype

    # --- 0b) Helper: coerce potential numeric sources to numeric (no crash on strings) ---
    def to_num(X, col):
        if col and col in X.columns:
            return pd.to_numeric(X[col], errors='coerce')
        return None

    inc_tr = to_num(Xtr, C_INCOME)
    loan_tr = to_num(Xtr, C_LOAN_AMT)
    lim_tr = to_num(Xtr, C_LIMIT)
    bal_tr = to_num(Xtr, C_BAL)
    util_tr = to_num(Xtr, C_UIL)
    inst_tr = to_num(Xtr, C_INSTALL)

    # --- 1) Ratios & rates (fit-free, applied columnwise) ---
    # Debt-to-Income: installment / income
    if C_INSTALL and C_INCOME:
        for X in (Xtr, Xv, Xt):
            inst = to_num(X, C_INSTALL)
            inc  = to_num(X, C_INCOME)
            if inst is not None and inc is not None:
                X["dti_install"] = inst / inc.replace(0, np.nan)

    # Loan-to-Income
    if C_LOAN_AMT and C_INCOME:
        for X in (Xtr, Xv, Xt):
            loan = to_num(X, C_LOAN_AMT)
            inc  = to_num(X, C_INCOME)
            if loan is not None and inc is not None:
                X["loan_to_income"] = loan / inc.replace(0, np.nan)

    # Balance-to-Limit (utilization proxy)
    if C_BAL and C_LIMIT:
        for X in (Xtr, Xv, Xt):
            bal = to_num(X, C_BAL)
            lim = to_num(X, C_LIMIT)
            if bal is not None and lim is not None:
                X["bal_to_limit"] = bal / lim.replace(0, np.nan)

    # Existing utilization column (clip)
    if C_UIL and C_UIL in Xtr.columns:
        for X in (Xtr, Xv, Xt):
            util = to_num(X, C_UIL)
            if util is not None:
                X["util_capped"] = util.clip(lower=0, upper=1.5)

    # --- 2) Binning (creates NEW categorical columns; no fillna with numbers) ---
    if C_AGE and C_AGE in Xtr.columns:
        bins = [0, 21, 25, 30, 35, 45, 55, 65, np.inf]
        labels = ["<=21","22-25","26-30","31-35","36-45","46-55","56-65","65+"]
        for X in (Xtr, Xv, Xt):
            X["age_band"] = pd.cut(pd.to_numeric(X[C_AGE], errors='coerce'),
                                   bins=bins, labels=labels, right=True, include_lowest=True)

    if C_SCORE and C_SCORE in Xtr.columns:
        bins = [0, 580, 620, 660, 700, 740, 780, 900]
        labels = ["<580","580-619","620-659","660-699","700-739","740-779","780+"]
        for X in (Xtr, Xv, Xt):
            X["score_band"] = pd.cut(pd.to_numeric(X[C_SCORE], errors='coerce'),
                                     bins=bins, labels=labels, right=True, include_lowest=True)

    # --- 3) Frequency (count) encoding for ALL categorical/object columns ---
    # Learn on TRAIN only; apply mapping; NEW numeric columns with suffix __freq
    cat_cols = Xtr.select_dtypes(exclude=[np.number]).columns.tolist()
    cat_cols = [c for c in cat_cols if c != TARGET]
    freq_maps = {}
    for c in cat_cols:
        freq = Xtr[c].astype(str).value_counts(dropna=False)
        freq_maps[c] = freq
        for X in (Xtr, Xv, Xt):
            X[f"{c}__freq"] = X[c].astype(str).map(freq).fillna(0).astype(float)

    # --- 4) Rare-category grouping into "__OTHER__" (strings only, no numbers) ---
    for c in cat_cols:
        freq = freq_maps[c]
        cutoff = max(2, int(0.01 * len(Xtr)))  # <=1% or <=2 rows
        rare = set(freq[freq <= cutoff].index.astype(str))

        def group_to_other(s):
            s = s.astype(str)
            return s.where(~s.isin(rare), other="__OTHER__")

        for X in (Xtr, Xv, Xt):
            X[f"{c}__grp"] = group_to_other(X[c])

    # --- 5) Simple interactions (numeric only) ---
    pairs = []
    if C_INCOME and C_LOAN_AMT: pairs.append((C_LOAN_AMT, C_INCOME, "loan_x_income"))
    if C_BAL and C_LIMIT:       pairs.append((C_BAL, C_LIMIT, "bal_x_limit"))
    for a,b,name in pairs:
        for X in (Xtr, Xv, Xt):
            va = pd.to_numeric(X[a], errors='coerce') if a in X.columns else None
            vb = pd.to_numeric(X[b], errors='coerce') if b in X.columns else None
            if va is not None and vb is not None:
                X[name] = va * vb

    # --- 6) Clean infinities produced by divisions ---
    for X in (Xtr, Xv, Xt):
        X.replace([np.inf, -np.inf], np.nan, inplace=True)

    return Xtr, Xv, Xt, {"cat_cols": cat_cols}


In [27]:
# --- Detect common finance columns in your data (sets the C_* variables engineer_v2 uses) ---
import re

all_cols = list(train.columns)

def find_col(patterns, cols=all_cols):
    """Return the first column matching any regex pattern (case-insensitive), else None."""
    for pat in patterns:
        for c in cols:
            if re.search(pat, c, re.I):
                return c
    return None

C_INCOME   = find_col([r"\b(income|salary|annual[_ ]?inc)\b"])
C_LOAN_AMT = find_col([r"\b(loan[_ ]?amount|principal|disbursed|amt)\b"])
C_LIMIT    = find_col([r"\b(limit|credit[_ ]?limit)\b"])
C_BAL      = find_col([r"\b(balance|bal|outstanding)\b"])
C_UIL      = find_col([r"\b(util(?:ization|isation)?|util)\b"])
C_INSTALL  = find_col([r"\b(install(?:ment)?|emi|payment[_ ]?amount|monthly[_ ]?pay)\b"])
C_AGE      = find_col([r"\b(age|years[_ ]?old)\b"])
C_SCORE    = find_col([r"\b(credit[_ ]?score|score)\b"])

print("Detected columns:")
print("  INCOME     ->", C_INCOME)
print("  LOAN_AMT   ->", C_LOAN_AMT)
print("  LIMIT      ->", C_LIMIT)
print("  BALANCE    ->", C_BAL)
print("  UTIL       ->", C_UIL)
print("  INSTALL    ->", C_INSTALL)
print("  AGE        ->", C_AGE)
print("  SCORE      ->", C_SCORE)


Detected columns:
  INCOME     -> None
  LOAN_AMT   -> LoanAmount
  LIMIT      -> None
  BALANCE    -> None
  UTIL       -> None
  INSTALL    -> None
  AGE        -> Age
  SCORE      -> CreditScore


In [34]:
Xtr = train.drop(columns=[TARGET]); ytr = train[TARGET]
Xv  = val.drop(columns=[TARGET]);   yv  = val[TARGET]
Xt  = test.drop(columns=[TARGET]);  yt  = test[TARGET]

Xtr_fe, Xv_fe, Xt_fe, meta = engineer_v2(Xtr, Xv, Xt)

train_fe = Xtr_fe.copy(); train_fe[TARGET] = ytr.values
val_fe   = Xv_fe.copy();  val_fe[TARGET]   = yv.values
test_fe  = Xt_fe.copy();  test_fe[TARGET]  = yt.values


In [32]:
# --- 0) Setup & find inputs ---
import pandas as pd, numpy as np, pathlib, re, os

OUT = pathlib.Path("results/outputs"); OUT.mkdir(parents=True, exist_ok=True)
TARGET = "Default"  # change if your label is different

def pick(*names):
    for n in names:
        p = OUT/n
        if p.exists(): return p
    return None

p_train = pick("train_outliers_capped.csv","train_missing_handled.csv","train_split.csv")
p_val   = pick("val_outliers_capped.csv","val_missing_handled.csv","val_split.csv")
p_test  = pick("test_outliers_capped.csv","test_missing_handled.csv","test_split.csv")
assert p_train and p_val and p_test, "Upstream files not found. Run earlier steps first."

train = pd.read_csv(p_train); val = pd.read_csv(p_val); test = pd.read_csv(p_test)
assert TARGET in train.columns, f"{TARGET} not in train."

# --- 1) Column detection used by engineer_v2 ---
all_cols = list(train.columns)
def find_col(patterns, cols=all_cols):
    for pat in patterns:
        for c in cols:
            if re.search(pat, c, re.I): return c
    return None

C_INCOME   = find_col([r"\b(income|salary|annual[_ ]?inc)\b"])
C_LOAN_AMT = find_col([r"\b(loan[_ ]?amount|principal|disbursed|amt)\b"])
C_LIMIT    = find_col([r"\b(limit|credit[_ ]?limit)\b"])
C_BAL      = find_col([r"\b(balance|bal|outstanding)\b"])
C_UIL      = find_col([r"\b(util(?:ization|isation)?|util)\b"])
C_INSTALL  = find_col([r"\b(install(?:ment)?|emi|payment[_ ]?amount|monthly[_ ]?pay)\b"])
C_AGE      = find_col([r"\b(age|years[_ ]?old)\b"])
C_SCORE    = find_col([r"\b(credit[_ ]?score|score)\b"])

# --- 2) Robust feature engineering (engineer_v2) ---
def engineer_v2(df_train, df_val, df_test):
    Xtr, Xv, Xt = df_train.copy(), df_val.copy(), df_test.copy()

    # drop pandas 'category' dtype (prevents setitem errors)
    for X in (Xtr, Xv, Xt):
        for c in X.select_dtypes(include=['category']).columns:
            X[c] = X[c].astype(str)

    def to_num(X, col):
        if col and col in X.columns:
            return pd.to_numeric(X[col], errors='coerce')
        return None

    # ratios
    if C_INSTALL and C_INCOME:
        for X in (Xtr, Xv, Xt):
            inst = to_num(X, C_INSTALL); inc = to_num(X, C_INCOME)
            if inst is not None and inc is not None:
                X["dti_install"] = inst / inc.replace(0, np.nan)

    if C_LOAN_AMT and C_INCOME:
        for X in (Xtr, Xv, Xt):
            loan = to_num(X, C_LOAN_AMT); inc = to_num(X, C_INCOME)
            if loan is not None and inc is not None:
                X["loan_to_income"] = loan / inc.replace(0, np.nan)

    if C_BAL and C_LIMIT:
        for X in (Xtr, Xv, Xt):
            bal = to_num(X, C_BAL); lim = to_num(X, C_LIMIT)
            if bal is not None and lim is not None:
                X["bal_to_limit"] = bal / lim.replace(0, np.nan)

    if C_UIL:
        for X in (Xtr, Xv, Xt):
            util = to_num(X, C_UIL)
            if util is not None:
                X["util_capped"] = util.clip(0, 1.5)

    # bins
    if C_AGE:
        bins = [0,21,25,30,35,45,55,65,np.inf]
        labels = ["<=21","22-25","26-30","31-35","36-45","46-55","56-65","65+"]
        for X in (Xtr, Xv, Xt):
            X["age_band"] = pd.cut(pd.to_numeric(X[C_AGE], errors='coerce'),
                                   bins=bins, labels=labels, include_lowest=True, right=True)

    if C_SCORE:
        bins = [0,580,620,660,700,740,780,900]
        labels = ["<580","580-619","620-659","660-699","700-739","740-779","780+"]
        for X in (Xtr, Xv, Xt):
            X["score_band"] = pd.cut(pd.to_numeric(X[C_SCORE], errors='coerce'),
                                     bins=bins, labels=labels, include_lowest=True, right=True)

    # frequency encoding (fit on train)
    cat_cols = Xtr.select_dtypes(exclude=[np.number]).columns.tolist()
    cat_cols = [c for c in cat_cols if c != TARGET]
    freq_maps = {}
    for c in cat_cols:
        freq = Xtr[c].astype(str).value_counts(dropna=False)
        freq_maps[c] = freq
        for X in (Xtr, Xv, Xt):
            X[f"{c}__freq"] = X[c].astype(str).map(freq).fillna(0).astype(float)

    # rare grouping
    for c in cat_cols:
        freq = freq_maps[c]
        cutoff = max(2, int(0.01*len(Xtr)))
        rare = set(freq[freq <= cutoff].index.astype(str))
        def group_other(s):
            s = s.astype(str)
            return s.where(~s.isin(rare), "__OTHER__")
        for X in (Xtr, Xv, Xt):
            X[f"{c}__grp"] = group_other(X[c])

    # simple interactions
    pairs = []
    if C_INCOME and C_LOAN_AMT: pairs.append((C_LOAN_AMT, C_INCOME, "loan_x_income"))
    if C_BAL and C_LIMIT:       pairs.append((C_BAL, C_LIMIT, "bal_x_limit"))
    for a,b,name in pairs:
        for X in (Xtr, Xv, Xt):
            va = to_num(X, a); vb = to_num(X, b)
            if va is not None and vb is not None:
                X[name] = va * vb

    for X in (Xtr, Xv, Xt):
        X.replace([np.inf,-np.inf], np.nan, inplace=True)

    return Xtr, Xv, Xt, {"cat_cols": cat_cols}

# --- 3) Apply FE & save ---
Xtr = train.drop(columns=[TARGET]); ytr = train[TARGET]
Xv  = val.drop(columns=[TARGET]);   yv  = val[TARGET]
Xt  = test.drop(columns=[TARGET]);  yt  = test[TARGET]

Xtr_fe, Xv_fe, Xt_fe, meta = engineer_v2(Xtr, Xv, Xt)

train_fe = Xtr_fe.copy(); train_fe[TARGET] = ytr.values
val_fe   = Xv_fe.copy();  val_fe[TARGET]   = yv.values
test_fe  = Xt_fe.copy();  test_fe[TARGET]  = yt.values

train_fe.to_csv(OUT/"train_feature_engineered.csv", index=False)
val_fe.to_csv(  OUT/"val_feature_engineered.csv",   index=False)
test_fe.to_csv( OUT/"test_feature_engineered.csv",  index=False)
print("Saved FE files:",
      OUT/"train_feature_engineered.csv",
      OUT/"val_feature_engineered.csv",
      OUT/"test_feature_engineered.csv")

# --- 4) Now load to dfx safely ---
dfx = pd.read_csv(OUT/"train_feature_engineered.csv")
print("Loaded dfx:", dfx.shape)
dfx.dtypes.head(20)


Saved FE files: results/outputs/train_feature_engineered.csv results/outputs/val_feature_engineered.csv results/outputs/test_feature_engineered.csv
Loaded dfx: (862, 14)


Unnamed: 0,0
CustomerID,float64
Age,float64
AnnualIncome,float64
LoanAmount,float64
CreditScore,float64
LoanTerm,float64
ExistingDebt,float64
age_band,object
score_band,object
age_band__freq,float64
