In [None]:
# Imports
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Single seed reused by both train/test splits, the XGBoost params and the
# RandomForest so that a fresh run reproduces the same numbers.
RANDOM_STATE = 42
# Input file passed to pd.read_csv; a bare filename, so it is resolved
# relative to the kernel's working directory.
CSV_PATH = 'UCI_Credit_Card.csv'


In [None]:
# Read the raw records from CSV_PATH into `df`, then preview the first five rows
df = pd.read_csv(filepath_or_buffer=CSV_PATH)
df.head(n=5)

In [None]:
# Feature engineering
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Derive ratio, month-to-month delta and aggregate features.

    The input frame is never modified; a new frame is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``LIMIT_BAL``, ``BILL_AMT1``..``BILL_AMT6`` and
        ``PAY_AMT1``..``PAY_AMT6``. ``ID`` and the repayment-status columns
        ``PAY_0``, ``PAY_2``..``PAY_6`` are optional.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` without ``ID`` and with these columns appended:
        ``bill_ratio_k`` / ``pay_ratio_k`` (k = 1..6),
        ``bill_diff_k`` / ``pay_diff_k`` (k = 2..6),
        ``bill_mean``, ``bill_std``, ``pay_mean``, ``pay_std`` and
        ``current_util``. Present ``PAY_*`` status columns are cast to int.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    out = df.copy()
    # Drop the row identifier if present; it carries no predictive signal.
    if 'ID' in out.columns:
        out = out.drop(columns=['ID'])

    bill_cols = [f'BILL_AMT{k}' for k in range(1, 7)]
    pay_cols = [f'PAY_AMT{k}' for k in range(1, 7)]
    # +1.0 keeps the denominator strictly positive for a zero limit.
    limit_denom = out['LIMIT_BAL'] + 1.0

    # Ratios: BILL/limit and PAY/BILL
    for k, (bill_col, pay_col) in enumerate(zip(bill_cols, pay_cols), start=1):
        out[f'bill_ratio_{k}'] = out[bill_col] / limit_denom
        # Bills can be negative (credit balance). Flooring at 0 before the +1
        # keeps the denominator >= 1, so a bill of -1 no longer yields
        # 0/0 = NaN or +/-inf, and a negative bill no longer flips the sign
        # of the ratio (which the clip below would then flatten to 0).
        bill_denom = out[bill_col].clip(lower=0) + 1.0
        # Clip to [0, 1.5] to bound the influence of extreme over-payments.
        out[f'pay_ratio_{k}'] = (out[pay_col] / bill_denom).clip(0, 1.5)

    # Deltas between adjacent monthly columns: column k minus column k-1.
    # NOTE(review): in the UCI dataset index 1 is the most recent month, so
    # these are "older minus newer" -- confirm that sign convention is intended.
    for k in range(2, 7):
        out[f'bill_diff_{k}'] = out[f'BILL_AMT{k}'] - out[f'BILL_AMT{k-1}']
        out[f'pay_diff_{k}'] = out[f'PAY_AMT{k}'] - out[f'PAY_AMT{k-1}']

    # Row-wise aggregates over the six months
    out['bill_mean'] = out[bill_cols].mean(axis=1)
    out['bill_std'] = out[bill_cols].std(axis=1)
    out['pay_mean'] = out[pay_cols].mean(axis=1)
    out['pay_std'] = out[pay_cols].std(axis=1)

    # Current utilization proxy. BILL_AMT1 is the most recent statement in the
    # UCI dataset (BILL_AMT6 is the oldest), so "current" must use index 1.
    # This equals bill_ratio_1; the column is kept under its own name so any
    # downstream code that references 'current_util' keeps working.
    out['current_util'] = out['BILL_AMT1'] / limit_denom

    # Ensure PAY_* status codes are integers (there is no PAY_1 in this dataset)
    for k in [0, 2, 3, 4, 5, 6]:
        col = f'PAY_{k}'
        if col in out.columns:
            out[col] = out[col].astype(int)

    return out

# Build the engineered frame from the raw one (raw `df` stays untouched) and preview it
df_fe = df.pipe(feature_engineering)
df_fe.head(n=5)

In [None]:
# Split data (stratified 60/20/20)
# The stock UCI/Kaggle export names the label 'default.payment.next.month'
# rather than 'default'; accept either so the cell does not die with a
# KeyError on the unmodified CSV. 'default' is still tried first.
TARGET_CANDIDATES = ('default', 'default.payment.next.month', 'default payment next month')
target_col = next((c for c in TARGET_CANDIDATES if c in df_fe.columns), None)
if target_col is None:
    raise KeyError(f"No target column found in df_fe; expected one of {TARGET_CANDIDATES}")

y = df_fe[target_col].astype(int)
X = df_fe.drop(columns=[target_col])

# Stage 1: hold out 20% of all rows as the final test set.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)
# Stage 2: 0.25 of the remaining 80% = 20% of all rows for validation,
# leaving 60% for training. Both stages stratify on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=RANDOM_STATE, stratify=y_full_train
)

features = list(X_train.columns)
X_train.shape, X_val.shape, X_test.shape

In [None]:
# XGBoost with early stopping and proper class imbalance handling
# scale_pos_weight = (#negatives / #positives) on the training split, so the
# minority class contributes comparable total gradient weight.
neg = int((y_train == 0).sum())
pos = int((y_train == 1).sum())
scale_pos_weight = float(neg) / float(pos) if pos > 0 else 1.0

dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=features)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.05,
    'max_depth': 5,
    'min_child_weight': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.0,
    'reg_lambda': 1.0,
    'scale_pos_weight': scale_pos_weight,
    'seed': RANDOM_STATE,
}

# Early stopping monitors the LAST entry of the watchlist, i.e. 'valid'.
watchlist = [(dtrain, 'train'), (dval, 'valid')]
xgb_model = xgb.train(params, dtrain, num_boost_round=2000, evals=watchlist, early_stopping_rounds=100, verbose_eval=100)

# Predict probabilities and compute AUC
# xgb.train returns the model from the last round, not the best one, so the
# prediction must be limited explicitly. iteration_range is half-open
# [begin, end) and best_iteration is a 0-based index, hence the +1; without it
# the best tree itself is dropped, and best_iteration == 0 would give (0, 0),
# which XGBoost interprets as "use every tree".
best_range = (0, xgb_model.best_iteration + 1)
y_val_pred = xgb_model.predict(dval, iteration_range=best_range)
y_test_pred = xgb_model.predict(dtest, iteration_range=best_range)
# The validation AUC is optimistic because the same split chose the stopping
# round; the test AUC is the unbiased figure.
val_auc = roc_auc_score(y_val, y_val_pred)
test_auc = roc_auc_score(y_test, y_test_pred)
val_auc, test_auc

In [None]:
# RandomForest baseline, scored by AUC on the predicted probability of class 1
# Fit on the training split only; .fit returns the estimator itself.
rf = RandomForestClassifier(
    random_state=RANDOM_STATE,
    n_jobs=-1,
    class_weight='balanced',
    min_samples_leaf=2,
    max_depth=None,
    n_estimators=500,
).fit(X_train, y_train)

# Column 1 of predict_proba holds the positive-class probability.
rf_val_pred, rf_test_pred = (
    rf.predict_proba(split)[:, 1] for split in (X_val, X_test)
)
rf_val_auc, rf_test_auc = (
    roc_auc_score(truth, score)
    for truth, score in ((y_val, rf_val_pred), (y_test, rf_test_pred))
)
rf_val_auc, rf_test_auc