In [1]:
# ===============================================================
# CREDIT SCORING ‚Äî ADVANCED FEATURE & HYPERPARAMETER TUNED MODEL
# ===============================================================

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import optuna
import warnings
warnings.filterwarnings("ignore")

# ---------- Load Data ----------
train = pd.read_csv("/kaggle/input/aiful-dataset/train.csv")
test = pd.read_csv("/kaggle/input/aiful-dataset/test.csv")
sample_submission = pd.read_csv("/kaggle/input/aiful-dataset/sample_submission.csv")
TARGET = "Default 12 Flag"

# ---------- Basic Cleaning ----------
for df in [train, test]:
    df['JIS Address Code'] = df['JIS Address Code'].fillna(-999).astype(int)
    df['Application Date'] = pd.to_datetime(df['Application Date'])
    df['Date of Birth'] = pd.to_datetime(df['Date of Birth'])
    df['Application_Hour'] = df['Application Time'] // 10000
    df['Application_Month'] = df['Application Date'].dt.month
    df['Application_DayOfWeek'] = df['Application Date'].dt.dayofweek
    df['Age'] = (df['Application Date'] - df['Date of Birth']).dt.days / 365.25

# ---------- Financial Ratios ----------
epsilon = 1e-6
for df in [train, test]:
    df['MONTHLY_INCOME'] = df['Total Annual Income'] / 12
    df['LOAN_TO_INCOME_RATIO'] = df['Application Limit Amount(Desired)'] / (df['Total Annual Income'] + epsilon)
    df['DECLARED_DEBT_TO_INCOME_RATIO'] = df['Declared Amount of Unsecured Loans'] / (df['Total Annual Income'] + epsilon)
    df['ACTUAL_DEBT_TO_INCOME_RATIO'] = df['Amount of Unsecured Loans'] / (df['Total Annual Income'] + epsilon)
    df['RENT_TO_INCOME'] = df['Rent Burden Amount'] / (df['MONTHLY_INCOME'] + epsilon)
    df['AGE_BUCKET'] = pd.cut(df['Age'], bins=[0,25,35,45,55,100], labels=[0,1,2,3,4]).astype(int)

# ---------- Stability & Employment ----------
for df in [train, test]:
    df['EMPLOYMENT_YEARS'] = df['Duration of Employment at Company (Months)'] / 12
    df['STABILITY_RATIO'] = df['EMPLOYMENT_YEARS'] / (df['Age'] + epsilon)
    df['INCOME_PER_DEPENDENT'] = df['Total Annual Income'] / (df['Number of Dependents'] + 1)
    df['INCOME_PER_AGE'] = df['Total Annual Income'] / (df['Age'] + epsilon)

# ---------- Advanced Financial Features ----------
for df in [train, test]:
    df['DECLARED_VS_ACTUAL_RATIO'] = df['Declared Amount of Unsecured Loans'] / (df['Amount of Unsecured Loans'] + epsilon)
    df['AVG_LOAN_TO_LIMIT'] = (df['Amount of Unsecured Loans'] / (df['Number of Unsecured Loans'] + 1)) / (df['Application Limit Amount(Desired)'] + epsilon)
    df['AVG_RENT_BURDEN_RATIO'] = df['Rent Burden Amount'] / (df['Declared Amount of Unsecured Loans'] + 1)
    df['LIQUIDITY_SCORE'] = df['Total Annual Income'] / (df['Rent Burden Amount'] + df['Amount of Unsecured Loans'] + epsilon)
    df['FIN_STABILITY_SCORE'] = df['EMPLOYMENT_YEARS'] * df['INCOME_PER_AGE']
    df['BURDEN_SCORE'] = df['RENT_TO_INCOME'] + df['LOAN_TO_INCOME_RATIO'] + df['ACTUAL_DEBT_TO_INCOME_RATIO']

# ---------- Group Normalization ----------
group_cols = ['JIS Address Code', 'Industry Type', 'Employment Type']
agg_cols = ['Total Annual Income', 'ACTUAL_DEBT_TO_INCOME_RATIO']

for gcol in group_cols:
    for acol in agg_cols:
        stats = train.groupby(gcol)[acol].mean().reset_index()
        stats.rename(columns={acol: f"{acol}_{gcol}_mean"}, inplace=True)
        train = train.merge(stats, on=gcol, how='left')
        test = test.merge(stats, on=gcol, how='left')
        train[f"{acol}_to_{gcol}_rel"] = train[acol] / (train[f"{acol}_{gcol}_mean"] + epsilon)
        test[f"{acol}_to_{gcol}_rel"] = test[acol] / (test[f"{acol}_{gcol}_mean"] + epsilon)

# ---------- Risk Buckets ----------
for df in [train, test]:
    df['INCOME_BUCKET'] = pd.qcut(df['Total Annual Income'], 5, labels=False, duplicates='drop')
    df['DEBT_BUCKET'] = pd.qcut(df['ACTUAL_DEBT_TO_INCOME_RATIO'], 5, labels=False, duplicates='drop')
    df['LOAN_BUCKET'] = pd.qcut(df['LOAN_TO_INCOME_RATIO'], 5, labels=False, duplicates='drop')

# ---------- Interactions ----------
for df in [train, test]:
    df['DEBT_X_EMP'] = df['ACTUAL_DEBT_TO_INCOME_RATIO'] * df['EMPLOYMENT_YEARS']
    df['DEBT_X_AGE'] = df['ACTUAL_DEBT_TO_INCOME_RATIO'] * df['Age']
    df['INCOME_X_EMP'] = df['Total Annual Income'] * df['EMPLOYMENT_YEARS']
    df['INCOME_X_RENT'] = df['Total Annual Income'] / (df['Rent Burden Amount'] + 1)

# ---------- Label Encoding ----------
cat_cols = [
    'Major Media Code', 'Internet Details', 'Reception Type Category', 'Gender',
    'Single/Married Status', 'Residence Type', 'Employment Type',
    'Industry Type', 'Company Size Category', 'AGE_BUCKET', 
    'INCOME_BUCKET', 'DEBT_BUCKET', 'LOAN_BUCKET'
]

for col in cat_cols:
    if col in train.columns:
        le = LabelEncoder()
        full = pd.concat([train[col].astype(str), test[col].astype(str)], axis=0)
        le.fit(full)
        train[col] = le.transform(train[col].astype(str))
        test[col] = le.transform(test[col].astype(str))

# ---------- Feature Selection ----------
drop_cols = [TARGET, 'Application Date', 'Date of Birth', 'Application Time']
features = [c for c in train.columns if c not in drop_cols]

X, y = train[features], train[TARGET]
X_test = test[features]

# ---------- Optuna Hyperparameter Optimization ----------
def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.08),
        'max_depth': trial.suggest_int("max_depth", 5, 10),
        'min_child_weight': trial.suggest_int("min_child_weight", 5, 50),
        'subsample': trial.suggest_float("subsample", 0.6, 0.95),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.6, 0.95),
        'reg_lambda': trial.suggest_float("reg_lambda", 1, 8),
        'reg_alpha': trial.suggest_float("reg_alpha", 0, 5),
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'random_state': 42,
    }

    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(X))

    for tr_idx, val_idx in folds.split(X, y):
        X_train, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[tr_idx], y.iloc[val_idx]
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=2000,
            evals=[(dval, "valid")],
            early_stopping_rounds=100,
            verbose_eval=False,
        )
        oof_preds[val_idx] = model.predict(dval)

    auc = roc_auc_score(y, oof_preds)
    return auc

print("\nüîç Starting Optuna Tuning (this may take ~10-20 mins)...")
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("‚úÖ Best Trial:", study.best_trial.params)

# ---------- Final Model Training ----------
params = study.best_trial.params
params.update({'objective': 'binary:logistic', 'eval_metric': 'auc', 'tree_method': 'gpu_hist'})

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
sub_preds = np.zeros(len(X_test))

for fold, (tr_idx, val_idx) in enumerate(folds.split(X, y)):
    print(f"\n===== Fold {fold+1} =====")
    X_train, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    dtest = xgb.DMatrix(X_test)

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=4000,
        evals=[(dval, "valid")],
        early_stopping_rounds=100,
        verbose_eval=250
    )

    oof_preds[val_idx] = model.predict(dval)
    sub_preds += model.predict(dtest) / folds.n_splits

auc = roc_auc_score(y, oof_preds)
print(f"\n‚úÖ Final OOF ROC-AUC: {auc:.4f}")

sample_submission[TARGET] = sub_preds
sample_submission.to_csv("submission_xgb_optuna.csv", index=False)
print("‚úÖ Submission saved as: submission_xgb_optuna.csv")


[I 2025-11-08 09:25:44,226] A new study created in memory with name: no-name-d61ce1c1-fbe2-4a10-ad56-1d338c824105



üîç Starting Optuna Tuning (this may take ~10-20 mins)...


[I 2025-11-08 09:25:50,349] Trial 0 finished with value: 0.6637539077937014 and parameters: {'learning_rate': 0.0564987503288643, 'max_depth': 10, 'min_child_weight': 20, 'subsample': 0.8692801340772867, 'colsample_bytree': 0.691709929961124, 'reg_lambda': 7.366751111153972, 'reg_alpha': 1.2594482709139783}. Best is trial 0 with value: 0.6637539077937014.
[I 2025-11-08 09:25:56,366] Trial 1 finished with value: 0.672389278295673 and parameters: {'learning_rate': 0.025227122518013158, 'max_depth': 6, 'min_child_weight': 39, 'subsample': 0.7176590325573209, 'colsample_bytree': 0.7775296806773826, 'reg_lambda': 7.37894259718327, 'reg_alpha': 0.24023856618309136}. Best is trial 1 with value: 0.672389278295673.
[I 2025-11-08 09:26:01,312] Trial 2 finished with value: 0.6647445071041147 and parameters: {'learning_rate': 0.06449243596292285, 'max_depth': 8, 'min_child_weight': 13, 'subsample': 0.611601155385176, 'colsample_bytree': 0.7550462875331687, 'reg_lambda': 5.41981332372733, 'reg_alph

‚úÖ Best Trial: {'learning_rate': 0.010314659908990718, 'max_depth': 6, 'min_child_weight': 20, 'subsample': 0.7379285740580522, 'colsample_bytree': 0.7907747042491868, 'reg_lambda': 1.1366615913001112, 'reg_alpha': 0.352491328161354}

===== Fold 1 =====
[0]	valid-auc:0.62512
[250]	valid-auc:0.67450
[500]	valid-auc:0.67990
[750]	valid-auc:0.68161
[971]	valid-auc:0.68134

===== Fold 2 =====
[0]	valid-auc:0.62194
[250]	valid-auc:0.66731
[500]	valid-auc:0.67076
[750]	valid-auc:0.67138
[892]	valid-auc:0.67136

===== Fold 3 =====
[0]	valid-auc:0.62919
[250]	valid-auc:0.66611
[500]	valid-auc:0.66941
[750]	valid-auc:0.67026
[1000]	valid-auc:0.67085
[1216]	valid-auc:0.67104

===== Fold 4 =====
[0]	valid-auc:0.62061
[250]	valid-auc:0.66154
[500]	valid-auc:0.66801
[750]	valid-auc:0.67032
[1000]	valid-auc:0.67115
[1250]	valid-auc:0.67190
[1500]	valid-auc:0.67220
[1634]	valid-auc:0.67195

===== Fold 5 =====
[0]	valid-auc:0.63143
[250]	valid-auc:0.66897
[500]	valid-auc:0.67314
[750]	valid-auc:0.673