In [None]:
!pip install catboost
!pip install xgboost
!pip install lightgbm

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the three competition files; the `id` column is dropped from the train and test frames.
train_raw = pd.read_csv('train.csv').drop(columns="id")
test_raw = pd.read_csv('test.csv').drop(columns="id")
sample_submission = pd.read_csv('sample_submission.csv')

# Show the raw training frame as the cell output.
train_raw

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,26.0,34427.0,RENT,5.946998,DEBTCONSOLIDATION,E,19270.0,11.272717,0.193133,Y,4.0,1
1,32.0,91102.0,MORTGAGE,8.527473,VENTURE,B,19815.0,12.831859,0.183127,N,4.0,0
2,25.0,25847.0,RENT,4.919103,PERSONAL,D,10043.0,15.532120,0.241554,Y,3.0,0
3,22.0,32863.0,RENT,3.427904,EDUCATION,,12256.0,9.877151,0.198320,N,4.0,0
4,24.0,50740.0,RENT,2.842202,EDUCATION,C,6412.0,13.465822,0.185754,Y,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
39545,30.0,197453.0,MORTGAGE,13.893697,HOMEIMPROVEMENT,C,5422.0,16.498766,0.130787,N,7.0,0
39546,21.0,77450.0,MORTGAGE,6.081810,EDUCATION,B,15388.0,10.710760,0.193614,N,2.0,0
39547,29.0,47254.0,RENT,5.898234,PERSONAL,C,19525.0,11.827926,0.332164,Y,7.0,1
39548,23.0,24691.0,RENT,4.929896,PERSONAL,D,7148.0,16.854522,0.231152,Y,3.0,1


In [None]:
from sklearn.impute import SimpleImputer
import pandas as pd

def fill_missing_values(df):
    """Impute missing values in ``df`` in place and return it.

    Object/category columns are filled with their most frequent value and
    numeric columns with their median. Both imputers are fitted on ``df``
    itself, so every frame passed in is imputed from its own statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place; pass a copy to keep the
        original untouched.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    cats = df.select_dtypes(include=['object', 'category']).columns
    nums = df.select_dtypes(include=['number']).columns

    # SimpleImputer refuses input with zero columns, so skip a group that is empty.
    if len(cats) > 0:
        cat_imputer = SimpleImputer(strategy='most_frequent')
        df[cats] = cat_imputer.fit_transform(df[cats])

    if len(nums) > 0:
        num_imputer = SimpleImputer(strategy='median')
        df[nums] = num_imputer.fit_transform(df[nums])

    return df

In [None]:
import pandas as pd
import numpy as np

def process_x(df):
    """Build the model feature matrix from a loan frame.

    Works on a copy of ``df`` and returns it with:

    * quantile bins of ``person_income`` (7 bins) and ``loan_int_rate`` (5 bins);
    * loan grades E, F and G merged into D;
    * ratio / product features built from loan amount, rate, income, age and
      employment length;
    * flags derived from ``person_home_ownership`` being RENT or MORTGAGE;
    * string interaction columns between home ownership, loan intent and grade;
    * every object/category column one-hot encoded (first level dropped) and
      boolean columns cast to int.

    The quantile edges and the median/mean used for the flags are computed on
    the frame passed in, so rows that must share them have to be processed in
    one call.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the person_* and loan_* columns referenced below.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is left unchanged.
    """
    df = df.copy()

    # Feature engineering. Original author's note: adding these features did not
    # improve the score, but they are kept so they can still be tested.
    df['income_bin'] = pd.qcut(df['person_income'], 7, labels=False)
    df['rate_bin'] = pd.qcut(df['loan_int_rate'], 5, labels=False)
    df['loan_grade'] = df['loan_grade'].replace({'E': 'D', 'F': 'D', 'G': 'D'})

    df['interest_to_income_ratio'] = (df['loan_amnt'] * df['loan_int_rate']) / df['person_income']
    df['loan_to_income'] = ((df['loan_amnt'] / df['person_income']) - df['loan_percent_income'])
    df['age_income_interaction'] = (df['person_age'] * df['person_income'])
    df['loan_to_emp_length_ratio'] = (df['loan_amnt'] / df['person_emp_length'])
    df['monthly_income'] = df['person_income'] / 12
    df["emp_lenght_times_lpi"] = df["person_emp_length"] * df["loan_percent_income"]
    df["dti_ratio"] = (df["loan_amnt"] * df["loan_int_rate"] / 12) / df["monthly_income"]

    # A zero denominator (e.g. no employment history) yields +/-inf. Store it as NaN
    # so downstream learners see an ordinary missing value instead of an infinity.
    ratio_cols = ['interest_to_income_ratio', 'loan_to_income',
                  'loan_to_emp_length_ratio', 'dti_ratio']
    df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan)

    df["has_fixed_debt"] = np.where(df["person_home_ownership"].isin(['RENT', 'MORTGAGE']), 1, 0)
    df['high_interest_fixed_debt'] = ((df['has_fixed_debt'] == 1) & (df['loan_int_rate'] > df['loan_int_rate'].median())).astype(int)
    df['bad_percent_fixed_debt'] = ((df['has_fixed_debt'] == 1) & (df['loan_percent_income'] < df['loan_percent_income'].mean())).astype(int)

    df["grade_home_interaction"] = df['person_home_ownership'].astype(str) + '_' + df['loan_grade'].astype(str)
    df["grade_intent_interaction"] = df['loan_intent'].astype(str) + '_' + df['loan_grade'].astype(str)
    df["home_intent_interaction"] = df['person_home_ownership'].astype(str) + '_' + df['loan_intent'].astype(str)

    # One-hot encode whatever is still non-numeric, including the interaction strings.
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

    # get_dummies may emit bool columns; cast them to 0/1 integers.
    bool_cols = df.select_dtypes(include=['bool']).columns
    df[bool_cols] = df[bool_cols].astype(int)

    return df


In [None]:
# Impute each split on its own. The target is removed first so it never passes
# through the imputer or the feature pipeline (the labels are read from train_raw later).
train_flag = fill_missing_values(train_raw.drop(columns="loan_status"))
test_flag = fill_missing_values(test_raw.drop(columns="loan_status", errors="ignore"))

# Tag the rows, then process both splits together so the quantile bins and
# one-hot columns produced by process_x are identical for train and test.
train_flag['is_train'] = 1
test_flag['is_train'] = 0
combined = pd.concat([train_flag, test_flag], ignore_index=True)

combined_processed = process_x(combined)

train = combined_processed[combined_processed['is_train'] == 1].drop(columns=['is_train'])
test = combined_processed[combined_processed['is_train'] == 0].drop(columns=['is_train'])

# Splitting the combined frame back must not lose or duplicate rows.
assert len(train) == len(train_raw) and len(test) == len(test_raw)

train

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,income_bin,rate_bin,interest_to_income_ratio,...,home_intent_interaction_OWN_HOMEIMPROVEMENT,home_intent_interaction_OWN_MEDICAL,home_intent_interaction_OWN_PERSONAL,home_intent_interaction_OWN_VENTURE,home_intent_interaction_RENT_DEBTCONSOLIDATION,home_intent_interaction_RENT_EDUCATION,home_intent_interaction_RENT_HOMEIMPROVEMENT,home_intent_interaction_RENT_MEDICAL,home_intent_interaction_RENT_PERSONAL,home_intent_interaction_RENT_VENTURE
0,26.0,34427.0,5.946998,19270.0,11.272717,0.193133,4.0,1,2,6.309735,...,0,0,0,0,1,0,0,0,0,0
1,32.0,91102.0,8.527473,19815.0,12.831859,0.183127,4.0,6,3,2.790974,...,0,0,0,0,0,0,0,0,0,0
2,25.0,25847.0,4.919103,10043.0,15.532120,0.241554,3.0,0,4,6.035094,...,0,0,0,0,0,0,0,0,1,0
3,22.0,32863.0,3.427904,12256.0,9.877151,0.198320,4.0,1,1,3.683607,...,0,0,0,0,0,1,0,0,0,0
4,24.0,50740.0,2.842202,6412.0,13.465822,0.185754,2.0,3,3,1.701672,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39545,30.0,197453.0,13.893697,5422.0,16.498766,0.130787,7.0,6,4,0.453051,...,0,0,0,0,0,0,0,0,0,0
39546,21.0,77450.0,6.081810,15388.0,10.710760,0.193614,2.0,5,2,2.128046,...,0,0,0,0,0,0,0,0,0,0
39547,29.0,47254.0,5.898234,19525.0,11.827926,0.332164,7.0,2,2,4.887211,...,0,0,0,0,0,0,0,0,1,0
39548,23.0,24691.0,4.929896,7148.0,16.854522,0.231152,3.0,0,4,4.879354,...,0,0,0,0,0,0,0,0,1,0


In [None]:
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

def _best_macro_f1_threshold(y_true, probs, default_threshold=0.5):
    """Return ``(threshold, macro_f1)`` with the best macro F1 on a 0.00-1.00 grid (step 0.01).

    Only a strictly better score replaces the current best, so ties keep the
    lowest threshold. ``default_threshold`` is returned if no threshold scores above 0.
    """
    best_f1 = 0
    best_threshold = default_threshold
    for t in np.arange(0.0, 1.01, 0.01):
        score = f1_score(y_true, (probs >= t).astype(int), average='macro')
        if score > best_f1:
            best_f1 = score
            best_threshold = t
    return best_threshold, best_f1


def run_cv_training(model_name, params, X, X_test, y):
    """Train one base model with stratified 5-fold CV and record its predictions.

    The estimator is taken from the module-level ``BASE_MODELS`` dict, configured
    with ``params`` and refitted on every fold. Out-of-fold probabilities are
    written to ``OOF_PREDS[model_name]`` and the fold-averaged test probabilities
    to ``BASE_PREDS[model_name]``. Per-fold and overall AUC / macro-F1 figures are
    printed.

    Parameters
    ----------
    model_name : str
        ``'cb'``, ``'xgb'`` or ``'lgb'``.
    params : dict
        Keyword arguments forwarded to ``model.set_params``.
    X, X_test : pandas.DataFrame
        Training and test feature matrices with identical columns.
    y : pandas.Series
        Binary target, positionally aligned with ``X``.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``model_name`` is missing from ``BASE_MODELS``.
    ValueError
        If ``model_name`` has no training branch in this function.

    Notes
    -----
    Each fold's threshold is tuned on the same validation fold it is scored on,
    so the per-fold "Best F1" figures are optimistic.
    """
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=6969)
    n_splits = skf.get_n_splits()

    test_preds = np.zeros(len(X_test))
    oof_preds = np.zeros(len(y))
    oof_targets = np.array(y)

    auc_scores = []
    f1_scores = []

    # Raw object columns (if any) are passed to CatBoost as categorical features.
    cat_list = X.select_dtypes(include=['object']).columns.tolist()

    model = BASE_MODELS[model_name]
    if model_name not in ('cb', 'xgb', 'lgb'):
        # Fail before any training instead of hitting an unbound y_pred_val later.
        raise ValueError(f"Unsupported model_name {model_name!r}; expected 'cb', 'xgb' or 'lgb'")
    model.set_params(**params)

    for n_fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[valid_idx], y.iloc[valid_idx]

        # ===== CatBoost ====
        if model_name == 'cb':
            train_set = Pool(data=X_train, label=y_train, cat_features=cat_list)
            valid_set = Pool(data=X_val, label=y_val, cat_features=cat_list)

            model.fit(train_set, eval_set=valid_set,
                      early_stopping_rounds=100, verbose=0,
                      use_best_model=True)
            # Predict through Pools for both validation and test so categorical
            # handling is the same in the two places.
            y_pred_val = model.predict_proba(valid_set)[:, 1]
            test_preds += model.predict_proba(Pool(X_test, cat_features=cat_list))[:, 1] / n_splits

        # ===== XGBoost ====
        elif model_name == 'xgb':
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                      verbose=False)
            best_range = (0, model.best_iteration + 1)
            y_pred_val = model.predict_proba(X_val, iteration_range=best_range)[:, 1]
            test_preds += model.predict_proba(X_test, iteration_range=best_range)[:, 1] / n_splits

        # ===== LightGBM ====
        elif model_name == 'lgb':
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
            y_pred_val = model.predict_proba(X_val, num_iteration=model.best_iteration_)[:, 1]
            test_preds += model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1] / n_splits

        # Per-fold threshold tuning (reported only; not reused for the test set).
        best_threshold, best_f1 = _best_macro_f1_threshold(y_val, y_pred_val, default_threshold=0)

        oof_preds[valid_idx] = y_pred_val
        f1_scores.append(best_f1)

        auc = roc_auc_score(y_val, y_pred_val)
        auc_scores.append(auc)

        print(f'Fold {n_fold + 1} AUC: {auc:.5f}, Best F1: {best_f1:.5f}, Threshold: {best_threshold}')
        print('-' * 50)

    # Global threshold tuning on the full set of out-of-fold predictions.
    best_threshold, best_f1 = _best_macro_f1_threshold(oof_targets, oof_preds)

    print(f'AUC: {np.mean(auc_scores):.5f} +/- {np.std(auc_scores):.3f}')
    print(f'Average Fold F1: {np.mean(f1_scores):.5f} +/- {np.std(f1_scores):.3f}')
    print("Best global F1:", best_f1)
    print("Best global threshold:", best_threshold)

    OOF_PREDS[model_name] = oof_preds.copy()
    BASE_PREDS[model_name] = test_preds.copy()

In [None]:
# Seed shared by the model configurations and the stacking step.
RANDOM_STATE = 6969

# Short names of the base learners, in the order they are stacked.
BASE_MODEL_LIST = ['cb', 'xgb', 'lgb']

# Unconfigured estimators; run_cv_training applies the hyper-parameters.
BASE_MODELS = {
    'cb': CatBoostClassifier(),
    'xgb': XGBClassifier(),
    'lgb': LGBMClassifier(),
}

# Filled by run_cv_training: averaged test probabilities and out-of-fold probabilities.
BASE_PREDS = dict.fromkeys(BASE_MODEL_LIST)
OOF_PREDS = dict.fromkeys(BASE_MODEL_LIST)

# Working copies of the feature matrices, plus the target taken from the raw training frame.
X = train.copy()
X_test = test.copy()
y = train_raw["loan_status"]

## CatBoost

In [None]:
# CatBoost hyper-parameters; run_cv_training applies them through set_params().
cat_params = dict(
    depth=8,
    learning_rate=0.19,
    bagging_temperature=0.45,
    l2_leaf_reg=6,
    loss_function='Logloss',
    iterations=1000,
    grow_policy='Lossguide',
    eval_metric='AUC',
    random_seed=RANDOM_STATE,
    verbose=0,
)

# Replace the registry entry with a fresh, unfitted estimator.
BASE_MODELS['cb'] = CatBoostClassifier()

In [None]:
# Run the 5-fold CV for CatBoost; predictions land in OOF_PREDS['cb'] and BASE_PREDS['cb'].
run_cv_training('cb', cat_params, X, X_test, y)

Fold 1 AUC: 0.88607, Best F1: 0.78998, Threshold: 0.38
--------------------------------------------------
Fold 2 AUC: 0.88362, Best F1: 0.78694, Threshold: 0.4
--------------------------------------------------
Fold 3 AUC: 0.88824, Best F1: 0.78800, Threshold: 0.4
--------------------------------------------------
Fold 4 AUC: 0.88600, Best F1: 0.78320, Threshold: 0.38
--------------------------------------------------
Fold 5 AUC: 0.89611, Best F1: 0.79414, Threshold: 0.43
--------------------------------------------------
AUC: 0.88801 +/- 0.004
Average Fold F1: 0.78845 +/- 0.004
Best global F1: 0.787559051230929
Best global threshold: 0.38


In [None]:
# Turn the fold-averaged CatBoost test probabilities into labels with a fixed 0.34 cut-off.
# NOTE(review): 0.34 is hard-coded and is not the threshold printed by run_cv_training — confirm it is intentional.
# NOTE(review): the XGBoost and LightGBM submission cells write to this same file name.
cb_labels = (BASE_PREDS['cb'] > 0.34).astype(int)
submission = sample_submission.assign(loan_status=cb_labels)
submission.to_csv('cpe232_v9.csv', index=False)

# Show the predicted class balance, then the submission itself.
display(submission['loan_status'].value_counts())
display(submission)

Unnamed: 0_level_0,count
loan_status,Unnamed: 1_level_1
0,12171
1,4779


Unnamed: 0,id,loan_status
0,39550,1
1,39551,0
2,39552,1
3,39553,0
4,39554,0
...,...,...
16945,56495,1
16946,56496,0
16947,56497,0
16948,56498,0


## XGBoost

In [None]:
# XGBoost hyper-parameters; run_cv_training applies them through set_params().
# `use_label_encoder` was removed from this dict: current XGBoost releases no longer
# use it and only emit a "parameters are not used" warning when it is passed.
xgb_params = {
    'max_depth':6,                         # Default: 6
    'learning_rate': 0.18,                 # Default: 0.3
    'subsample': 0.5,                      # Default: 1.0
    'colsample_bytree': 0.5,               # Default: 1.0
    'reg_lambda': 2,                       # Default: 1.0
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'n_estimators': 1500,
    'tree_method': 'hist',
    'random_state': RANDOM_STATE,
    'early_stopping_rounds' : 300,         # stop when validation AUC stalls for 300 rounds
}

# Replace the registry entry with a fresh, unfitted estimator.
BASE_MODELS['xgb'] = XGBClassifier()

In [None]:
# Run the 5-fold CV for XGBoost; predictions land in OOF_PREDS['xgb'] and BASE_PREDS['xgb'].
run_cv_training('xgb', xgb_params, X, X_test, y)

Fold 1 AUC: 0.88330, Best F1: 0.78501, Threshold: 0.4
--------------------------------------------------
Fold 2 AUC: 0.88176, Best F1: 0.78283, Threshold: 0.35000000000000003
--------------------------------------------------
Fold 3 AUC: 0.88682, Best F1: 0.78990, Threshold: 0.38
--------------------------------------------------
Fold 4 AUC: 0.88403, Best F1: 0.78073, Threshold: 0.38
--------------------------------------------------
Fold 5 AUC: 0.89475, Best F1: 0.79097, Threshold: 0.42
--------------------------------------------------
AUC: 0.88613 +/- 0.005
Average Fold F1: 0.78589 +/- 0.004
Best global F1: 0.7853423833780069
Best global threshold: 0.38


In [None]:
# Turn the fold-averaged XGBoost test probabilities into labels with a fixed 0.34 cut-off.
# NOTE(review): 0.34 is hard-coded and is not the threshold printed by run_cv_training — confirm it is intentional.
# NOTE(review): the CatBoost and LightGBM submission cells write to this same file name.
xgb_labels = (BASE_PREDS['xgb'] > 0.34).astype(int)
submission = sample_submission.assign(loan_status=xgb_labels)
submission.to_csv('cpe232_v9.csv', index=False)

# Show the predicted class balance, then the submission itself.
display(submission['loan_status'].value_counts())
display(submission)

Unnamed: 0_level_0,count
loan_status,Unnamed: 1_level_1
0,12158
1,4792


Unnamed: 0,id,loan_status
0,39550,1
1,39551,0
2,39552,1
3,39553,0
4,39554,0
...,...,...
16945,56495,1
16946,56496,0
16947,56497,0
16948,56498,0


## LightGBM

In [None]:
# LightGBM hyper-parameters; run_cv_training applies them through set_params().
lgb_params = {
    'max_depth': 7,                        # Default: -1 (no limit)
    'learning_rate': 0.19,                 # Default: 0.1
    'num_leaves': 64,                      # Default: 31
    'bagging_fraction': 0.5,               # Default: 1.0
    # bagging_fraction is ignored while bagging_freq is 0 (the default);
    # resample on every iteration so the 0.5 row subsampling actually applies.
    # This changes the fitted model, so re-run the CV cell after editing.
    'bagging_freq': 1,                     # Default: 0 (bagging disabled)
    'feature_fraction': 0.8,               # Default: 1.0
    'lambda_l2': 5.5,                      # Default: 0.0
    'objective': 'binary',
    'metric': 'auc',
    'n_estimators': 1000,
    'random_state': RANDOM_STATE,
    'verbose': -1,
    'early_stopping_rounds' : 200,
}

# Replace the registry entry with a fresh, unfitted estimator.
BASE_MODELS['lgb'] =  LGBMClassifier()

In [None]:
# Run the 5-fold CV for LightGBM; predictions land in OOF_PREDS['lgb'] and BASE_PREDS['lgb'].
run_cv_training('lgb', lgb_params, X, X_test, y)

Fold 1 AUC: 0.88650, Best F1: 0.78963, Threshold: 0.37
--------------------------------------------------
Fold 2 AUC: 0.88365, Best F1: 0.78314, Threshold: 0.43
--------------------------------------------------
Fold 3 AUC: 0.88687, Best F1: 0.78470, Threshold: 0.35000000000000003
--------------------------------------------------
Fold 4 AUC: 0.88461, Best F1: 0.78362, Threshold: 0.39
--------------------------------------------------
Fold 5 AUC: 0.89444, Best F1: 0.79134, Threshold: 0.38
--------------------------------------------------
AUC: 0.88721 +/- 0.004
Average Fold F1: 0.78648 +/- 0.003
Best global F1: 0.7857047599545819
Best global threshold: 0.38


In [None]:
# Turn the fold-averaged LightGBM test probabilities into labels with a fixed 0.34 cut-off.
# NOTE(review): 0.34 is hard-coded and is not the threshold printed by run_cv_training — confirm it is intentional.
# NOTE(review): the CatBoost and XGBoost submission cells write to this same file name.
lgb_labels = (BASE_PREDS['lgb'] > 0.34).astype(int)
submission = sample_submission.assign(loan_status=lgb_labels)
submission.to_csv('cpe232_v9.csv', index=False)

# Show the predicted class balance, then the submission itself.
display(submission['loan_status'].value_counts())
display(submission)

In [None]:
from IPython.display import display as disply

# This cell used `test_preds_cat` and `best_threshold`, which no other cell defines at
# notebook scope, so it failed on a fresh kernel. Both are rebuilt here from the stored
# CatBoost predictions.
# NOTE(review): assumes the missing names meant the CatBoost test probabilities and the
# OOF-tuned macro-F1 threshold — confirm.
cb_oof = OOF_PREDS['cb']
best_threshold = max(
    np.arange(0.0, 1.01, 0.01),
    # max() keeps the first maximum, i.e. the lowest threshold among ties.
    key=lambda t: f1_score(y, (cb_oof >= t).astype(int), average='macro'),
)

submission = sample_submission.copy()
submission['loan_status'] = (BASE_PREDS['cb'] > best_threshold).astype(int)
submission.to_csv('cpe232_v5.csv', index=False)
disply(submission)
disply(submission['loan_status'].value_counts())

Unnamed: 0,id,loan_status
0,39550,1
1,39551,0
2,39552,1
3,39553,0
4,39554,0
...,...,...
16945,56495,1
16946,56496,0
16947,56497,0
16948,56498,0


Unnamed: 0_level_0,count
loan_status,Unnamed: 1_level_1
0,12512
1,4438


## Stack

In [None]:
# Inspect the fold-averaged test-set probabilities stored for each base model.
BASE_PREDS

{'cb': array([0.952668  , 0.02386829, 0.75198889, ..., 0.1601671 , 0.24476729,
        0.04025672]),
 'xgb': array([0.96218523, 0.01784255, 0.679057  , ..., 0.17473232, 0.21112588,
        0.02902805]),
 'lgb': array([0.9660079 , 0.01690355, 0.66526342, ..., 0.18557648, 0.19235201,
        0.03524634])}

In [None]:
# Inspect the out-of-fold training probabilities stored for each base model.
OOF_PREDS

{'cb': array([0.78953656, 0.03287833, 0.8953748 , ..., 0.4815887 , 0.89224353,
        0.57729151]),
 'xgb': array([0.8046037 , 0.04251382, 0.80810815, ..., 0.57557589, 0.87484699,
        0.53949267]),
 'lgb': array([0.79323142, 0.03254623, 0.8882803 , ..., 0.5779397 , 0.87933127,
        0.62224227])}

In [None]:
# Inspect the registered estimators and the parameters currently set on them.
BASE_MODELS

{'cb': <catboost.core.CatBoostClassifier at 0x7ec56abeafd0>,
 'xgb': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=0.5, device=None, early_stopping_rounds=300,
               enable_categorical=False, eval_metric='auc', feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=0.18, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=6, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=1500, n_jobs=None,
               num_parallel_tree=None, random_state=6969, ...),
 'lgb': LGBMClassifier(bagging_fraction=0.5, early_stopping_rounds=200,
                feature_fraction=0.8, lambda_l2=5.5, learning_rate=0.19,
                max_d

In [None]:
# Meta-features: one column of out-of-fold probabilities per base model, in BASE_MODELS order.
oof_columns = [OOF_PREDS[model_key] for model_key in BASE_MODELS]
X_meta = np.column_stack(oof_columns)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd

# Candidate second-level (stacking) models, each trained on the base models' probabilities.
meta_models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# One column per base model: out-of-fold probabilities for training rows,
# fold-averaged probabilities for test rows.
X_meta = np.column_stack([OOF_PREDS[name] for name in BASE_MODELS.keys()])
X_meta_test = np.column_stack([BASE_PREDS[name] for name in BASE_MODELS.keys()])

# skf.split yields positional indices; a plain array makes the target lookup
# positional whatever index the Series `y` carries.
y_arr = np.asarray(y)

results = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

for name, meta_model in meta_models.items():
    print(f"\nTraining meta model: {name}")

    # Out-of-fold probabilities of the meta model itself, used to pick its threshold.
    val_probs = np.zeros(len(y_arr), dtype=float)

    for train_idx, val_idx in skf.split(X_meta, y_arr):
        meta_model.fit(X_meta[train_idx], y_arr[train_idx])
        val_probs[val_idx] = meta_model.predict_proba(X_meta[val_idx])[:, 1]

    # Grid-search the threshold that maximises macro F1 (ties keep the lowest one).
    best_f1 = 0
    best_thresh = 0.5
    for t in np.arange(0.0, 1.01, 0.01):
        pred_val = (val_probs >= t).astype(int)
        score = f1_score(y_arr, pred_val, average='macro')
        if score > best_f1:
            best_f1 = score
            best_thresh = t

    # Refit on every training row before predicting the test set.
    meta_model.fit(X_meta, y_arr)
    prob_test = meta_model.predict_proba(X_meta_test)[:, 1]
    final_preds = (prob_test >= best_thresh).astype(int)

    results.append({
        "Meta Model": name,
        "Best Threshold": best_thresh,
        "OOF Macro F1": best_f1,
        "Final Test Predictions": final_preds
    })

# Summary table without the bulky prediction arrays.
pd.DataFrame([{k: v for k, v in r.items() if k != 'Final Test Predictions'} for r in results])


Training meta model: LogisticRegression

Training meta model: RandomForest

Training meta model: GradientBoosting


Unnamed: 0,Meta Model,Best Threshold,OOF Macro F1
0,LogisticRegression,0.32,0.789073
1,RandomForest,0.4,0.769728
2,GradientBoosting,0.41,0.787119


In [None]:
# Inspect the full per-meta-model results, including the test prediction arrays.
results

[{'Meta Model': 'LogisticRegression',
  'Best Threshold': 0.32,
  'OOF Macro F1': 0.7890726939884745,
  'Final Test Predictions': array([1, 0, 1, ..., 0, 0, 0])},
 {'Meta Model': 'RandomForest',
  'Best Threshold': 0.4,
  'OOF Macro F1': 0.7697280526444339,
  'Final Test Predictions': array([1, 0, 1, ..., 0, 0, 0])},
 {'Meta Model': 'GradientBoosting',
  'Best Threshold': 0.41000000000000003,
  'OOF Macro F1': 0.7871194049836421,
  'Final Test Predictions': array([1, 0, 1, ..., 0, 0, 0])}]

In [None]:
# Choose the meta model by its out-of-fold macro F1 instead of a hard-coded list position.
# The old code read results[0] (the LogisticRegression entry) into a variable named `result_gb`.
best_result = max(results, key=lambda r: r['OOF Macro F1'])
stack_preds = pd.Series(best_result['Final Test Predictions'])
result_gb = stack_preds  # legacy name kept for any later cell; not necessarily GradientBoosting

submission = sample_submission.copy()
# Assign the raw array so values are placed by position, not aligned on index labels.
submission['loan_status'] = best_result['Final Test Predictions']
submission.to_csv('cpe232_v10.csv', index=False)
display(stack_preds.value_counts())
display(submission)

Unnamed: 0,count
0,12476
1,4474


Unnamed: 0,id,loan_status
0,39550,1
1,39551,0
2,39552,1
3,39553,0
4,39554,0
...,...,...
16945,56495,1
16946,56496,0
16947,56497,0
16948,56498,0
