# Imports

In [1]:
# General

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Hyperparameter tuning
import optuna

# Models
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# Data processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

# Data preparation

In [2]:
# Data input

try:
    data = pd.read_csv('../data/train.csv')
    data_to_predict = pd.read_csv('../data/test.csv')
    data_ccrisk = pd.read_csv('../data/credit_risk_dataset.csv')

except:
    data = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv')
    data_to_predict = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')
    data_ccrisk = pd.read_csv('/kaggle/input/loan-approval-prediction/credit_risk_dataset.csv')


data_ccrisk.dropna(inplace=True)
data_ccrisk.drop_duplicates(inplace=True)

X_old = data.drop(['loan_status'], axis=1)
y_old = data['loan_status']

# Merge the dataframes
data_no_id = data.drop(['id'], axis=1)
merged_data = pd.concat([data_no_id, data_ccrisk], ignore_index=True)

X = merged_data.drop(['loan_status'], axis=1)
y = merged_data['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle=True, test_size=0.2, random_state=2024)

In [3]:
from imblearn.over_sampling import SMOTENC

ratio = 0.25
k_neighbors = 8
random_state = 2024

categorical_columns = X.select_dtypes(include=['object']).columns

sm = SMOTENC(sampling_strategy=ratio, categorical_features=[X_train.columns.get_loc(col) for col in categorical_columns], random_state=random_state, k_neighbors=k_neighbors)

X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [4]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

# Data processing
categorical_columns = X.select_dtypes(include=['object']).columns

categorical_ordinal = ['loan_grade']
categorical_onehot = categorical_columns.drop(categorical_ordinal)

log_columns = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt']
log_columns = log_columns

numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_columns.drop(log_columns)

def log_transform():
	return FunctionTransformer(np.log1p, validate=True)

preprocessor = ColumnTransformer(
	transformers=[
		('ordinal', OrdinalEncoder(), categorical_ordinal),
		('onehot', OneHotEncoder(), categorical_onehot),
		('scaler', StandardScaler(), numerical_columns),
		('log', log_transform(), log_columns)
	], remainder='passthrough'
	)

preprocessor.fit(X_train_res)

X_train_prep = preprocessor.transform(X_train_res)
X_test_prep = preprocessor.transform(X_test)

# Models

In [51]:
# Some parameters to control the training phase
n_trials_best_models = 500
timout_best_models = 3600*2

n_trial_mids = 250
timeout_mids = 3600*1

n_trial_average = 500
timout_average = 3600*1

# If testing, reduce the number of trials and timouts
testing = True

n_trial_bm_testing = 10
timeout_bm_testing = 60

## XGBoost

In [9]:
def xgb_objective(trial):
    # Hyperparameter search space
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 0.5),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 3.0),
        'eval_metric': 'auc',  # AUC is the metric
        'objective': 'binary:logistic'  # Use binary logistic, probability outputs
    }

    # Initialize the model
    model = xgb.XGBClassifier(**param)

    # Fit the model
    model.fit(X_train_prep, y_train_res,
              eval_set=[(X_test_prep, y_test)],  
              verbose=False)

    # Predict proba for validation set
    y_pred_prob = model.predict_proba(X_test_prep)[:, 1]

    auc = roc_auc_score(y_test, y_pred_prob)
 
    return auc

# Study object to run the optimization. I want to maximize AUC
if testing:
    xgb_study = optuna.create_study(direction='maximize')
    xgb_study.optimize(xgb_objective, n_trials=n_trial_bm_testing, timeout=timeout_bm_testing)

else:
    xgb_study = optuna.create_study(direction='maximize')
    xgb_study.optimize(xgb_objective, n_trials=n_trials_best_models, timeout=timout_best_models)

print(f"Best trial: {xgb_study.best_trial.params}")
print(f"Best value: {xgb_study.best_value}")

[I 2024-10-23 18:04:16,576] A new study created in memory with name: no-name-fb703848-58db-4b94-bf04-828ef268341f
[I 2024-10-23 18:04:25,690] Trial 0 finished with value: 0.9555008982245218 and parameters: {'max_depth': 9, 'learning_rate': 0.07360155984919532, 'n_estimators': 441, 'subsample': 0.9260887353761973, 'colsample_bytree': 0.8248525983134489, 'gamma': 0.3333339662288516, 'lambda': 0.009932006636130397, 'alpha': 0.002399367504058725, 'scale_pos_weight': 1.6159669655690991}. Best is trial 0 with value: 0.9555008982245218.
[I 2024-10-23 18:04:32,022] Trial 1 finished with value: 0.956354178368465 and parameters: {'max_depth': 7, 'learning_rate': 0.14202537705856288, 'n_estimators': 348, 'subsample': 0.8131256696625251, 'colsample_bytree': 0.535530141944325, 'gamma': 0.13956326880224473, 'lambda': 0.00019854579183898054, 'alpha': 2.6474958133166025e-07, 'scale_pos_weight': 1.4984581354551698}. Best is trial 1 with value: 0.956354178368465.
[I 2024-10-23 18:04:38,527] Trial 2 fini

Best trial: {'max_depth': 8, 'learning_rate': 0.037634759076473824, 'n_estimators': 288, 'subsample': 0.7975872516319609, 'colsample_bytree': 0.6750758577718527, 'gamma': 0.12216434906052287, 'lambda': 8.418992434485555e-08, 'alpha': 5.533189128050602e-05, 'scale_pos_weight': 1.0997425873555955}
Best value: 0.9568551495269616


In [10]:
xgb_best_params = xgb_study.best_trial.params
xgb_best_score = xgb_study.best_trial.value
xgb_best_score

0.9568551495269616

prev: 0.9576051262733373

## CatBoostClassifier

In [11]:
from catboost import CatBoostClassifier

def cat_objective(trial):

    # Define the hyperparameter search space

    param = {
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'iterations': trial.suggest_int('iterations', 200, 1000),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.0, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 3.0),
        'eval_metric': 'AUC',  
        'logging_level': 'Silent',  # Disable CatBoost output
        'task_type': 'CPU',  # Try using GPU
        'use_best_model': True
    }

    # Initialize the CatBoost model with the trial's parameters
    model = CatBoostClassifier(**param)

    # Fit the model
    model.fit(X_train_prep, y_train_res,
              eval_set=[(X_test_prep, y_test)],
              early_stopping_rounds=20,  # Early stopping to prevent overfitting
              verbose=False)

    # Get predictions and calculate AUC score
    y_pred_prob = model.predict_proba(X_test_prep)[:, 1]  # Probabilities for class 1

    auc = roc_auc_score(y_test, y_pred_prob)

    return auc

if testing:
    cat_study = optuna.create_study(direction='maximize')  # We want to maximize AUC
    cat_study.optimize(cat_objective, n_trials=n_trial_bm_testing, timeout=timeout_bm_testing)

else:    
    cat_study = optuna.create_study(direction='maximize')  # We want to maximize AUC
    cat_study.optimize(cat_objective, n_trials=n_trials_best_models, timeout=timout_best_models)

# Output the best trial
print(f"Best trial: {cat_study.best_trial.params}")
print(f"Best value: {cat_study.best_value}")

[I 2024-10-23 18:06:07,068] A new study created in memory with name: no-name-9ec584d9-6890-4f9e-96b1-99f458e51936
[I 2024-10-23 18:06:17,570] Trial 0 finished with value: 0.9537597653382517 and parameters: {'depth': 5, 'learning_rate': 0.08996854149093136, 'iterations': 904, 'l2_leaf_reg': 6.721671126122494e-08, 'border_count': 142, 'bagging_temperature': 0.4020817890576337, 'random_strength': 0.6361271595036573, 'scale_pos_weight': 1.886983299730991}. Best is trial 0 with value: 0.9537597653382517.
[I 2024-10-23 18:06:27,784] Trial 1 finished with value: 0.9499374093890083 and parameters: {'depth': 8, 'learning_rate': 0.0879470619189848, 'iterations': 713, 'l2_leaf_reg': 0.043750810712403006, 'border_count': 63, 'bagging_temperature': 0.7513996965002671, 'random_strength': 0.8525439050757616, 'scale_pos_weight': 2.847770020952189}. Best is trial 0 with value: 0.9537597653382517.
[I 2024-10-23 18:06:37,985] Trial 2 finished with value: 0.9526078347759421 and parameters: {'depth': 4, 'l

Best trial: {'depth': 4, 'learning_rate': 0.24945537583407523, 'iterations': 697, 'l2_leaf_reg': 0.003925736296382238, 'border_count': 109, 'bagging_temperature': 0.9371063682566577, 'random_strength': 0.6471049836104487, 'scale_pos_weight': 2.9530371906062545}
Best value: 0.9542627374450793


In [12]:
cat_best_params = cat_study.best_trial.params
cat_best_score = cat_study.best_trial.value
cat_best_score

0.9542627374450793

prev: 0.9601213839193278

## LightGBM

In [13]:
# Objective function for LightGBM

def lgb_objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
    }

    lgb_model = lgb.LGBMClassifier(**param)
  

    # Train the model
    lgb_model.fit(X_train_prep, y_train_res,
                  eval_set=[(X_test_prep, y_test)],
                  eval_metric='auc'
                  )

    # Predict and evaluate AUC
    y_pred_prob = lgb_model.predict_proba(X_test_prep)[:, 1]
    auc = roc_auc_score(y_test, y_pred_prob)

    return auc

# Optimize the objective function
if testing:
    lgb_study = optuna.create_study(direction='maximize')
    lgb_study.optimize(lgb_objective, n_trials=n_trial_bm_testing, timeout=timeout_bm_testing)
else:
    lgb_study = optuna.create_study(direction='maximize')
    lgb_study.optimize(lgb_objective, n_trials=n_trials_best_models, timeout=timout_best_models)

# Best parameters and AUC
print("Best LGBM Params: ", lgb_study.best_trial.params)
print("Best AUC for LGBM: ", lgb_study.best_value)

[I 2024-10-23 18:07:19,541] A new study created in memory with name: no-name-599f38cb-df10-4d39-a801-5516dc2fde19


[LightGBM] [Info] Number of positive: 14521, number of negative: 58086
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005492 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 72607, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.199994 -> initscore=-1.386329
[LightGBM] [Info] Start training from score -1.386329


[I 2024-10-23 18:07:31,108] Trial 0 finished with value: 0.9535987896370105 and parameters: {'learning_rate': 0.11442618846275615, 'num_leaves': 139, 'max_depth': 14, 'min_child_samples': 70, 'min_child_weight': 3.928256090886183, 'subsample': 0.9867013671863696, 'colsample_bytree': 0.9131699715913681, 'n_estimators': 719}. Best is trial 0 with value: 0.9535987896370105.


[LightGBM] [Info] Number of positive: 14521, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010063 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 72607, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.199994 -> initscore=-1.386329
[LightGBM] [Info] Start training from score -1.386329


[I 2024-10-23 18:07:32,877] Trial 1 finished with value: 0.9581407173899411 and parameters: {'learning_rate': 0.1402640588449942, 'num_leaves': 268, 'max_depth': 6, 'min_child_samples': 69, 'min_child_weight': 1.2277551636534187, 'subsample': 0.7817332880643983, 'colsample_bytree': 0.6478819344817279, 'n_estimators': 121}. Best is trial 1 with value: 0.9581407173899411.


[LightGBM] [Info] Number of positive: 14521, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005220 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 72607, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.199994 -> initscore=-1.386329
[LightGBM] [Info] Start training from score -1.386329


[I 2024-10-23 18:07:34,496] Trial 2 finished with value: 0.9566769467261835 and parameters: {'learning_rate': 0.281629695049827, 'num_leaves': 45, 'max_depth': 15, 'min_child_samples': 26, 'min_child_weight': 3.208149169301617, 'subsample': 0.8836744855837173, 'colsample_bytree': 0.930762566433275, 'n_estimators': 109}. Best is trial 1 with value: 0.9581407173899411.


[LightGBM] [Info] Number of positive: 14521, number of negative: 58086
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008194 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 72607, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.199994 -> initscore=-1.386329
[LightGBM] [Info] Start training from score -1.386329


[I 2024-10-23 18:07:45,778] Trial 3 finished with value: 0.9545168934029231 and parameters: {'learning_rate': 0.010456749078764302, 'num_leaves': 288, 'max_depth': 7, 'min_child_samples': 64, 'min_child_weight': 2.857623519551814, 'subsample': 0.7817074641793313, 'colsample_bytree': 0.8732160076969882, 'n_estimators': 793}. Best is trial 1 with value: 0.9581407173899411.


[LightGBM] [Info] Number of positive: 14521, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016263 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 72607, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.199994 -> initscore=-1.386329
[LightGBM] [Info] Start training from score -1.386329


[I 2024-10-23 18:07:55,992] Trial 4 finished with value: 0.9583559318148794 and parameters: {'learning_rate': 0.02939264687758996, 'num_leaves': 116, 'max_depth': 13, 'min_child_samples': 85, 'min_child_weight': 0.9307415317017197, 'subsample': 0.6827354402898024, 'colsample_bytree': 0.9117921553354442, 'n_estimators': 421}. Best is trial 4 with value: 0.9583559318148794.


[LightGBM] [Info] Number of positive: 14521, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 72607, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.199994 -> initscore=-1.386329
[LightGBM] [Info] Start training from score -1.386329


[I 2024-10-23 18:07:58,143] Trial 5 finished with value: 0.9373550555652694 and parameters: {'learning_rate': 0.02007877977100393, 'num_leaves': 277, 'max_depth': 6, 'min_child_samples': 72, 'min_child_weight': 3.5137527705097766, 'subsample': 0.7071014551771867, 'colsample_bytree': 0.9941967828532963, 'n_estimators': 141}. Best is trial 4 with value: 0.9583559318148794.


[LightGBM] [Info] Number of positive: 14521, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009199 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 72607, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.199994 -> initscore=-1.386329
[LightGBM] [Info] Start training from score -1.386329


[I 2024-10-23 18:08:02,431] Trial 6 finished with value: 0.9585626096496006 and parameters: {'learning_rate': 0.10268389765699315, 'num_leaves': 191, 'max_depth': 13, 'min_child_samples': 95, 'min_child_weight': 8.742569326460316, 'subsample': 0.8869985473198712, 'colsample_bytree': 0.8455755786784573, 'n_estimators': 237}. Best is trial 6 with value: 0.9585626096496006.


[LightGBM] [Info] Number of positive: 14521, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012064 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 72607, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.199994 -> initscore=-1.386329
[LightGBM] [Info] Start training from score -1.386329


[I 2024-10-23 18:08:07,141] Trial 7 finished with value: 0.9514176138769294 and parameters: {'learning_rate': 0.03027308063387036, 'num_leaves': 36, 'max_depth': 4, 'min_child_samples': 26, 'min_child_weight': 8.990112812471892, 'subsample': 0.6836101137794934, 'colsample_bytree': 0.7901640723890953, 'n_estimators': 518}. Best is trial 6 with value: 0.9585626096496006.


[LightGBM] [Info] Number of positive: 14521, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 72607, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.199994 -> initscore=-1.386329
[LightGBM] [Info] Start training from score -1.386329


[I 2024-10-23 18:08:08,810] Trial 8 finished with value: 0.9550618854247039 and parameters: {'learning_rate': 0.05794005518188282, 'num_leaves': 285, 'max_depth': 9, 'min_child_samples': 91, 'min_child_weight': 1.5867564010358266, 'subsample': 0.612244903506941, 'colsample_bytree': 0.7735080064388254, 'n_estimators': 103}. Best is trial 6 with value: 0.9585626096496006.


[LightGBM] [Info] Number of positive: 14521, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008847 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 72607, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.199994 -> initscore=-1.386329
[LightGBM] [Info] Start training from score -1.386329


[I 2024-10-23 18:08:12,617] Trial 9 finished with value: 0.959296188091606 and parameters: {'learning_rate': 0.05406903943020577, 'num_leaves': 182, 'max_depth': 15, 'min_child_samples': 80, 'min_child_weight': 7.741875978990014, 'subsample': 0.9004310250993646, 'colsample_bytree': 0.5279231872412558, 'n_estimators': 192}. Best is trial 9 with value: 0.959296188091606.


Best LGBM Params:  {'learning_rate': 0.05406903943020577, 'num_leaves': 182, 'max_depth': 15, 'min_child_samples': 80, 'min_child_weight': 7.741875978990014, 'subsample': 0.9004310250993646, 'colsample_bytree': 0.5279231872412558, 'n_estimators': 192}
Best AUC for LGBM:  0.959296188091606


In [14]:
lgb_best_params = lgb_study.best_trial.params
lgb_best_score = lgb_study.best_value
lgb_best_score

0.959296188091606

prev: 0.9607718460460408

## ExtraTreesClassifier

In [15]:
from sklearn.ensemble import ExtraTreesClassifier

def extratrees_objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0)
    }

    model = ExtraTreesClassifier(**param)
    score = cross_val_score(model, X_train_prep, y_train_res, cv=3, scoring='roc_auc').mean()

    return score

if testing:
    extratrees_study = optuna.create_study(direction='maximize')
    extratrees_study.optimize(extratrees_objective, n_trials=n_trial_bm_testing, timeout=timeout_bm_testing)
else:
    extratrees_study = optuna.create_study(direction='maximize')
    extratrees_study.optimize(extratrees_objective, n_trials=n_trial_mids, timeout=timeout_mids)

print('Best parameters for ExtraTrees:', extratrees_study.best_params)
print(f"Best value: {extratrees_study.best_value}")

[I 2024-10-23 18:08:22,177] A new study created in memory with name: no-name-bce30287-c879-4034-b9e3-58fca738a713
[I 2024-10-23 18:08:50,608] Trial 0 finished with value: 0.886681354359212 and parameters: {'n_estimators': 392, 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 8, 'max_features': 0.18397397321158238}. Best is trial 0 with value: 0.886681354359212.
[I 2024-10-23 18:11:24,070] Trial 1 finished with value: 0.939539157319549 and parameters: {'n_estimators': 367, 'max_depth': 13, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 0.9210298739764378}. Best is trial 1 with value: 0.939539157319549.


Best parameters for ExtraTrees: {'n_estimators': 367, 'max_depth': 13, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 0.9210298739764378}


In [16]:
extratrees_best_params = extratrees_study.best_params
extratrees_best_score = extratrees_study.best_value
extratrees_best_score

0.939539157319549

Prev: 0.9298212453316017

## HistGradientBoostingClassifier

In [17]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

def histgb_objective(trial):
    param = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_iter': trial.suggest_int('max_iter', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 50),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 50),
        'l2_regularization': trial.suggest_float('l2_regularization', 1e-5, 1.0, log=True)
    }

    model = HistGradientBoostingClassifier(**param)
    score = cross_val_score(model, X_train_prep, y_train_res, cv=3, scoring='roc_auc').mean()

    return score

if testing:
    histgb_study = optuna.create_study(direction='maximize')
    histgb_study.optimize(histgb_objective, n_trials=n_trial_bm_testing, timeout=timeout_bm_testing)

else:
    histgb_study = optuna.create_study(direction='maximize')
    histgb_study.optimize(histgb_objective, n_trials=n_trial_mids, timeout=timeout_mids)


print('Best parameters for HistGradientBoosting:', histgb_study.best_params)
print(f"Best value: {histgb_study.best_value}")

[I 2024-10-23 18:11:24,999] A new study created in memory with name: no-name-e2f38910-5db3-4b7c-a3a0-8ff0fd590ad0
[I 2024-10-23 18:11:35,026] Trial 0 finished with value: 0.9492844545433062 and parameters: {'learning_rate': 0.03539620256112014, 'max_iter': 222, 'max_depth': 13, 'min_samples_leaf': 30, 'max_leaf_nodes': 12, 'l2_regularization': 0.2446822369738013}. Best is trial 0 with value: 0.9492844545433062.
[I 2024-10-23 18:11:47,501] Trial 1 finished with value: 0.9571989694687696 and parameters: {'learning_rate': 0.07879253085012963, 'max_iter': 232, 'max_depth': 14, 'min_samples_leaf': 48, 'max_leaf_nodes': 34, 'l2_regularization': 0.013917754954993082}. Best is trial 1 with value: 0.9571989694687696.
[I 2024-10-23 18:11:57,678] Trial 2 finished with value: 0.9471263457196772 and parameters: {'learning_rate': 0.025840760559500884, 'max_iter': 183, 'max_depth': 7, 'min_samples_leaf': 47, 'max_leaf_nodes': 19, 'l2_regularization': 0.0018449053625077822}. Best is trial 1 with value

Best parameters for HistGradientBoosting: {'learning_rate': 0.07879253085012963, 'max_iter': 232, 'max_depth': 14, 'min_samples_leaf': 48, 'max_leaf_nodes': 34, 'l2_regularization': 0.013917754954993082}
Best value: 0.9571989694687696


In [18]:
histgb_best_params = histgb_study.best_trial.params
histgb_best_score = histgb_study.best_trial.value
histgb_best_score

0.9571989694687696

Prev: 0.9519987809872763

## KNN

In [19]:
from sklearn.neighbors import KNeighborsClassifier

def knn_objective(trial):
    param = {
        'n_neighbors': trial.suggest_int('n_neighbors', 3, 15),
        'leaf_size': trial.suggest_int('leaf_size', 20, 50),
        'p': trial.suggest_categorical('p', [1, 2]),  # Minkowski distance parameter
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance'])
    }

    model = KNeighborsClassifier(**param)
    score = cross_val_score(model, X_train_prep, y_train_res, cv=3, scoring='roc_auc').mean()

    return score

if testing:
    knn_study = optuna.create_study(direction='maximize')
    knn_study.optimize(knn_objective, n_trials=n_trial_bm_testing, timeout=timeout_bm_testing)

else:
    knn_study = optuna.create_study(direction='maximize')
    knn_study.optimize(knn_objective, n_trials=n_trial_mids, timeout=timeout_mids)

print('Best parameters for KNN:', knn_study.best_params)
print('Best score for KNN:', knn_study.best_value)

[I 2024-10-23 18:12:42,930] A new study created in memory with name: no-name-db05314c-719a-4e6c-aaee-143792b74dc2
[I 2024-10-23 18:13:41,433] Trial 0 finished with value: 0.9052906225892309 and parameters: {'n_neighbors': 9, 'leaf_size': 46, 'p': 1, 'weights': 'distance'}. Best is trial 0 with value: 0.9052906225892309.
[I 2024-10-23 18:14:35,157] Trial 1 finished with value: 0.9106103853912698 and parameters: {'n_neighbors': 12, 'leaf_size': 23, 'p': 1, 'weights': 'distance'}. Best is trial 1 with value: 0.9106103853912698.


Best parameters for KNN: {'n_neighbors': 12, 'leaf_size': 23, 'p': 1, 'weights': 'distance'}
Best score for KNN: 0.9106103853912698


In [20]:
knn_best_params = knn_study.best_trial.params
knn_best_score = knn_study.best_trial.value
knn_best_score

0.9106103853912698

Prev: 0.9003336325395043

# Weighted Average

In [22]:
# Init base models

xgb_model = xgb.XGBClassifier(**xgb_best_params)
cat_model = CatBoostClassifier(**cat_best_params)
lgb_model = lgb.LGBMClassifier(**lgb_best_params)
extratrees_model = ExtraTreesClassifier(**extratrees_best_params)
histgb_model = HistGradientBoostingClassifier(**histgb_best_params)
knn_model = KNeighborsClassifier(**knn_best_params)

# Base and meta models
base_estimators = [
	('xgb', xgb_model),
	('cat', cat_model),
	('lgb', lgb_model),
	('extratrees', extratrees_model),
	('histgb', histgb_model),
	('knn', knn_model)
]

predictions = []

for _, model in base_estimators:
	model.fit(X_train_prep, y_train_res)
	predictions.append(model.predict_proba(X_test_prep)[:, 1])

0:	learn: 0.5154313	total: 21ms	remaining: 14.6s
1:	learn: 0.4449421	total: 39.1ms	remaining: 13.6s
2:	learn: 0.4014669	total: 55.5ms	remaining: 12.8s
3:	learn: 0.3804414	total: 74ms	remaining: 12.8s
4:	learn: 0.3612612	total: 91.7ms	remaining: 12.7s
5:	learn: 0.3490253	total: 113ms	remaining: 13.1s
6:	learn: 0.3384559	total: 132ms	remaining: 13s
7:	learn: 0.3298691	total: 153ms	remaining: 13.1s
8:	learn: 0.3245997	total: 171ms	remaining: 13.1s
9:	learn: 0.3200758	total: 187ms	remaining: 12.8s
10:	learn: 0.3170923	total: 203ms	remaining: 12.7s
11:	learn: 0.3147011	total: 226ms	remaining: 12.9s
12:	learn: 0.3112430	total: 255ms	remaining: 13.4s
13:	learn: 0.3057434	total: 288ms	remaining: 14s
14:	learn: 0.3032503	total: 315ms	remaining: 14.3s
15:	learn: 0.2965394	total: 337ms	remaining: 14.3s
16:	learn: 0.2948192	total: 355ms	remaining: 14.2s
17:	learn: 0.2937626	total: 371ms	remaining: 14s
18:	learn: 0.2925644	total: 391ms	remaining: 13.9s
19:	learn: 0.2910094	total: 412ms	remaining: 1

In [28]:
df_preds = pd.DataFrame(predictions).T
df_preds.index = X_test.index
df_preds.columns = [name for name, _ in base_estimators]
df_preds

Unnamed: 0,xgb,cat,lgb,extratrees,histgb,knn
53767,0.063864,0.107648,0.028779,0.115236,0.056961,0.253203
59340,0.068318,0.054440,0.060169,0.076746,0.039892,0.114967
32378,0.092246,0.186737,0.068640,0.073758,0.067470,0.000000
6928,0.023177,0.044705,0.010690,0.051944,0.031952,0.075645
39419,0.060453,0.149299,0.040228,0.044165,0.089316,0.000000
...,...,...,...,...,...,...
81657,0.068190,0.084185,0.021608,0.035623,0.012460,0.000000
47513,0.041510,0.174881,0.057449,0.074147,0.094918,0.076764
1597,0.093373,0.118885,0.070298,0.136618,0.077045,0.140864
36272,0.010422,0.034405,0.007805,0.013011,0.012622,0.000000


In [31]:
def weighted_average_objective(trial):
	weights = [trial.suggest_float(name, 0.0, 1.0) for name, _ in base_estimators]
	weights = np.array(weights)
	weights /= weights.sum()

	y_pred = (df_preds * weights).sum(axis=1)
	score = roc_auc_score(y_test, y_pred)

	return score

if testing:
	weighted_average_study = optuna.create_study(direction='maximize')
	weighted_average_study.optimize(weighted_average_objective, n_trials=n_trial_bm_testing, timeout=timeout_bm_testing)

else:
	weighted_average_study = optuna.create_study(direction='maximize')
	weighted_average_study.optimize(weighted_average_objective, n_trials=n_trial_average, timeout=timout_average)

print('Best parameters for weighted average:', weighted_average_study.best_params)
print('Best score for weighted average:', weighted_average_study.best_value)

weights = [weighted_average_study.best_params[name] for name, _ in base_estimators]

[I 2024-10-23 18:22:00,169] A new study created in memory with name: no-name-1aebae15-83aa-4d75-b81b-3c9a71aa94a7
[I 2024-10-23 18:22:00,194] Trial 0 finished with value: 0.9553429535456046 and parameters: {'xgb': 0.41494048943532547, 'cat': 0.9669715979602339, 'lgb': 0.549168585070499, 'extratrees': 0.7130349793294533, 'histgb': 0.28287474616012154, 'knn': 0.6587181924983857}. Best is trial 0 with value: 0.9553429535456046.
[I 2024-10-23 18:22:00,228] Trial 1 finished with value: 0.9556719378670734 and parameters: {'xgb': 0.33225364164774995, 'cat': 0.39464659023510906, 'lgb': 0.9342335795971719, 'extratrees': 0.09137175311309531, 'histgb': 0.9947643977266784, 'knn': 0.7724656982224378}. Best is trial 1 with value: 0.9556719378670734.
[I 2024-10-23 18:22:00,292] Trial 2 finished with value: 0.9518664656593696 and parameters: {'xgb': 0.09784259527324946, 'cat': 0.22603313358869204, 'lgb': 0.5796700555989516, 'extratrees': 0.9627538873976983, 'histgb': 0.8897227256868561, 'knn': 0.75467

[0.11572064 0.26967378 0.15315483 0.19885469 0.0788895  0.18370656]
[0.09439733 0.11212393 0.2654272  0.02595983 0.28262475 0.21946696]
[0.02786988 0.0643842  0.16511558 0.27423473 0.25343224 0.21496337]
[0.19134359 0.01194691 0.12222683 0.18991856 0.25271139 0.23185272]
[0.0160597  0.02465431 0.19572541 0.18152239 0.24570693 0.33633126]


[I 2024-10-23 18:22:00,419] Trial 5 finished with value: 0.9550171186457694 and parameters: {'xgb': 0.5298538811002829, 'cat': 0.43107297097213404, 'lgb': 0.015094128011720609, 'extratrees': 0.29685530749541167, 'histgb': 0.8958326826886074, 'knn': 0.465918637221045}. Best is trial 1 with value: 0.9556719378670734.
[I 2024-10-23 18:22:00,465] Trial 6 finished with value: 0.9578948138639719 and parameters: {'xgb': 0.5335609162800464, 'cat': 0.742896433282625, 'lgb': 0.7299252840231465, 'extratrees': 0.09789462426747864, 'histgb': 0.06452442488167853, 'knn': 0.31833519672157007}. Best is trial 6 with value: 0.9578948138639719.
[I 2024-10-23 18:22:00,489] Trial 7 finished with value: 0.9539426212318948 and parameters: {'xgb': 0.5054178805169365, 'cat': 0.7204487182144502, 'lgb': 0.5604543893036131, 'extratrees': 0.8744259012290008, 'histgb': 0.05899013812870135, 'knn': 0.7634896408269796}. Best is trial 6 with value: 0.9578948138639719.
[I 2024-10-23 18:22:00,507] Trial 8 finished with va

[0.20111149 0.16361818 0.00572913 0.11267448 0.34002251 0.17684421]
[0.21452817 0.29869544 0.29348014 0.03936037 0.02594325 0.12799263]
[0.14510049 0.20683372 0.16090092 0.25103905 0.01693549 0.21919034]
[0.16923516 0.24765145 0.16912878 0.21527978 0.03691389 0.16179094]
[0.00137396 0.23075783 0.14801711 0.16795774 0.23010656 0.2217868 ]
Best parameters for weighted average: {'xgb': 0.5335609162800464, 'cat': 0.742896433282625, 'lgb': 0.7299252840231465, 'extratrees': 0.09789462426747864, 'histgb': 0.06452442488167853, 'knn': 0.31833519672157007}
Best score for weighted average: 0.9578948138639719


# Predictions and submission

In [43]:
X_res, y_res = SMOTENC(sampling_strategy=ratio, categorical_features=[X.columns.get_loc(col) for col in categorical_columns], random_state=random_state, k_neighbors=k_neighbors).fit_resample(X, y)

In [50]:
X_prep = preprocessor.transform(X_res)
data_to_predict_prep = preprocessor.transform(data_to_predict)

for _, model in base_estimators:
	model.fit(X_prep, y_res)

predictions = []

for _, model in base_estimators:
	predictions.append(model.predict_proba(data_to_predict_prep)[:, 1])

df_preds = pd.DataFrame(predictions).T
df_preds.columns = [name for name, _ in base_estimators]

y_pred = (df_preds * weights).sum(axis=1)

submission = pd.DataFrame({'id': data_to_predict['id'], 'loan_status': y_pred})

0:	learn: 0.5157991	total: 24.6ms	remaining: 17.1s
1:	learn: 0.4447401	total: 45.6ms	remaining: 15.8s
2:	learn: 0.4047223	total: 76.7ms	remaining: 17.8s
3:	learn: 0.3815329	total: 142ms	remaining: 24.6s
4:	learn: 0.3617474	total: 181ms	remaining: 25.1s
5:	learn: 0.3498702	total: 210ms	remaining: 24.2s
6:	learn: 0.3393584	total: 256ms	remaining: 25.2s
7:	learn: 0.3314868	total: 317ms	remaining: 27.3s
8:	learn: 0.3260266	total: 353ms	remaining: 27s
9:	learn: 0.3226007	total: 385ms	remaining: 26.5s
10:	learn: 0.3179814	total: 427ms	remaining: 26.7s
11:	learn: 0.3149539	total: 452ms	remaining: 25.8s
12:	learn: 0.3110525	total: 485ms	remaining: 25.5s
13:	learn: 0.3094974	total: 509ms	remaining: 24.8s
14:	learn: 0.3083181	total: 543ms	remaining: 24.7s
15:	learn: 0.3062048	total: 572ms	remaining: 24.3s
16:	learn: 0.3038216	total: 603ms	remaining: 24.1s
17:	learn: 0.3015818	total: 638ms	remaining: 24.1s
18:	learn: 0.2970293	total: 669ms	remaining: 23.9s
19:	learn: 0.2954496	total: 705ms	remain

In [52]:
import joblib

if not testing:
	try:
		joblib.dump(xgb_model, '../models/xgb_model.pkl')
		joblib.dump(cat_model, '../models/cat_model.pkl')
		joblib.dump(lgb_model, '../models/lgb_model.pkl')
		joblib.dump(extratrees_model, '../models/extratrees_model.pkl')
		joblib.dump(histgb_model, '../models/histgb_model.pkl')
		joblib.dump(knn_model, '../models/knn_model.pkl')

	except:
		joblib.dump(xgb_model, 'xgb_model.pkl')
		joblib.dump(cat_model, 'cat_model.pkl')
		joblib.dump(lgb_model, 'lgb_model.pkl')
		joblib.dump(extratrees_model, 'extratrees_model.pkl')
		joblib.dump(histgb_model, 'histgb_model.pkl')
		joblib.dump(knn_model, 'knn_model.pkl')

try:
	submission.to_csv('../submissions/weighted_avg_v1.csv', index=False)

except:
	submission.to_csv('weighted_avg_v1.csv', index=False)