# Imports

In [3]:
# General
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Hyperparameter tuning
import optuna

# Models
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC


# Data processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Metrics
from sklearn.metrics import roc_auc_score

# Data preparation

In [4]:
# Data input
data = pd.read_csv('../data/train.csv')
data_to_predict = pd.read_csv('../data/test.csv')
data_ccrisk = pd.read_csv('../data/credit_risk_dataset.csv')

X_old = data.drop(['loan_status'], axis=1)
y_old = data['loan_status']

# Merge the dataframes
data_no_id = data.drop(['id'], axis=1)
merged_data = pd.concat([data_no_id, data_ccrisk], ignore_index=True)

X = merged_data.drop(['loan_status'], axis=1)
y = merged_data['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle=True, test_size=0.2, random_state=2024)

In [5]:
# Data processing

categorical_columns = X.select_dtypes(include=['object']).columns

categorical_ordinal = ['loan_grade']
categorical_onehot = categorical_columns.drop(categorical_ordinal)

numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessor
preprocessor = ColumnTransformer(
	transformers=[
		('ordinal', OrdinalEncoder(), categorical_ordinal),
		('onehot', OneHotEncoder(), categorical_onehot),
		('scaler', StandardScaler(), numerical_columns)
	])

preprocessor.fit(X_train)

X_train_prep = preprocessor.transform(X_train)
X_test_prep = preprocessor.transform(X_test)

# Models

## XGBoost

In [6]:
def xgb_objective(trial):
    # Hyperparameter search space
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 0.5),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 3.0),
        'eval_metric': 'auc',  # AUC is the metric
        'objective': 'binary:logistic'  # Use binary logistic, probability outputs
    }

    # Initialize the model
    model = xgb.XGBClassifier(**param)

    # Fit the model
    model.fit(X_train_prep, y_train, 
              eval_set=[(X_test_prep, y_test)],  
              verbose=False)

    # Predict proba for validation set
    y_pred_prob = model.predict_proba(X_test_prep)[:, 1]

    auc = roc_auc_score(y_test, y_pred_prob)
    
    return auc

# Study object to run the optimization. I want to maximize AUC
xgb_study = optuna.create_study(direction='maximize')
xgb_study.optimize(xgb_objective, n_trials=50)


print(f"Best trial: {xgb_study.best_trial.params}")

[I 2024-10-09 22:05:58,334] A new study created in memory with name: no-name-44073533-931b-4bf9-a0e4-6b809fdcc510
[I 2024-10-09 22:06:01,537] Trial 0 finished with value: 0.9525810347956751 and parameters: {'max_depth': 8, 'learning_rate': 0.23759751295096465, 'n_estimators': 104, 'subsample': 0.6861085235631791, 'colsample_bytree': 0.6075327980311174, 'gamma': 0.12451478473153665, 'lambda': 1.5462291489103863e-06, 'alpha': 0.0014761980421400247, 'scale_pos_weight': 2.9166529779731913}. Best is trial 0 with value: 0.9525810347956751.
[I 2024-10-09 22:06:12,725] Trial 1 finished with value: 0.9548030154749317 and parameters: {'max_depth': 10, 'learning_rate': 0.10771767717037606, 'n_estimators': 374, 'subsample': 0.7953274560696989, 'colsample_bytree': 0.8337629717071376, 'gamma': 0.2995102374207908, 'lambda': 1.2907977806555766, 'alpha': 1.3013028313152889e-06, 'scale_pos_weight': 2.323853613045628}. Best is trial 1 with value: 0.9548030154749317.
[I 2024-10-09 22:06:20,947] Trial 2 fi

Best trial: {'max_depth': 9, 'learning_rate': 0.04443861243064845, 'n_estimators': 295, 'subsample': 0.9129336231196766, 'colsample_bytree': 0.7817900881520917, 'gamma': 0.20733828465084803, 'lambda': 0.0007763279982043077, 'alpha': 4.17984861400515, 'scale_pos_weight': 2.6715664022423775}


In [7]:
xgb_best_params = xgb_study.best_trial.params
xgb_best_score = xgb_study.best_trial.value
xgb_best_score

0.959733166826617

## CatBoostClassifier

In [8]:
from catboost import CatBoostClassifier

In [9]:
def cat_objective(trial):
    # Define the hyperparameter search space
    param = {
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'iterations': trial.suggest_int('iterations', 200, 1000),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),  # CatBoost feature binning
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.0, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 3.0),
        'eval_metric': 'AUC',  # Using AUC as the evaluation metric
        'logging_level': 'Silent',  # Disable CatBoost output
        'task_type': 'CPU',  # Use CPU (set 'GPU' if you have a GPU available)
        'use_best_model': True
    }

    # Initialize the CatBoost model with the trial's parameters
    model = CatBoostClassifier(**param)

    # Fit the model
    model.fit(X_train_prep, y_train,
              eval_set=[(X_test_prep, y_test)],
              early_stopping_rounds=20,  # Early stopping to prevent overfitting
              verbose=False)

    # Get predictions and calculate AUC score
    y_pred_prob = model.predict_proba(X_test_prep)[:, 1]  # Probabilities for class 1
    auc = roc_auc_score(y_test, y_pred_prob)

    return auc

# Create a study object and start the optimization process
cat_study = optuna.create_study(direction='maximize')  # We want to maximize AUC
cat_study.optimize(cat_objective, n_trials=50)

# Output the best trial
print(f"Best trial: {cat_study.best_trial.params}")

[I 2024-10-09 22:11:51,100] A new study created in memory with name: no-name-e72c9759-8c17-475d-a66c-405ea194cf53
[I 2024-10-09 22:11:58,468] Trial 0 finished with value: 0.9535014045536119 and parameters: {'depth': 7, 'learning_rate': 0.14900669887451898, 'iterations': 972, 'l2_leaf_reg': 2.832927821560049e-06, 'border_count': 164, 'bagging_temperature': 0.5177074086756733, 'random_strength': 0.3755586295324942, 'scale_pos_weight': 1.0033482236176379}. Best is trial 0 with value: 0.9535014045536119.
[I 2024-10-09 22:12:01,653] Trial 1 finished with value: 0.9435656688784282 and parameters: {'depth': 9, 'learning_rate': 0.1442107216774723, 'iterations': 403, 'l2_leaf_reg': 1.9701502555594304e-05, 'border_count': 91, 'bagging_temperature': 0.14870006576346828, 'random_strength': 0.12286492442004282, 'scale_pos_weight': 2.431117089849309}. Best is trial 0 with value: 0.9535014045536119.
[I 2024-10-09 22:12:11,152] Trial 2 finished with value: 0.9434425794273231 and parameters: {'depth': 

Best trial: {'depth': 6, 'learning_rate': 0.1377846993488999, 'iterations': 335, 'l2_leaf_reg': 1.9947064092180422, 'border_count': 223, 'bagging_temperature': 0.43873437022890877, 'random_strength': 0.3977615491895288, 'scale_pos_weight': 2.4634007576197803}


In [10]:
cat_best_params = cat_study.best_trial.params
cat_best_score = cat_study.best_trial.value
cat_best_score

0.9595616312456452

In [12]:
# Objective function for LightGBM
def lgb_objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
    }

    lgb_model = lgb.LGBMClassifier(**param)
    
    # Train the model
    lgb_model.fit(X_train_prep, y_train,
                  eval_set=[(X_test_prep, y_test)],
                  eval_metric='auc'
                  )

    # Predict and evaluate AUC
    y_pred_prob = lgb_model.predict_proba(X_test_prep)[:, 1]
    auc = roc_auc_score(y_test, y_pred_prob)

    return auc

# Optimize the objective function
lgb_study = optuna.create_study(direction='maximize')
lgb_study.optimize(lgb_objective, n_trials=50)

# Best parameters and AUC
print("Best LGBM Params: ", lgb_study.best_trial.params)
print("Best AUC for LGBM: ", lgb_study.best_value)

[I 2024-10-09 22:47:59,952] A new study created in memory with name: no-name-e4df7c3a-673f-47c0-a4ee-3f61b705cfdd


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004010 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:48:06,809] Trial 0 finished with value: 0.9605070990867199 and parameters: {'learning_rate': 0.20286923205339508, 'num_leaves': 268, 'max_depth': 3, 'min_child_samples': 64, 'min_child_weight': 5.427852638749153, 'subsample': 0.7080406522259487, 'colsample_bytree': 0.6149863635429692, 'n_estimators': 882}. Best is trial 0 with value: 0.9605070990867199.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009931 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:48:10,991] Trial 1 finished with value: 0.9548021724695881 and parameters: {'learning_rate': 0.2920295183776929, 'num_leaves': 56, 'max_depth': 15, 'min_child_samples': 30, 'min_child_weight': 1.7023084792952516, 'subsample': 0.9259109299444939, 'colsample_bytree': 0.5835249499904873, 'n_estimators': 406}. Best is trial 0 with value: 0.9605070990867199.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:48:24,529] Trial 2 finished with value: 0.9522866872083949 and parameters: {'learning_rate': 0.2525474703574194, 'num_leaves': 267, 'max_depth': 13, 'min_child_samples': 19, 'min_child_weight': 8.975752064899297, 'subsample': 0.7074422786232508, 'colsample_bytree': 0.5749058240541898, 'n_estimators': 875}. Best is trial 0 with value: 0.9605070990867199.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009428 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:48:28,765] Trial 3 finished with value: 0.9609481082618622 and parameters: {'learning_rate': 0.044751196825447065, 'num_leaves': 229, 'max_depth': 11, 'min_child_samples': 75, 'min_child_weight': 4.090181332928228, 'subsample': 0.8168154445008844, 'colsample_bytree': 0.9119539806243956, 'n_estimators': 256}. Best is trial 3 with value: 0.9609481082618622.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004348 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:48:31,362] Trial 4 finished with value: 0.960367010806347 and parameters: {'learning_rate': 0.29754383994963796, 'num_leaves': 189, 'max_depth': 5, 'min_child_samples': 65, 'min_child_weight': 4.885386662903091, 'subsample': 0.6150542345144727, 'colsample_bytree': 0.8362288802520643, 'n_estimators': 332}. Best is trial 3 with value: 0.9609481082618622.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006814 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:48:33,261] Trial 5 finished with value: 0.9603709590592213 and parameters: {'learning_rate': 0.2523899409673789, 'num_leaves': 45, 'max_depth': 5, 'min_child_samples': 44, 'min_child_weight': 3.7220314936010834, 'subsample': 0.7945845221295094, 'colsample_bytree': 0.9534207540390103, 'n_estimators': 233}. Best is trial 3 with value: 0.9609481082618622.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:48:39,295] Trial 6 finished with value: 0.9603903588530757 and parameters: {'learning_rate': 0.17325332256248968, 'num_leaves': 287, 'max_depth': 5, 'min_child_samples': 78, 'min_child_weight': 6.508858911457511, 'subsample': 0.6461908801353845, 'colsample_bytree': 0.644038238415011, 'n_estimators': 790}. Best is trial 3 with value: 0.9609481082618622.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005730 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:48:47,529] Trial 7 finished with value: 0.9524822644480873 and parameters: {'learning_rate': 0.267064230216676, 'num_leaves': 72, 'max_depth': 13, 'min_child_samples': 64, 'min_child_weight': 5.735882817750888, 'subsample': 0.5684728403997407, 'colsample_bytree': 0.9402241439576229, 'n_estimators': 672}. Best is trial 3 with value: 0.9609481082618622.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007575 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:49:01,153] Trial 8 finished with value: 0.9525118870156005 and parameters: {'learning_rate': 0.22363963626245156, 'num_leaves': 244, 'max_depth': 12, 'min_child_samples': 16, 'min_child_weight': 7.746773731667268, 'subsample': 0.531480907416407, 'colsample_bytree': 0.9830837285844659, 'n_estimators': 986}. Best is trial 3 with value: 0.9609481082618622.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004194 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:49:08,476] Trial 9 finished with value: 0.9591228629707833 and parameters: {'learning_rate': 0.21478106928717844, 'num_leaves': 39, 'max_depth': 5, 'min_child_samples': 21, 'min_child_weight': 7.18139737915874, 'subsample': 0.5634438469196974, 'colsample_bytree': 0.933133686744733, 'n_estimators': 976}. Best is trial 3 with value: 0.9609481082618622.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:49:10,657] Trial 10 finished with value: 0.9515884653649014 and parameters: {'learning_rate': 0.02544277153068906, 'num_leaves': 141, 'max_depth': 9, 'min_child_samples': 100, 'min_child_weight': 0.6844912418088827, 'subsample': 0.8584829936400206, 'colsample_bytree': 0.7868014081123723, 'n_estimators': 129}. Best is trial 3 with value: 0.9609481082618622.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007282 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:49:17,192] Trial 11 finished with value: 0.9617815310889273 and parameters: {'learning_rate': 0.06977640022865256, 'num_leaves': 211, 'max_depth': 9, 'min_child_samples': 82, 'min_child_weight': 3.2023667077829443, 'subsample': 0.7568698592611878, 'colsample_bytree': 0.6931689817245681, 'n_estimators': 535}. Best is trial 11 with value: 0.9617815310889273.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007110 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:49:23,197] Trial 12 finished with value: 0.9617493901763371 and parameters: {'learning_rate': 0.06627799827454953, 'num_leaves': 205, 'max_depth': 9, 'min_child_samples': 91, 'min_child_weight': 2.813581876256246, 'subsample': 0.8175121185306542, 'colsample_bytree': 0.7202896367709608, 'n_estimators': 502}. Best is trial 11 with value: 0.9617815310889273.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008716 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:49:29,179] Trial 13 finished with value: 0.9606246503128466 and parameters: {'learning_rate': 0.0941509948391665, 'num_leaves': 175, 'max_depth': 9, 'min_child_samples': 100, 'min_child_weight': 2.69641673131332, 'subsample': 0.9887514995013953, 'colsample_bytree': 0.7049230244253417, 'n_estimators': 505}. Best is trial 11 with value: 0.9617815310889273.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010349 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:49:35,630] Trial 14 finished with value: 0.9604018621411808 and parameters: {'learning_rate': 0.10413954329308901, 'num_leaves': 117, 'max_depth': 8, 'min_child_samples': 86, 'min_child_weight': 2.5173482520967876, 'subsample': 0.8772446686228037, 'colsample_bytree': 0.6961941367755506, 'n_estimators': 596}. Best is trial 11 with value: 0.9617815310889273.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002465 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:49:40,545] Trial 15 finished with value: 0.9624540359339671 and parameters: {'learning_rate': 0.08902341196285564, 'num_leaves': 204, 'max_depth': 7, 'min_child_samples': 87, 'min_child_weight': 0.16451973122201746, 'subsample': 0.7307507204114967, 'colsample_bytree': 0.5022305234933646, 'n_estimators': 479}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005009 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:49:47,426] Trial 16 finished with value: 0.9593411479999817 and parameters: {'learning_rate': 0.14316454451923638, 'num_leaves': 125, 'max_depth': 7, 'min_child_samples': 45, 'min_child_weight': 0.2383871411081717, 'subsample': 0.7368531854998078, 'colsample_bytree': 0.5120977761108136, 'n_estimators': 678}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007559 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:49:52,355] Trial 17 finished with value: 0.9612550689164339 and parameters: {'learning_rate': 0.12406954300571224, 'num_leaves': 218, 'max_depth': 7, 'min_child_samples': 80, 'min_child_weight': 1.2476485351790312, 'subsample': 0.673312118018779, 'colsample_bytree': 0.540066789025212, 'n_estimators': 438}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:50:02,182] Trial 18 finished with value: 0.9602648897792923 and parameters: {'learning_rate': 0.07152956949015842, 'num_leaves': 162, 'max_depth': 11, 'min_child_samples': 90, 'min_child_weight': 0.015851730780978374, 'subsample': 0.7780051820229031, 'colsample_bytree': 0.8147317030747236, 'n_estimators': 621}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005247 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:50:06,109] Trial 19 finished with value: 0.9298976284189523 and parameters: {'learning_rate': 0.013937185991808855, 'num_leaves': 95, 'max_depth': 3, 'min_child_samples': 52, 'min_child_weight': 1.632201575379212, 'subsample': 0.7532987291425348, 'colsample_bytree': 0.6440306537498317, 'n_estimators': 357}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014974 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:50:14,921] Trial 20 finished with value: 0.9574138457075705 and parameters: {'learning_rate': 0.16120583813233497, 'num_leaves': 198, 'max_depth': 7, 'min_child_samples': 5, 'min_child_weight': 3.799649718840506, 'subsample': 0.6258978792679223, 'colsample_bytree': 0.8699269058933754, 'n_estimators': 737}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010294 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:50:22,371] Trial 21 finished with value: 0.9611076390199045 and parameters: {'learning_rate': 0.06271745858227824, 'num_leaves': 209, 'max_depth': 10, 'min_child_samples': 90, 'min_child_weight': 2.5569914065930557, 'subsample': 0.8277739346897284, 'colsample_bytree': 0.7376701077818645, 'n_estimators': 548}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:50:28,059] Trial 22 finished with value: 0.9615178945064392 and parameters: {'learning_rate': 0.08345821838088027, 'num_leaves': 246, 'max_depth': 8, 'min_child_samples': 71, 'min_child_weight': 3.0735222230072257, 'subsample': 0.8921612520909209, 'colsample_bytree': 0.7662597411471135, 'n_estimators': 500}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011034 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:50:34,602] Trial 23 finished with value: 0.9619825291731068 and parameters: {'learning_rate': 0.04448955587980918, 'num_leaves': 182, 'max_depth': 10, 'min_child_samples': 91, 'min_child_weight': 1.7802623782458105, 'subsample': 0.7421820323682535, 'colsample_bytree': 0.6880635375503108, 'n_estimators': 467}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005026 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:50:39,473] Trial 24 finished with value: 0.9615558190759431 and parameters: {'learning_rate': 0.04066587075975541, 'num_leaves': 158, 'max_depth': 10, 'min_child_samples': 84, 'min_child_weight': 1.19823306170152, 'subsample': 0.6984059740391716, 'colsample_bytree': 0.6714666635472952, 'n_estimators': 303}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:50:43,237] Trial 25 finished with value: 0.9618532398978935 and parameters: {'learning_rate': 0.11971317938807643, 'num_leaves': 177, 'max_depth': 6, 'min_child_samples': 94, 'min_child_weight': 1.9905905018885262, 'subsample': 0.7431806143076519, 'colsample_bytree': 0.5735863886734875, 'n_estimators': 424}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002226 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:50:47,110] Trial 26 finished with value: 0.9621791094824484 and parameters: {'learning_rate': 0.1108907416619132, 'num_leaves': 173, 'max_depth': 6, 'min_child_samples': 94, 'min_child_weight': 1.6332664857573358, 'subsample': 0.6619985537642925, 'colsample_bytree': 0.515005459844645, 'n_estimators': 435}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003593 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:50:50,628] Trial 27 finished with value: 0.9615520415583281 and parameters: {'learning_rate': 0.13218965334417793, 'num_leaves': 148, 'max_depth': 4, 'min_child_samples': 71, 'min_child_weight': 0.828531474423495, 'subsample': 0.6646489117184626, 'colsample_bytree': 0.5022274892905645, 'n_estimators': 449}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011460 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:50:52,392] Trial 28 finished with value: 0.9603978605335375 and parameters: {'learning_rate': 0.105566604038864, 'num_leaves': 120, 'max_depth': 6, 'min_child_samples': 96, 'min_child_weight': 0.04044790687437315, 'subsample': 0.6026739128301143, 'colsample_bytree': 0.5348271395920778, 'n_estimators': 179}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004819 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:50:58,987] Trial 29 finished with value: 0.95779042366418 and parameters: {'learning_rate': 0.17799838177422958, 'num_leaves': 182, 'max_depth': 8, 'min_child_samples': 58, 'min_child_weight': 4.825022071411773, 'subsample': 0.6912649192390726, 'colsample_bytree': 0.6067301943986884, 'n_estimators': 617}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032638 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:51:02,125] Trial 30 finished with value: 0.9433238117124729 and parameters: {'learning_rate': 0.04047716618566077, 'num_leaves': 232, 'max_depth': 3, 'min_child_samples': 70, 'min_child_weight': 9.882214345133452, 'subsample': 0.5049141284831262, 'colsample_bytree': 0.617097613112806, 'n_estimators': 369}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013734 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:51:06,169] Trial 31 finished with value: 0.9621719599434593 and parameters: {'learning_rate': 0.11972984874255746, 'num_leaves': 172, 'max_depth': 6, 'min_child_samples': 95, 'min_child_weight': 2.2844474780546955, 'subsample': 0.7426568452190337, 'colsample_bytree': 0.5501848598203263, 'n_estimators': 437}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004781 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:51:10,431] Trial 32 finished with value: 0.9619930720753777 and parameters: {'learning_rate': 0.1460440823964155, 'num_leaves': 165, 'max_depth': 6, 'min_child_samples': 88, 'min_child_weight': 1.9830941918618719, 'subsample': 0.7243981272515564, 'colsample_bytree': 0.5524337931725127, 'n_estimators': 464}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005724 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:51:13,908] Trial 33 finished with value: 0.9614622561537682 and parameters: {'learning_rate': 0.1429372101048106, 'num_leaves': 139, 'max_depth': 6, 'min_child_samples': 86, 'min_child_weight': 2.1578792569066856, 'subsample': 0.7250068472946997, 'colsample_bytree': 0.547315683943057, 'n_estimators': 390}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004966 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:51:16,052] Trial 34 finished with value: 0.9607570128227302 and parameters: {'learning_rate': 0.19451892507299995, 'num_leaves': 165, 'max_depth': 4, 'min_child_samples': 100, 'min_child_weight': 0.8975333039741678, 'subsample': 0.7878756468113797, 'colsample_bytree': 0.5696352868566742, 'n_estimators': 280}. Best is trial 15 with value: 0.9624540359339671.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003154 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:51:21,356] Trial 35 finished with value: 0.9624569811171925 and parameters: {'learning_rate': 0.11155044863004299, 'num_leaves': 94, 'max_depth': 6, 'min_child_samples': 78, 'min_child_weight': 1.3685946467930163, 'subsample': 0.7111839680418954, 'colsample_bytree': 0.596818324791621, 'n_estimators': 561}. Best is trial 35 with value: 0.9624569811171925.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:51:29,055] Trial 36 finished with value: 0.9583249317357748 and parameters: {'learning_rate': 0.11247859852885625, 'num_leaves': 94, 'max_depth': 15, 'min_child_samples': 75, 'min_child_weight': 1.368879726518217, 'subsample': 0.6479343933624427, 'colsample_bytree': 0.6071456663539412, 'n_estimators': 583}. Best is trial 35 with value: 0.9624569811171925.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002445 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:51:33,946] Trial 37 finished with value: 0.9612861213917451 and parameters: {'learning_rate': 0.08858622116351028, 'num_leaves': 90, 'max_depth': 4, 'min_child_samples': 95, 'min_child_weight': 0.5829468322676559, 'subsample': 0.6889393598025237, 'colsample_bytree': 0.5190228938797773, 'n_estimators': 667}. Best is trial 35 with value: 0.9624569811171925.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007733 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:51:41,919] Trial 38 finished with value: 0.9590379435211177 and parameters: {'learning_rate': 0.1601462669279552, 'num_leaves': 267, 'max_depth': 7, 'min_child_samples': 78, 'min_child_weight': 4.594701408951437, 'subsample': 0.6041978443151184, 'colsample_bytree': 0.565256854140317, 'n_estimators': 808}. Best is trial 35 with value: 0.9624569811171925.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005256 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:51:44,560] Trial 39 finished with value: 0.9612760480114382 and parameters: {'learning_rate': 0.13085020556507765, 'num_leaves': 191, 'max_depth': 5, 'min_child_samples': 56, 'min_child_weight': 3.4131432759907394, 'subsample': 0.7756475378817869, 'colsample_bytree': 0.5983231100299983, 'n_estimators': 319}. Best is trial 35 with value: 0.9624569811171925.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004873 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:51:46,834] Trial 40 finished with value: 0.9613084877107322 and parameters: {'learning_rate': 0.097627044392613, 'num_leaves': 78, 'max_depth': 6, 'min_child_samples': 76, 'min_child_weight': 0.492561992635385, 'subsample': 0.7085675197814689, 'colsample_bytree': 0.650005104903927, 'n_estimators': 238}. Best is trial 35 with value: 0.9624569811171925.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010976 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:51:50,253] Trial 41 finished with value: 0.9615934235168357 and parameters: {'learning_rate': 0.14660210357337003, 'num_leaves': 150, 'max_depth': 6, 'min_child_samples': 86, 'min_child_weight': 2.2334825793525432, 'subsample': 0.7211703657572497, 'colsample_bytree': 0.5480642344217674, 'n_estimators': 392}. Best is trial 35 with value: 0.9624569811171925.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002242 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:51:54,981] Trial 42 finished with value: 0.9619758278141738 and parameters: {'learning_rate': 0.11473618607033317, 'num_leaves': 23, 'max_depth': 7, 'min_child_samples': 95, 'min_child_weight': 1.5299134417213107, 'subsample': 0.6673336779334452, 'colsample_bytree': 0.5170959444478087, 'n_estimators': 559}. Best is trial 35 with value: 0.9624569811171925.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002266 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:51:58,749] Trial 43 finished with value: 0.9614791802863607 and parameters: {'learning_rate': 0.1792495336898677, 'num_leaves': 134, 'max_depth': 5, 'min_child_samples': 84, 'min_child_weight': 1.0678790032347047, 'subsample': 0.6371696404845909, 'colsample_bytree': 0.5025178640964063, 'n_estimators': 457}. Best is trial 35 with value: 0.9624569811171925.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005005 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:52:04,108] Trial 44 finished with value: 0.9625326168371258 and parameters: {'learning_rate': 0.08449110888018113, 'num_leaves': 170, 'max_depth': 8, 'min_child_samples': 88, 'min_child_weight': 4.390178347571743, 'subsample': 0.798443112335652, 'colsample_bytree': 0.584200612807678, 'n_estimators': 490}. Best is trial 44 with value: 0.9625326168371258.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008131 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:52:09,800] Trial 45 finished with value: 0.9620660400568821 and parameters: {'learning_rate': 0.08141245940242088, 'num_leaves': 229, 'max_depth': 8, 'min_child_samples': 67, 'min_child_weight': 5.599105130389816, 'subsample': 0.8442937939039694, 'colsample_bytree': 0.5897305678493943, 'n_estimators': 500}. Best is trial 44 with value: 0.9625326168371258.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004962 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:52:15,768] Trial 46 finished with value: 0.9627544019391427 and parameters: {'learning_rate': 0.056732075773989935, 'num_leaves': 200, 'max_depth': 8, 'min_child_samples': 81, 'min_child_weight': 4.238151660690304, 'subsample': 0.8031135623721948, 'colsample_bytree': 0.6213074649038759, 'n_estimators': 530}. Best is trial 46 with value: 0.9627544019391427.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007953 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:52:23,365] Trial 47 finished with value: 0.9627294319074493 and parameters: {'learning_rate': 0.05683766691065629, 'num_leaves': 253, 'max_depth': 8, 'min_child_samples': 82, 'min_child_weight': 6.237812841117454, 'subsample': 0.8001883639820435, 'colsample_bytree': 0.6467841985251128, 'n_estimators': 657}. Best is trial 46 with value: 0.9627544019391427.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011911 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:52:31,914] Trial 48 finished with value: 0.9624465235825518 and parameters: {'learning_rate': 0.05625929357298218, 'num_leaves': 287, 'max_depth': 8, 'min_child_samples': 36, 'min_child_weight': 6.1290178895068035, 'subsample': 0.8069805580833423, 'colsample_bytree': 0.6288177374771741, 'n_estimators': 706}. Best is trial 46 with value: 0.9627544019391427.


[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008586 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575


[I 2024-10-09 22:52:41,071] Trial 49 finished with value: 0.9612736790597132 and parameters: {'learning_rate': 0.018850003105030495, 'num_leaves': 243, 'max_depth': 9, 'min_child_samples': 80, 'min_child_weight': 7.2272787228664805, 'subsample': 0.8714441381979217, 'colsample_bytree': 0.6501289447353497, 'n_estimators': 648}. Best is trial 46 with value: 0.9627544019391427.


Best LGBM Params:  {'learning_rate': 0.056732075773989935, 'num_leaves': 200, 'max_depth': 8, 'min_child_samples': 81, 'min_child_weight': 4.238151660690304, 'subsample': 0.8031135623721948, 'colsample_bytree': 0.6213074649038759, 'n_estimators': 530}
Best AUC for LGBM:  0.9627544019391427


In [13]:
lgb_best_params = lgb_study.best_trial.params
lgb_best_score = lgb_study.best_value
lgb_best_score

0.9627544019391427

In [14]:
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier

# Impute missing values
imputer = SimpleImputer(strategy='mean')  # or use 'median' depending on your data
X_train_prep = imputer.fit_transform(X_train_prep)
X_test_prep = imputer.transform(X_test_prep)

# Add MLPClassifier as one of the base models
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500)

# Update estimators list for stacking
estimators = [
    ('catboost', CatBoostClassifier(**cat_study.best_trial.params)),
    ('lightgbm', lgb.LGBMClassifier(**lgb_study.best_trial.params)),
    ('xgb', xgb.XGBClassifier(**xgb_study.best_trial.params)),
    ('mlp', mlp)  # Add the simple neural network
]

# Define the stacking model
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression()  # Or RandomForest if you prefer
)

# Fit the stacking model
stacking_model.fit(X_train_prep, y_train)

# Evaluate AUC on the test set
y_pred_prob = stacking_model.predict_proba(X_test_prep)[:, 1]
stacking_auc = roc_auc_score(y_test, y_pred_prob)

print(f"Stacking AUC with MLP: {stacking_auc}")


0:	learn: 0.5751075	total: 24.2ms	remaining: 8.08s
1:	learn: 0.4950226	total: 50.8ms	remaining: 8.45s
2:	learn: 0.4384848	total: 79.4ms	remaining: 8.79s
3:	learn: 0.4027386	total: 103ms	remaining: 8.49s
4:	learn: 0.3746160	total: 130ms	remaining: 8.59s
5:	learn: 0.3566968	total: 154ms	remaining: 8.42s
6:	learn: 0.3438022	total: 179ms	remaining: 8.41s
7:	learn: 0.3328342	total: 204ms	remaining: 8.34s
8:	learn: 0.3228014	total: 234ms	remaining: 8.46s
9:	learn: 0.3145417	total: 264ms	remaining: 8.59s
10:	learn: 0.3067987	total: 294ms	remaining: 8.64s
11:	learn: 0.3031934	total: 317ms	remaining: 8.54s
12:	learn: 0.2992174	total: 347ms	remaining: 8.6s
13:	learn: 0.2941149	total: 376ms	remaining: 8.62s
14:	learn: 0.2914530	total: 401ms	remaining: 8.55s
15:	learn: 0.2886530	total: 429ms	remaining: 8.55s
16:	learn: 0.2866313	total: 461ms	remaining: 8.62s
17:	learn: 0.2847274	total: 490ms	remaining: 8.63s
18:	learn: 0.2831081	total: 516ms	remaining: 8.57s
19:	learn: 0.2821727	total: 544ms	remai

In [15]:
# Fit on the full training data
X_prep = preprocessor.transform(X)
X_prep = imputer.transform(X_prep)

# Fit the stacking model on the full data
stacking_model.fit(X_prep, y)

0:	learn: 0.5754079	total: 30.6ms	remaining: 10.2s
1:	learn: 0.4919787	total: 57.3ms	remaining: 9.54s
2:	learn: 0.4377538	total: 92.4ms	remaining: 10.2s
3:	learn: 0.3999983	total: 120ms	remaining: 9.97s
4:	learn: 0.3755795	total: 153ms	remaining: 10.1s
5:	learn: 0.3563123	total: 189ms	remaining: 10.3s
6:	learn: 0.3413232	total: 222ms	remaining: 10.4s
7:	learn: 0.3284950	total: 268ms	remaining: 11s
8:	learn: 0.3193504	total: 329ms	remaining: 11.9s
9:	learn: 0.3126755	total: 371ms	remaining: 12s
10:	learn: 0.3077524	total: 414ms	remaining: 12.2s
11:	learn: 0.3031319	total: 449ms	remaining: 12.1s
12:	learn: 0.2989435	total: 485ms	remaining: 12s
13:	learn: 0.2953847	total: 531ms	remaining: 12.2s
14:	learn: 0.2931042	total: 578ms	remaining: 12.3s
15:	learn: 0.2905887	total: 618ms	remaining: 12.3s
16:	learn: 0.2880323	total: 650ms	remaining: 12.2s
17:	learn: 0.2842813	total: 691ms	remaining: 12.2s
18:	learn: 0.2827286	total: 736ms	remaining: 12.2s
19:	learn: 0.2813449	total: 778ms	remaining:

In [None]:
estimators_RF = [
    ('catboost', CatBoostClassifier(**cat_study.best_trial.params)),
    ('lightgbm', lgb.LGBMClassifier(**lgb_study.best_trial.params)),
    ('xgb', xgb.XGBClassifier(**xgb_study.best_trial.params)),
    ('mlp', mlp)  # Add the simple neural network
]

# Define the stacking model
stacking_model_RF = StackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier()
)
# Fit the stacking model
stacking_model.fit(X_train_prep, y_train)

# Evaluate AUC on the test set
y_pred_prob_rf = stacking_model.predict_proba(X_test_prep)[:, 1]
stacking_auc_rf = roc_auc_score(y_test, y_pred_prob_rf)

print(f"Stacking AUC with MLP and RF: {stacking_auc_rf}")

0:	learn: 0.5751075	total: 23.5ms	remaining: 7.84s
1:	learn: 0.4950226	total: 46.9ms	remaining: 7.81s
2:	learn: 0.4384848	total: 69.3ms	remaining: 7.67s
3:	learn: 0.4027386	total: 97ms	remaining: 8.03s
4:	learn: 0.3746160	total: 119ms	remaining: 7.87s
5:	learn: 0.3566968	total: 143ms	remaining: 7.86s
6:	learn: 0.3438022	total: 166ms	remaining: 7.78s
7:	learn: 0.3328342	total: 195ms	remaining: 7.97s
8:	learn: 0.3228014	total: 217ms	remaining: 7.87s
9:	learn: 0.3145417	total: 244ms	remaining: 7.94s
10:	learn: 0.3067987	total: 280ms	remaining: 8.24s
11:	learn: 0.3031934	total: 314ms	remaining: 8.45s
12:	learn: 0.2992174	total: 342ms	remaining: 8.46s
13:	learn: 0.2941149	total: 372ms	remaining: 8.54s
14:	learn: 0.2914530	total: 397ms	remaining: 8.47s
15:	learn: 0.2886530	total: 423ms	remaining: 8.44s
16:	learn: 0.2866313	total: 447ms	remaining: 8.36s
17:	learn: 0.2847274	total: 472ms	remaining: 8.3s
18:	learn: 0.2831081	total: 508ms	remaining: 8.45s
19:	learn: 0.2821727	total: 532ms	remain

In [16]:
# Submission data
data_to_predict_prep = preprocessor.transform(data_to_predict)
data_to_predict_prep = imputer.transform(data_to_predict_prep)

# Predict probabilities for the submission data
y_pred_prob = stacking_model.predict_proba(data_to_predict_prep)[:, 1]

# Create a submission dataframe
submission = pd.DataFrame({
	'id': data_to_predict['id'],
	'loan_status': y_pred_prob
})
submission

Unnamed: 0,id,loan_status
0,58645,0.973510
1,58646,0.029088
2,58647,0.691275
3,58648,0.021320
4,58649,0.030864
...,...,...
39093,97738,0.043283
39094,97739,0.020252
39095,97740,0.021276
39096,97741,0.173647


In [17]:
submission.to_csv('../submissions/Stack_XGB_LGB_CatB_MLP_LogReg.csv', index=False)

In [18]:
import joblib

# Save the model
joblib.dump(stacking_model, '../models/Stack_XGB_LGB_CatB_MLP_LogReg.pkl')

['../models/Stack_XGB_LGB_CatB_MLP_LogReg.pkl']