# Imports

In [40]:
# General

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Hyperparameter tuning
import optuna

# Models
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# Data processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

# Data preparation

In [41]:
# Data input

try:
    data = pd.read_csv('../data/train.csv')
    data_to_predict = pd.read_csv('../data/test.csv')
    data_ccrisk = pd.read_csv('../data/credit_risk_dataset.csv')

except:
    data = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv')
    data_to_predict = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')
    data_ccrisk = pd.read_csv('/kaggle/input/loan-approval-prediction/credit_risk_dataset.csv')


data_ccrisk.dropna(inplace=True)
data_ccrisk.drop_duplicates(inplace=True)

X_old = data.drop(['loan_status'], axis=1)
y_old = data['loan_status']

# Merge the dataframes
data_no_id = data.drop(['id'], axis=1)
merged_data = pd.concat([data_no_id, data_ccrisk], ignore_index=True)

X = merged_data.drop(['loan_status'], axis=1)
y = merged_data['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle=True, test_size=0.2, random_state=2024)

In [42]:
from imblearn.over_sampling import SMOTENC

categorical_columns = X.select_dtypes(include=['object']).columns

sm = SMOTENC(sampling_strategy=0.7, categorical_features=[X_train.columns.get_loc(col) for col in categorical_columns], random_state=2024)

X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [43]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

# Data processing
categorical_columns = X.select_dtypes(include=['object']).columns

categorical_ordinal = ['loan_grade']
categorical_onehot = categorical_columns.drop(categorical_ordinal)

log_columns = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt']
log_columns = log_columns

numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_columns.drop(log_columns)

def log_transform():
	return FunctionTransformer(np.log1p, validate=True)

preprocessor = ColumnTransformer(
	transformers=[
		('ordinal', OrdinalEncoder(), categorical_ordinal),
		('onehot', OneHotEncoder(), categorical_onehot),
		('scaler', StandardScaler(), numerical_columns),
		('log', log_transform(), log_columns)
	], remainder='passthrough'
	)

preprocessor.fit(X_train_res)

X_train_prep = preprocessor.transform(X_train_res)
X_test_prep = preprocessor.transform(X_test)

# Models

## XGBoost

In [44]:
def xgb_objective(trial):
    # Hyperparameter search space
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 0.5),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 3.0),
        'eval_metric': 'auc',  # AUC is the metric
        'objective': 'binary:logistic'  # Use binary logistic, probability outputs
    }

    # Initialize the model
    model = xgb.XGBClassifier(**param)

    # Fit the model
    model.fit(X_train_prep, y_train_res,
              eval_set=[(X_test_prep, y_test)],  
              verbose=False)

    # Predict proba for validation set
    y_pred_prob = model.predict_proba(X_test_prep)[:, 1]

    auc = roc_auc_score(y_test, y_pred_prob)
 
    return auc

# Study object to run the optimization. I want to maximize AUC
xgb_study = optuna.create_study(direction='maximize')
xgb_study.optimize(xgb_objective, n_trials=50)

print(f"Best trial: {xgb_study.best_trial.params}")

[I 2024-10-12 17:00:42,630] A new study created in memory with name: no-name-2263b3b2-21cf-4d2b-b47c-4356feb80959
[I 2024-10-12 17:02:52,140] Trial 0 finished with value: 0.9507313241191518 and parameters: {'max_depth': 4, 'learning_rate': 0.24012624077728573, 'n_estimators': 121, 'subsample': 0.9481655671908854, 'colsample_bytree': 0.6383550771450843, 'gamma': 0.26860160585751036, 'lambda': 7.561187809487967, 'alpha': 7.164737627292461e-07, 'scale_pos_weight': 1.254144199791522}. Best is trial 0 with value: 0.9507313241191518.
[I 2024-10-12 17:18:20,806] Trial 1 finished with value: 0.9562505671326926 and parameters: {'max_depth': 9, 'learning_rate': 0.07011664926571008, 'n_estimators': 468, 'subsample': 0.8445727127932295, 'colsample_bytree': 0.5665594237930074, 'gamma': 0.23659380254993023, 'lambda': 0.07453919177780559, 'alpha': 1.5131321925272251e-06, 'scale_pos_weight': 2.695944515997298}. Best is trial 1 with value: 0.9562505671326926.
[I 2024-10-12 17:19:44,929] Trial 2 finishe

Best trial: {'max_depth': 6, 'learning_rate': 0.08218480696485447, 'n_estimators': 419, 'subsample': 0.9598787494275235, 'colsample_bytree': 0.7202079398862329, 'gamma': 0.20887266737138027, 'lambda': 5.603302128369934e-05, 'alpha': 1.7764720274807023e-05, 'scale_pos_weight': 1.0084836089316636}


In [45]:
xgb_best_params = xgb_study.best_trial.params
xgb_best_score = xgb_study.best_trial.value
xgb_best_score

0.9576538239093483

prev: 0.9576051262733373

## CatBoostClassifier

In [46]:
from catboost import CatBoostClassifier

def cat_objective(trial):

    # Define the hyperparameter search space

    param = {
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'iterations': trial.suggest_int('iterations', 200, 1000),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.0, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 3.0),
        'eval_metric': 'AUC',  
        'logging_level': 'Silent',  # Disable CatBoost output
        'task_type': 'CPU',  # Try using GPU
        'use_best_model': True
    }

    # Initialize the CatBoost model with the trial's parameters
    model = CatBoostClassifier(**param)

    # Fit the model
    model.fit(X_train_prep, y_train_res,
              eval_set=[(X_test_prep, y_test)],
              early_stopping_rounds=20,  # Early stopping to prevent overfitting
              verbose=False)

    # Get predictions and calculate AUC score
    y_pred_prob = model.predict_proba(X_test_prep)[:, 1]  # Probabilities for class 1

    auc = roc_auc_score(y_test, y_pred_prob)

    return auc
    
cat_study = optuna.create_study(direction='maximize')  # We want to maximize AUC
cat_study.optimize(cat_objective, n_trials=50)

# Output the best trial
print(f"Best trial: {cat_study.best_trial.params}")

[I 2024-10-12 17:56:15,040] A new study created in memory with name: no-name-c96b2107-4dff-4ecf-9ec1-bfef779afd5e
[I 2024-10-12 17:56:31,249] Trial 0 finished with value: 0.9460281483465679 and parameters: {'depth': 7, 'learning_rate': 0.04709665247569972, 'iterations': 469, 'l2_leaf_reg': 7.332825434593331e-06, 'border_count': 52, 'bagging_temperature': 0.5280668145234195, 'random_strength': 0.5146747469991674, 'scale_pos_weight': 2.51703942972807}. Best is trial 0 with value: 0.9460281483465679.
[I 2024-10-12 17:56:45,078] Trial 1 finished with value: 0.9502261853049596 and parameters: {'depth': 10, 'learning_rate': 0.15253812626222452, 'iterations': 834, 'l2_leaf_reg': 0.1392973180005424, 'border_count': 217, 'bagging_temperature': 0.8802359268625483, 'random_strength': 0.45298208313982746, 'scale_pos_weight': 2.8698008362912844}. Best is trial 1 with value: 0.9502261853049596.
[I 2024-10-12 17:56:55,805] Trial 2 finished with value: 0.9549404432529158 and parameters: {'depth': 8, '

Best trial: {'depth': 5, 'learning_rate': 0.1258808559129393, 'iterations': 778, 'l2_leaf_reg': 2.8587004971729453, 'border_count': 255, 'bagging_temperature': 0.11191509656237325, 'random_strength': 0.1702409451238981, 'scale_pos_weight': 1.0934074973940175}


In [47]:
cat_best_params = cat_study.best_trial.params
cat_best_score = cat_study.best_trial.value
cat_best_score

0.959752593749994

prev: 0.9601213839193278

## LightGBM

In [48]:
# Objective function for LightGBM

def lgb_objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
    }

    lgb_model = lgb.LGBMClassifier(**param)
  

    # Train the model
    lgb_model.fit(X_train_prep, y_train_res,
                  eval_set=[(X_test_prep, y_test)],
                  eval_metric='auc'
                  )

    # Predict and evaluate AUC
    y_pred_prob = lgb_model.predict_proba(X_test_prep)[:, 1]
    auc = roc_auc_score(y_test, y_pred_prob)

    return auc

# Optimize the objective function
lgb_study = optuna.create_study(direction='maximize')
lgb_study.optimize(lgb_objective, n_trials=50)

# Best parameters and AUC
print("Best LGBM Params: ", lgb_study.best_trial.params)
print("Best AUC for LGBM: ", lgb_study.best_value)

[I 2024-10-12 18:06:56,281] A new study created in memory with name: no-name-e8022a83-b95d-435a-877e-aa60d3ee442c


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009097 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:06:59,891] Trial 0 finished with value: 0.9589032681429892 and parameters: {'learning_rate': 0.23234812867231913, 'num_leaves': 229, 'max_depth': 4, 'min_child_samples': 78, 'min_child_weight': 2.0800834013858247, 'subsample': 0.9196533275862899, 'colsample_bytree': 0.5949543801059094, 'n_estimators': 401}. Best is trial 0 with value: 0.9589032681429892.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005646 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:07:03,351] Trial 1 finished with value: 0.9550560956984677 and parameters: {'learning_rate': 0.19318835418733074, 'num_leaves': 203, 'max_depth': 15, 'min_child_samples': 71, 'min_child_weight': 2.0758087890701855, 'subsample': 0.659930960298156, 'colsample_bytree': 0.9459028602251096, 'n_estimators': 170}. Best is trial 0 with value: 0.9589032681429892.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014110 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:07:10,856] Trial 2 finished with value: 0.9593675591953924 and parameters: {'learning_rate': 0.07056726325200254, 'num_leaves': 63, 'max_depth': 4, 'min_child_samples': 32, 'min_child_weight': 4.620821694957613, 'subsample': 0.599420341828908, 'colsample_bytree': 0.5455972952002038, 'n_estimators': 987}. Best is trial 2 with value: 0.9593675591953924.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:07:20,413] Trial 3 finished with value: 0.9517347345875831 and parameters: {'learning_rate': 0.24508729912304805, 'num_leaves': 242, 'max_depth': 9, 'min_child_samples': 90, 'min_child_weight': 0.3494955090814854, 'subsample': 0.9991742808783597, 'colsample_bytree': 0.8015538975971573, 'n_estimators': 665}. Best is trial 2 with value: 0.9593675591953924.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012625 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:07:33,563] Trial 4 finished with value: 0.960004452761233 and parameters: {'learning_rate': 0.029039945589354473, 'num_leaves': 100, 'max_depth': 15, 'min_child_samples': 10, 'min_child_weight': 3.352476848486978, 'subsample': 0.543215891847535, 'colsample_bytree': 0.5639949088196425, 'n_estimators': 930}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007890 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:07:47,160] Trial 5 finished with value: 0.9502039972743531 and parameters: {'learning_rate': 0.28398960446820365, 'num_leaves': 129, 'max_depth': 13, 'min_child_samples': 10, 'min_child_weight': 5.019864172126846, 'subsample': 0.6177947957581656, 'colsample_bytree': 0.9427601640413856, 'n_estimators': 842}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:07:57,166] Trial 6 finished with value: 0.959057021486349 and parameters: {'learning_rate': 0.06526478835317741, 'num_leaves': 255, 'max_depth': 8, 'min_child_samples': 36, 'min_child_weight': 9.321163541284633, 'subsample': 0.6457112529915178, 'colsample_bytree': 0.5588311581802002, 'n_estimators': 907}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007450 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:08:04,357] Trial 7 finished with value: 0.9558713459841889 and parameters: {'learning_rate': 0.2209471398453273, 'num_leaves': 22, 'max_depth': 15, 'min_child_samples': 99, 'min_child_weight': 4.500274366843375, 'subsample': 0.63254676764259, 'colsample_bytree': 0.8109095899827348, 'n_estimators': 846}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013724 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:08:11,041] Trial 8 finished with value: 0.9546911416667629 and parameters: {'learning_rate': 0.1996160702723635, 'num_leaves': 47, 'max_depth': 11, 'min_child_samples': 75, 'min_child_weight': 1.2497060663991812, 'subsample': 0.880088439407995, 'colsample_bytree': 0.5794253076502638, 'n_estimators': 695}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008483 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:08:16,937] Trial 9 finished with value: 0.9552207180984427 and parameters: {'learning_rate': 0.1602015858647037, 'num_leaves': 166, 'max_depth': 10, 'min_child_samples': 58, 'min_child_weight': 4.163421189678562, 'subsample': 0.685266362166564, 'colsample_bytree': 0.8790678408385044, 'n_estimators': 403}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011244 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:08:18,513] Trial 10 finished with value: 0.9320988129379946 and parameters: {'learning_rate': 0.014262933365462083, 'num_leaves': 106, 'max_depth': 7, 'min_child_samples': 5, 'min_child_weight': 7.272633471428765, 'subsample': 0.5113979262426397, 'colsample_bytree': 0.7069369496515726, 'n_estimators': 103}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006323 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:08:24,871] Trial 11 finished with value: 0.9577676766853952 and parameters: {'learning_rate': 0.09677338487494189, 'num_leaves': 78, 'max_depth': 3, 'min_child_samples': 28, 'min_child_weight': 6.101002695180752, 'subsample': 0.5098218767544003, 'colsample_bytree': 0.5073271106504815, 'n_estimators': 956}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013877 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:08:35,796] Trial 12 finished with value: 0.9500619891424992 and parameters: {'learning_rate': 0.012018756263569758, 'num_leaves': 79, 'max_depth': 6, 'min_child_samples': 28, 'min_child_weight': 3.7305564252936216, 'subsample': 0.7945997436880224, 'colsample_bytree': 0.6825143589860939, 'n_estimators': 989}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011848 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:08:42,212] Trial 13 finished with value: 0.9592266166573242 and parameters: {'learning_rate': 0.08635701499409174, 'num_leaves': 149, 'max_depth': 5, 'min_child_samples': 44, 'min_child_weight': 3.007842137746012, 'subsample': 0.5535712290497365, 'colsample_bytree': 0.6494532340710436, 'n_estimators': 730}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004618 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:08:51,678] Trial 14 finished with value: 0.9588823825047876 and parameters: {'learning_rate': 0.049271305502880905, 'num_leaves': 293, 'max_depth': 12, 'min_child_samples': 20, 'min_child_weight': 6.312217610447304, 'subsample': 0.7495530914668578, 'colsample_bytree': 0.5199976645805067, 'n_estimators': 515}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007705 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:09:02,313] Trial 15 finished with value: 0.9557889163848922 and parameters: {'learning_rate': 0.12102989259062491, 'num_leaves': 80, 'max_depth': 13, 'min_child_samples': 17, 'min_child_weight': 8.01802106349373, 'subsample': 0.5788846274307351, 'colsample_bytree': 0.6339128027466915, 'n_estimators': 800}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:09:09,479] Trial 16 finished with value: 0.9593751130713405 and parameters: {'learning_rate': 0.1367781122252107, 'num_leaves': 30, 'max_depth': 3, 'min_child_samples': 50, 'min_child_weight': 3.25174157845916, 'subsample': 0.7430428087426217, 'colsample_bytree': 0.7461099221539669, 'n_estimators': 996}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011449 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:09:14,505] Trial 17 finished with value: 0.959425977414716 and parameters: {'learning_rate': 0.1261131604760684, 'num_leaves': 26, 'max_depth': 8, 'min_child_samples': 53, 'min_child_weight': 2.9448725693123032, 'subsample': 0.7374616772590555, 'colsample_bytree': 0.7569434344880281, 'n_estimators': 574}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011276 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:09:22,078] Trial 18 finished with value: 0.9551327237315977 and parameters: {'learning_rate': 0.15809232479713017, 'num_leaves': 119, 'max_depth': 9, 'min_child_samples': 61, 'min_child_weight': 1.9635705231421405, 'subsample': 0.8232484907386753, 'colsample_bytree': 0.8141634863483572, 'n_estimators': 569}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014325 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:09:25,959] Trial 19 finished with value: 0.952996338430313 and parameters: {'learning_rate': 0.042273096013540595, 'num_leaves': 170, 'max_depth': 7, 'min_child_samples': 43, 'min_child_weight': 0.048907694094502574, 'subsample': 0.6974241335160758, 'colsample_bytree': 0.7495242930740917, 'n_estimators': 297}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011772 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:09:33,759] Trial 20 finished with value: 0.9556526387796193 and parameters: {'learning_rate': 0.11250467528744845, 'num_leaves': 101, 'max_depth': 11, 'min_child_samples': 58, 'min_child_weight': 2.9896879766808295, 'subsample': 0.865942430587908, 'colsample_bytree': 0.862897728791028, 'n_estimators': 583}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012701 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:09:39,988] Trial 21 finished with value: 0.9593812579955053 and parameters: {'learning_rate': 0.12907162629708288, 'num_leaves': 41, 'max_depth': 3, 'min_child_samples': 45, 'min_child_weight': 2.907179544806293, 'subsample': 0.7450935660003225, 'colsample_bytree': 0.7416360100688547, 'n_estimators': 891}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005117 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:09:47,130] Trial 22 finished with value: 0.9576642904083108 and parameters: {'learning_rate': 0.17929515962313927, 'num_leaves': 47, 'max_depth': 6, 'min_child_samples': 64, 'min_child_weight': 5.464017496672913, 'subsample': 0.716550751575049, 'colsample_bytree': 0.6720562743480939, 'n_estimators': 775}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011345 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:09:53,788] Trial 23 finished with value: 0.9586837558231148 and parameters: {'learning_rate': 0.13596399098854522, 'num_leaves': 20, 'max_depth': 14, 'min_child_samples': 47, 'min_child_weight': 1.189368661621211, 'subsample': 0.7926572648363275, 'colsample_bytree': 0.7148036875246988, 'n_estimators': 879}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008903 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:10:01,626] Trial 24 finished with value: 0.9588988400088128 and parameters: {'learning_rate': 0.03750126806687064, 'num_leaves': 59, 'max_depth': 8, 'min_child_samples': 38, 'min_child_weight': 2.698566177684502, 'subsample': 0.801576530747866, 'colsample_bytree': 0.8797493472921085, 'n_estimators': 639}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007021 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:10:07,635] Trial 25 finished with value: 0.9576317069183273 and parameters: {'learning_rate': 0.09948324020599103, 'num_leaves': 91, 'max_depth': 10, 'min_child_samples': 23, 'min_child_weight': 3.857311295384111, 'subsample': 0.9669383463106058, 'colsample_bytree': 0.6188972365110803, 'n_estimators': 479}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004831 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:10:14,368] Trial 26 finished with value: 0.9593424348619094 and parameters: {'learning_rate': 0.1368212518567294, 'num_leaves': 42, 'max_depth': 5, 'min_child_samples': 68, 'min_child_weight': 1.2793918375472813, 'subsample': 0.8543167323926165, 'colsample_bytree': 0.7900264630027045, 'n_estimators': 751}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013661 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:10:29,129] Trial 27 finished with value: 0.9553371519794376 and parameters: {'learning_rate': 0.07455606350070967, 'num_leaves': 134, 'max_depth': 12, 'min_child_samples': 84, 'min_child_weight': 2.40069152775385, 'subsample': 0.5456245851167668, 'colsample_bytree': 0.990159410548606, 'n_estimators': 916}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008795 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:10:32,130] Trial 28 finished with value: 0.956799241373 and parameters: {'learning_rate': 0.26671377809713825, 'num_leaves': 65, 'max_depth': 8, 'min_child_samples': 55, 'min_child_weight': 3.5077829739940567, 'subsample': 0.729548026033567, 'colsample_bytree': 0.7730899494502137, 'n_estimators': 259}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008554 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:10:35,249] Trial 29 finished with value: 0.9580358984812116 and parameters: {'learning_rate': 0.17812007579880523, 'num_leaves': 41, 'max_depth': 4, 'min_child_samples': 11, 'min_child_weight': 1.7630595479450715, 'subsample': 0.9168314355130627, 'colsample_bytree': 0.5988927298222393, 'n_estimators': 404}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011830 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:10:42,375] Trial 30 finished with value: 0.9594624207221902 and parameters: {'learning_rate': 0.11068332293038773, 'num_leaves': 111, 'max_depth': 5, 'min_child_samples': 52, 'min_child_weight': 5.57534571648271, 'subsample': 0.7698418729430712, 'colsample_bytree': 0.8348785413084526, 'n_estimators': 819}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009830 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:10:49,517] Trial 31 finished with value: 0.9592723187908039 and parameters: {'learning_rate': 0.11931401381984186, 'num_leaves': 188, 'max_depth': 5, 'min_child_samples': 52, 'min_child_weight': 5.526613403916363, 'subsample': 0.7878845193027162, 'colsample_bytree': 0.8429967160895773, 'n_estimators': 829}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011089 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:10:55,729] Trial 32 finished with value: 0.9589521196981027 and parameters: {'learning_rate': 0.14479140087806464, 'num_leaves': 110, 'max_depth': 3, 'min_child_samples': 67, 'min_child_weight': 6.897079413305577, 'subsample': 0.6807833949203804, 'colsample_bytree': 0.9059195278790655, 'n_estimators': 908}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010261 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:11:01,304] Trial 33 finished with value: 0.9598140666715036 and parameters: {'learning_rate': 0.10603705231215152, 'num_leaves': 95, 'max_depth': 4, 'min_child_samples': 39, 'min_child_weight': 4.741149309293597, 'subsample': 0.7674128941609336, 'colsample_bytree': 0.7244336551232209, 'n_estimators': 712}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013193 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:11:06,162] Trial 34 finished with value: 0.958636277699992 and parameters: {'learning_rate': 0.09791683270798998, 'num_leaves': 145, 'max_depth': 4, 'min_child_samples': 40, 'min_child_weight': 4.693275879602739, 'subsample': 0.8350040213374169, 'colsample_bytree': 0.8346241475201156, 'n_estimators': 619}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014342 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:11:14,002] Trial 35 finished with value: 0.9597659491921094 and parameters: {'learning_rate': 0.05906857340943509, 'num_leaves': 200, 'max_depth': 7, 'min_child_samples': 30, 'min_child_weight': 5.152329523939586, 'subsample': 0.7684078505334595, 'colsample_bytree': 0.7780387665050555, 'n_estimators': 705}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:11:21,812] Trial 36 finished with value: 0.9596863611762411 and parameters: {'learning_rate': 0.05774568128906863, 'num_leaves': 208, 'max_depth': 6, 'min_child_samples': 34, 'min_child_weight': 5.24734726357824, 'subsample': 0.7702715900419046, 'colsample_bytree': 0.9164946352258273, 'n_estimators': 706}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012694 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:11:29,444] Trial 37 finished with value: 0.9565476310003114 and parameters: {'learning_rate': 0.02967059254354905, 'num_leaves': 228, 'max_depth': 6, 'min_child_samples': 31, 'min_child_weight': 4.818785287802829, 'subsample': 0.82640951710573, 'colsample_bytree': 0.9267113093486783, 'n_estimators': 711}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005933 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:11:37,176] Trial 38 finished with value: 0.9591030314580336 and parameters: {'learning_rate': 0.05495080236622797, 'num_leaves': 206, 'max_depth': 7, 'min_child_samples': 16, 'min_child_weight': 4.149076856835634, 'subsample': 0.6590172464190245, 'colsample_bytree': 0.9767139535208998, 'n_estimators': 677}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010788 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:11:45,847] Trial 39 finished with value: 0.9579903147470413 and parameters: {'learning_rate': 0.06832930750614924, 'num_leaves': 218, 'max_depth': 15, 'min_child_samples': 34, 'min_child_weight': 6.20447349672138, 'subsample': 0.8985062312689445, 'colsample_bytree': 0.5438138694836845, 'n_estimators': 474}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:11:52,355] Trial 40 finished with value: 0.9562093641729752 and parameters: {'learning_rate': 0.0323392333675447, 'num_leaves': 254, 'max_depth': 6, 'min_child_samples': 23, 'min_child_weight': 7.517411561924601, 'subsample': 0.7681274141031995, 'colsample_bytree': 0.712580911404715, 'n_estimators': 631}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004824 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:11:59,130] Trial 41 finished with value: 0.9593386460840043 and parameters: {'learning_rate': 0.08389682216302928, 'num_leaves': 186, 'max_depth': 5, 'min_child_samples': 38, 'min_child_weight': 5.326878478736334, 'subsample': 0.7083749901160221, 'colsample_bytree': 0.9107225516626429, 'n_estimators': 789}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006986 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:12:06,036] Trial 42 finished with value: 0.956412501868341 and parameters: {'learning_rate': 0.056893163383840636, 'num_leaves': 182, 'max_depth': 4, 'min_child_samples': 29, 'min_child_weight': 5.780577177369075, 'subsample': 0.7690398768694778, 'colsample_bytree': 0.963097399065286, 'n_estimators': 831}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:12:14,998] Trial 43 finished with value: 0.9567558361861253 and parameters: {'learning_rate': 0.023425382825388794, 'num_leaves': 122, 'max_depth': 7, 'min_child_samples': 34, 'min_child_weight': 5.091179576439547, 'subsample': 0.7769081746810821, 'colsample_bytree': 0.8247916665593079, 'n_estimators': 743}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008833 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:12:23,173] Trial 44 finished with value: 0.9599875926995555 and parameters: {'learning_rate': 0.07947600526682223, 'num_leaves': 152, 'max_depth': 5, 'min_child_samples': 6, 'min_child_weight': 4.041939710557623, 'subsample': 0.6244914684982146, 'colsample_bytree': 0.8581862069674732, 'n_estimators': 947}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013776 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:12:32,368] Trial 45 finished with value: 0.9590669433484877 and parameters: {'learning_rate': 0.08026523932843439, 'num_leaves': 199, 'max_depth': 6, 'min_child_samples': 8, 'min_child_weight': 4.321361985105883, 'subsample': 0.6249472270497439, 'colsample_bytree': 0.8836823153253164, 'n_estimators': 940}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013259 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:12:39,390] Trial 46 finished with value: 0.9577636274290091 and parameters: {'learning_rate': 0.058408568381814165, 'num_leaves': 155, 'max_depth': 4, 'min_child_samples': 14, 'min_child_weight': 6.557820854011388, 'subsample': 0.6000364421651049, 'colsample_bytree': 0.7910750009721579, 'n_estimators': 860}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010795 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:12:49,648] Trial 47 finished with value: 0.9583224248102816 and parameters: {'learning_rate': 0.04583375706036662, 'num_leaves': 236, 'max_depth': 9, 'min_child_samples': 24, 'min_child_weight': 3.966651577337453, 'subsample': 0.5959941065292425, 'colsample_bytree': 0.8578925802240225, 'n_estimators': 703}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008449 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:13:01,088] Trial 48 finished with value: 0.95617440085687 and parameters: {'learning_rate': 0.019520666648972902, 'num_leaves': 166, 'max_depth': 7, 'min_child_samples': 5, 'min_child_weight': 8.705872450362914, 'subsample': 0.5296797744855617, 'colsample_bytree': 0.9486807177027433, 'n_estimators': 957}. Best is trial 4 with value: 0.960004452761233.


[LightGBM] [Info] Number of positive: 40660, number of negative: 58086
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009940 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 98746, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411764 -> initscore=-0.356680
[LightGBM] [Info] Start training from score -0.356680


[I 2024-10-12 18:13:07,721] Trial 49 finished with value: 0.9594052812154096 and parameters: {'learning_rate': 0.06687904433386209, 'num_leaves': 281, 'max_depth': 5, 'min_child_samples': 9, 'min_child_weight': 3.446901037706915, 'subsample': 0.5611868602191304, 'colsample_bytree': 0.6728814664662276, 'n_estimators': 770}. Best is trial 4 with value: 0.960004452761233.


Best LGBM Params:  {'learning_rate': 0.029039945589354473, 'num_leaves': 100, 'max_depth': 15, 'min_child_samples': 10, 'min_child_weight': 3.352476848486978, 'subsample': 0.543215891847535, 'colsample_bytree': 0.5639949088196425, 'n_estimators': 930}
Best AUC for LGBM:  0.960004452761233


In [49]:
lgb_best_params = lgb_study.best_trial.params
lgb_best_score = lgb_study.best_value
lgb_best_score

0.960004452761233

prev: 0.9607718460460408

## ExtraTreesClassifier

In [50]:
from sklearn.ensemble import ExtraTreesClassifier

def extratrees_objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0)
    }

    model = ExtraTreesClassifier(**param)
    score = cross_val_score(model, X_train_prep, y_train_res, cv=3, scoring='roc_auc').mean()

    return score

extratrees_study = optuna.create_study(direction='maximize')
extratrees_study.optimize(extratrees_objective, n_trials=50, timeout=600)

print('Best parameters for ExtraTrees:', extratrees_study.best_params)

[I 2024-10-12 18:13:07,768] A new study created in memory with name: no-name-6fce1c72-effa-4ab0-bc51-6412cbfe2871
[I 2024-10-12 18:14:59,487] Trial 0 finished with value: 0.952951338258325 and parameters: {'n_estimators': 446, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': 0.6745876059339685}. Best is trial 0 with value: 0.952951338258325.
[I 2024-10-12 18:15:46,443] Trial 1 finished with value: 0.9516451245517682 and parameters: {'n_estimators': 230, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 0.49752693933422076}. Best is trial 0 with value: 0.952951338258325.
[I 2024-10-12 18:16:22,418] Trial 2 finished with value: 0.9559531449377087 and parameters: {'n_estimators': 226, 'max_depth': 18, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': 0.23802354942403522}. Best is trial 2 with value: 0.9559531449377087.
[I 2024-10-12 18:17:06,143] Trial 3 finished with value: 0.9253577820296481 and parameters: {'n_est

Best parameters for ExtraTrees: {'n_estimators': 311, 'max_depth': 18, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 0.45302441774199387}


In [51]:
extratrees_best_params = extratrees_study.best_params
extratrees_best_score = extratrees_study.best_value
extratrees_best_score

0.967021916728157

Prev: 0.9298212453316017

## LogisticRegression

In [52]:
from sklearn.linear_model import LogisticRegression

def logreg_objective(trial):
    
    param = {
        'penalty': 'elasticnet',
        'solver' : 'saga',
        'C': trial.suggest_float('C', 1e-4, 1e2, log=True),
        'l1_ratio': trial.suggest_float('l1_ratio', 0, 1)
    }

    model = LogisticRegression(**param, max_iter=1000)
    score = cross_val_score(model, X_train_prep, y_train_res, cv=3, scoring='roc_auc').mean()
    return score

logreg_study = optuna.create_study(direction='maximize')
logreg_study.optimize(logreg_objective, n_trials=50, timeout=600)
print('Best parameters for Logistic Regression:', logreg_study.best_params)

[I 2024-10-12 18:23:28,714] A new study created in memory with name: no-name-f3b4f0a8-baab-4388-913f-58739ece9ee9
[I 2024-10-12 18:23:57,568] Trial 0 finished with value: 0.910028604105741 and parameters: {'C': 7.087962306346659, 'l1_ratio': 0.913463574135948}. Best is trial 0 with value: 0.910028604105741.
[I 2024-10-12 18:25:08,943] Trial 1 finished with value: 0.9100785579241907 and parameters: {'C': 0.11052673822097674, 'l1_ratio': 0.890464449999595}. Best is trial 1 with value: 0.9100785579241907.
[I 2024-10-12 18:25:53,636] Trial 2 finished with value: 0.9099666096774112 and parameters: {'C': 0.03573478732081317, 'l1_ratio': 0.8524898855620824}. Best is trial 1 with value: 0.9100785579241907.
[I 2024-10-12 18:26:11,392] Trial 3 finished with value: 0.9100244696225003 and parameters: {'C': 90.9695948736603, 'l1_ratio': 0.5012938506283703}. Best is trial 1 with value: 0.9100785579241907.
[I 2024-10-12 18:26:29,320] Trial 4 finished with value: 0.9100260129200578 and parameters: {'C

Best parameters for Logistic Regression: {'C': 0.11052673822097674, 'l1_ratio': 0.890464449999595}


In [61]:
logreg_best_params = logreg_study.best_params
logreg_best_params['penalty'] = 'elasticnet'
logreg_best_params['solver'] = 'saga'
logreg_best_score = logreg_study.best_value
logreg_best_score

0.9100785579241907

prev: 0.8814172446287084

## HistGradientBoostingClassifier

In [54]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

def histgb_objective(trial):
    param = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_iter': trial.suggest_int('max_iter', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 50),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 50),
        'l2_regularization': trial.suggest_float('l2_regularization', 1e-5, 1.0, log=True)
    }

    model = HistGradientBoostingClassifier(**param)
    score = cross_val_score(model, X_train_prep, y_train_res, cv=3, scoring='roc_auc').mean()

    return score

histgb_study = optuna.create_study(direction='maximize')
histgb_study.optimize(histgb_objective, n_trials=50, timeout=600)
print('Best parameters for HistGradientBoosting:', histgb_study.best_params)

[I 2024-10-12 18:34:52,309] A new study created in memory with name: no-name-8392c466-4a35-4db9-a3fc-4540bb8fb7fd
[I 2024-10-12 18:35:24,857] Trial 0 finished with value: 0.9747227654330096 and parameters: {'learning_rate': 0.021227276215547236, 'max_iter': 277, 'max_depth': 11, 'min_samples_leaf': 10, 'max_leaf_nodes': 50, 'l2_regularization': 0.17349418473479641}. Best is trial 0 with value: 0.9747227654330096.
[I 2024-10-12 18:35:54,083] Trial 1 finished with value: 0.9761508208268715 and parameters: {'learning_rate': 0.02340867799063854, 'max_iter': 442, 'max_depth': 6, 'min_samples_leaf': 29, 'max_leaf_nodes': 32, 'l2_regularization': 0.018644547199788253}. Best is trial 1 with value: 0.9761508208268715.
[I 2024-10-12 18:36:15,741] Trial 2 finished with value: 0.9795112689516552 and parameters: {'learning_rate': 0.055979485739357834, 'max_iter': 325, 'max_depth': 17, 'min_samples_leaf': 28, 'max_leaf_nodes': 27, 'l2_regularization': 2.841337632701343e-05}. Best is trial 2 with val

Best parameters for HistGradientBoosting: {'learning_rate': 0.0973302436339639, 'max_iter': 350, 'max_depth': 20, 'min_samples_leaf': 21, 'max_leaf_nodes': 40, 'l2_regularization': 2.024705049363233e-05}


In [55]:
histgb_best_params = histgb_study.best_trial.params
histgb_best_score = histgb_study.best_trial.value
histgb_best_score

0.9807639747945546

Prev: 0.9519987809872763

## KNN

In [56]:
from sklearn.neighbors import KNeighborsClassifier

def knn_objective(trial):
    param = {
        'n_neighbors': trial.suggest_int('n_neighbors', 3, 15),
        'leaf_size': trial.suggest_int('leaf_size', 20, 50),
        'p': trial.suggest_categorical('p', [1, 2]),  # Minkowski distance parameter
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance'])
    }

    model = KNeighborsClassifier(**param)
    score = cross_val_score(model, X_train_prep, y_train_res, cv=3, scoring='roc_auc').mean()

    return score

knn_study = optuna.create_study(direction='maximize')
knn_study.optimize(knn_objective, n_trials=50, timeout=600)
print('Best parameters for KNN:', knn_study.best_params)
print('Best score for KNN:', knn_study.best_value)

[I 2024-10-12 18:44:53,727] A new study created in memory with name: no-name-07cd8edb-4def-4ed4-bb2d-c9e54ffb47aa
[I 2024-10-12 18:46:21,249] Trial 0 finished with value: 0.9512896923582271 and parameters: {'n_neighbors': 12, 'leaf_size': 46, 'p': 1, 'weights': 'uniform'}. Best is trial 0 with value: 0.9512896923582271.
[I 2024-10-12 18:46:48,600] Trial 1 finished with value: 0.9439928737368423 and parameters: {'n_neighbors': 6, 'leaf_size': 27, 'p': 2, 'weights': 'distance'}. Best is trial 0 with value: 0.9512896923582271.
[I 2024-10-12 18:47:15,978] Trial 2 finished with value: 0.9513008939884001 and parameters: {'n_neighbors': 15, 'leaf_size': 45, 'p': 2, 'weights': 'distance'}. Best is trial 2 with value: 0.9513008939884001.
[I 2024-10-12 18:48:43,062] Trial 3 finished with value: 0.9347276531193683 and parameters: {'n_neighbors': 3, 'leaf_size': 31, 'p': 1, 'weights': 'uniform'}. Best is trial 2 with value: 0.9513008939884001.
[I 2024-10-12 18:50:10,639] Trial 4 finished with valu

Best parameters for KNN: {'n_neighbors': 10, 'leaf_size': 34, 'p': 1, 'weights': 'distance'}
Best score for KNN: 0.9541333615154279


In [57]:
knn_best_params = knn_study.best_trial.params
knn_best_score = knn_study.best_trial.value
knn_best_score

0.9541333615154279

Prev: 0.9003336325395043

# Stacking

In [63]:
from sklearn.ensemble import StackingClassifier

# Init base models

xgb_model = xgb.XGBClassifier(**xgb_best_params)
cat_model = CatBoostClassifier(**cat_best_params)
lgb_model = lgb.LGBMClassifier(**lgb_best_params)
extratrees_model = ExtraTreesClassifier(**extratrees_best_params)
logreg_model = LogisticRegression(**logreg_best_params)
histgb_model = HistGradientBoostingClassifier(**histgb_best_params)
knn_model = KNeighborsClassifier(**knn_best_params)

# Base and meta models
base_estimators = [
	('xgb', xgb_model),
	('cat', cat_model),
	('lgb', lgb_model),
	('extratrees', extratrees_model),
	('logreg', logreg_model),
	('histgb', histgb_model),
	('knn', knn_model)
]

meta_model = LogisticRegression()

# Stack

stack_model = StackingClassifier(estimators=base_estimators, final_estimator=meta_model, cv=5, n_jobs=-1)

stack_model.fit(X_train_prep, y_train_res)

In [64]:
print('Stacking Classifier ROC AUC:', roc_auc_score(y_test, stack_model.predict_proba(X_test_prep)[:, 1]))

Stacking Classifier ROC AUC: 0.9584046412908215


Prev: 0.9614965682196932

In [None]:
# Choose different base models

base_estimators = [
	('xgb', xgb_model),
	('cat', cat_model),
	('lgb', lgb_model),
	('histgb', histgb_model),
	('knn', knn_model)

]

meta_model = lgb.LGBMClassifier()

# Stack
stack_model_2 = StackingClassifier(estimators=base_estimators, final_estimator=meta_model, cv=3, n_jobs=-1)

stack_model_2.fit(X_train_prep, y_train_res)

print('Stacking Classifier ROC AUC:', roc_auc_score(y_test, stack_model_2.predict_proba(X_test_prep)[:, 1]))

# Submission

In [None]:
smote = SMOTENC(sampling_strategy=0.3, categorical_features=[X.columns.get_loc(col) for col in categorical_columns], random_state=2024)
X_res, y_res = smote.fit_resample(X, y)

X_train_all = preprocessor.transform(X_res)

stack_model.fit(X_train_all, y_res)

In [23]:
import joblib

try:
    joblib.dump(stack_model_2, '../models/stack_model_07_smote.pkl')
except:
    joblib.dump(stack_model_2, 'stack_model_07_smote.pkl')
    
X_to_pred = preprocessor.transform(data_to_predict)
y_pred_submit = stack_model_2.predict_proba(X_to_pred)[:, 1]

submission = pd.DataFrame({'id': data_to_predict['id'], 'loan_status': y_pred_submit})
try:
    submission.to_csv('../submissions/stack_model_v2.csv', index=False)
except:
    submission.to_csv('stack_model_v2.csv', index=False)