In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("./bank_additional_full_clean_normalized.csv")

In [4]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,1.642226,0,2,1,0,0,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
1,-0.196449,1,2,4,0,1,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
2,0.093868,2,2,2,0,0,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
3,1.642226,1,2,4,0,0,1,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
4,1.932543,2,2,5,0,0,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0


In [5]:
df.shape

(30488, 21)

In [6]:
df0 = df[df['y']==0]

In [7]:
df1 = df[df['y']==1]

In [10]:
from sklearn.utils import shuffle

In [11]:
shuffle(df0, random_state=101).head()
# random shuffle class 0 data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
24658,0.093868,3,2,1,0,1,0,1,5,4,...,-0.559326,0.211884,-0.37161,1,-1.07333,-1.076766,-1.168813,-1.23447,-0.821115,0
2034,-0.099677,3,0,1,0,1,0,0,5,4,...,-0.191699,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.787777,0.401641,0
16015,0.093868,4,2,5,0,0,0,1,8,5,...,-0.559326,0.211884,-0.37161,1,0.913755,-0.135488,0.940077,0.845733,0.895268,0
8954,-0.680311,2,0,6,0,0,0,1,7,5,...,-0.191699,0.211884,-0.37161,1,0.913755,0.67425,-0.43801,0.845733,0.895268,0
13622,0.868047,3,2,1,0,0,0,1,8,2,...,1.646434,0.211884,-0.37161,1,0.913755,-0.135488,0.940077,0.847421,0.895268,0


In [12]:
df0_rs = shuffle(df0, random_state=101).iloc[0:3859]
# pick 10000 class 0 data

In [13]:
df_merg = pd.concat([df1, df0_rs])
# mix 10000 class 0 and all class 1 data

In [14]:
df_merg = shuffle(df_merg, random_state=101)

In [15]:
df_merg.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
19728,-0.002904,8,2,5,0,0,1,1,4,2,...,-0.559326,0.211884,-0.37161,1,-1.07333,-0.765854,-1.356734,-1.12925,-0.821115,0
29808,0.19064,4,1,6,0,1,0,1,8,5,...,-0.191699,-4.719257,5.366854,2,-1.011233,0.860456,0.480715,-1.437595,-2.251434,1
8033,-0.970628,1,0,4,0,1,0,1,7,1,...,0.543554,0.211884,-0.37161,1,0.913755,0.67425,-0.43801,0.844045,0.895268,0
28085,2.416405,2,2,4,0,1,0,1,10,5,...,-0.559326,0.211884,1.541211,0,-2.066872,-1.866005,2.861046,-1.536062,-1.906827,1
4882,-0.873856,3,0,2,0,1,0,0,5,5,...,0.175927,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.790028,0.401641,0


In [16]:
len(df_merg)

7718

In [17]:
X = df.iloc[:, 0:-1]
y = df.y

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
categorical_columns_subset = [
    'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome'
]

numerical_columns_subset = [
    'duration', 'campaign', 'pdays',
    'previous', 'emp.var.rate', 'cons.price.idx', 
    'cons.conf.idx', 'euribor3m', 'nr.employed'
]

X = X[categorical_columns_subset + numerical_columns_subset]
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")

n_categorical_features = X.select_dtypes(include="category").shape[1]
n_numerical_features = X.select_dtypes(include="number").shape[1]

print(f"Number of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")
print(f"Number of categorical features: {n_categorical_features}")
print(f"Number of numerical features: {n_numerical_features}")

Number of samples: 30488
Number of features: 19
Number of categorical features: 10
Number of numerical features: 9


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [22]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import make_pipeline

one_hot_encoder = make_column_transformer(
    (
        OneHotEncoder(sparse=False, handle_unknown="ignore"),
        make_column_selector(dtype_include="category"),
    ),
    remainder="passthrough",
)


In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
clf_lr1 = make_pipeline(one_hot_encoder,
                        LogisticRegression(C=1.0, max_iter=600)).fit(X_train, y_train)

In [25]:
clf_lr2 = make_pipeline(one_hot_encoder,
                        LogisticRegression(C=3.0, max_iter=600)).fit(X_train, y_train)

In [26]:
clf_lr3 = make_pipeline(one_hot_encoder,
                        LogisticRegression(C=10.0, max_iter=600)).fit(X_train, y_train)

In [27]:
clf_lr4 = make_pipeline(one_hot_encoder,
                        LogisticRegression(C=30.0, max_iter=600)).fit(X_train, y_train)

In [28]:
clf_lr5 = make_pipeline(one_hot_encoder,
                        LogisticRegression(C=100.0, max_iter=600)).fit(X_train, y_train)

In [29]:
clf_lr6 = make_pipeline(one_hot_encoder,
                        LogisticRegression(C=300.0, max_iter=600)).fit(X_train, y_train)

In [30]:
clf_lr7 = make_pipeline(one_hot_encoder,
                        LogisticRegression(C=1000.0, max_iter=600)).fit(X_train, y_train)

In [31]:
clf_lr8 = make_pipeline(one_hot_encoder,
                        LogisticRegression(C=3000.0, max_iter=600)).fit(X_train, y_train)

In [32]:
lr_lst = [clf_lr1, clf_lr2, clf_lr3,clf_lr4,clf_lr5, clf_lr6, clf_lr7, clf_lr8]
cc_lst = [1, 3, 10, 30, 100, 300, 1000, 3000]

In [33]:
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [34]:
best_f1 = 0
best_idx = 999

for i in range(len(lr_lst)):
    print(f"Logistic regression classifier {i+1}, C={cc_lst[i]}")
    #print("Classification report on trainng data: \n", classification_report(y_train, rf_lst[i].predict(X_train)))
    cl_rep = classification_report(y_test, lr_lst[i].predict(X_test), digits=4, output_dict=True)
    f1_sc = cl_rep['macro avg']['f1-score']
    
    if f1_sc > best_f1:
        best_f1 = f1_sc
        best_idx = i+1
    print('Classification report on test data:')
    print(f"f1-score of macro average: {f1_sc:.5f}\n", )
    print("----------------------------------------------------------------------------------")
    
print(f"\nThe best Logistic regression classifier is classifier {best_idx} with Micro-Average F1_score {best_f1:.5f}")

Logistic regression classifier 1, C=1
Classification report on test data:
f1-score of macro average: 0.72916

----------------------------------------------------------------------------------
Logistic regression classifier 2, C=3
Classification report on test data:
f1-score of macro average: 0.72974

----------------------------------------------------------------------------------
Logistic regression classifier 3, C=10
Classification report on test data:
f1-score of macro average: 0.72999

----------------------------------------------------------------------------------
Logistic regression classifier 4, C=30
Classification report on test data:
f1-score of macro average: 0.72958

----------------------------------------------------------------------------------
Logistic regression classifier 5, C=100
Classification report on test data:
f1-score of macro average: 0.72958

----------------------------------------------------------------------------------
Logistic regression classifier 

In [35]:
import joblib
joblib.dump(clf_lr3, "best_Logistic regression_model_one_hot_C10_balanced_data.sav")
clf_load = joblib.load("best_Logistic regression_model_one_hot_C10_balanced_data.sav")

In [36]:
classification_report(y_test, clf_load.predict(X_test), output_dict=True)

{'0': {'precision': 0.9225436289744203,
  'recall': 0.9646294213223348,
  'f1-score': 0.9431172481212197,
  'support': 8001},
 '1': {'precision': 0.6376440460947503,
  'recall': 0.43455497382198954,
  'f1-score': 0.5168655941878567,
  'support': 1146},
 'accuracy': 0.8982179949710287,
 'macro avg': {'precision': 0.7800938375345854,
  'recall': 0.6995921975721622,
  'f1-score': 0.7299914211545382,
  'support': 9147},
 'weighted avg': {'precision': 0.8868494208209162,
  'recall': 0.8982179949710287,
  'f1-score': 0.8897134659623005,
  'support': 9147}}

In [37]:
best_auc = 0
best_idx = 999

for i in range(len(lr_lst)):
    print(f"Logistic regression classifier {i+1}, C={cc_lst[i]}")
    
    auc_sc = roc_auc_score(y_test, lr_lst[i].predict_proba(X_test)[:,1])
    
    if auc_sc > best_auc:
        best_auc = auc_sc
        best_idx = i+1
    
    print(f"roc_auc_score on test data: {auc_sc:.5f}\n")
    print("----------------------------------------------------------------------------------")
    
print(f"The best Logistic regression roc_auc_score {best_auc:.5f} is from classifier {best_idx} ")

Logistic regression classifier 1, C=1
roc_auc_score on test data: 0.92870

----------------------------------------------------------------------------------
Logistic regression classifier 2, C=3
roc_auc_score on test data: 0.92867

----------------------------------------------------------------------------------
Logistic regression classifier 3, C=10
roc_auc_score on test data: 0.92865

----------------------------------------------------------------------------------
Logistic regression classifier 4, C=30
roc_auc_score on test data: 0.92864

----------------------------------------------------------------------------------
Logistic regression classifier 5, C=100
roc_auc_score on test data: 0.92864

----------------------------------------------------------------------------------
Logistic regression classifier 6, C=300
roc_auc_score on test data: 0.92864

----------------------------------------------------------------------------------
Logistic regression classifier 7, C=1000
roc_a

#### All logistic regression models performed almost identcally. Model 4 are slightly better. Not as good as Random Forest models.

In [38]:
from sklearn.neural_network import MLPClassifier

In [39]:
clf_nn1 = make_pipeline(one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,4), alpha=0.001, max_iter=600)).fit(X_train, y_train)

In [40]:
clf_nn2 =make_pipeline( one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,4), alpha=0.003, max_iter=600)).fit(X_train, y_train)

In [41]:
clf_nn3 =make_pipeline( one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,4), alpha=0.01, max_iter=600)).fit(X_train, y_train)

In [42]:
clf_nn4 =make_pipeline( one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,4), alpha=0.03, max_iter=600)).fit(X_train, y_train)

In [43]:
clf_nn5 =make_pipeline( one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,4), alpha=0.1, max_iter=600)).fit(X_train, y_train)

In [44]:
clf_nn6 =make_pipeline( one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,4), alpha=0.3, max_iter=600)).fit(X_train, y_train)

In [45]:
clf_nn7 =make_pipeline( one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,4), alpha=1, max_iter=600)).fit(X_train, y_train)

In [46]:
clf_nn8 =make_pipeline( one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,4), alpha=3, max_iter=600)).fit(X_train, y_train)

In [47]:
clf_nn9 =make_pipeline( one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,8, 4), alpha=0.001, max_iter=600)).fit(X_train, y_train)

In [48]:
clf_nn10 =make_pipeline( one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,8,4), alpha=0.003, max_iter=600)).fit(X_train, y_train)

In [49]:
clf_nn11 =make_pipeline( one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,8,4), alpha=0.01, max_iter=600)).fit(X_train, y_train)

In [50]:
clf_nn12 =make_pipeline( one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,8,4), alpha=0.03, max_iter=600)).fit(X_train, y_train)

In [51]:
clf_nn13 =make_pipeline( one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,8,4), alpha=0.1, max_iter=600)).fit(X_train, y_train)

In [52]:
clf_nn14 =make_pipeline( one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,8,4), alpha=0.3, max_iter=600)).fit(X_train, y_train)

In [53]:
clf_nn15 =make_pipeline( one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,8,4), alpha=1.0, max_iter=600)).fit(X_train, y_train)

In [54]:
clf_nn16 =make_pipeline( one_hot_encoder,
                       MLPClassifier(hidden_layer_sizes=(20,8,4), alpha=3.0, max_iter=600)).fit(X_train, y_train)

In [55]:
nn_lst = [clf_nn1, clf_nn2, clf_nn3, clf_nn4, clf_nn5, clf_nn6, clf_nn7, clf_nn8, clf_nn9, clf_nn10, clf_nn11, 
          clf_nn12, clf_nn13, clf_nn14, clf_nn15, clf_nn16]

In [56]:
best_f1 = 0
best_idx = 999

for i in range(len(nn_lst)):
    print(f"Neural network classifier {i+1}")
    #print("Classification report on trainng data: \n", classification_report(y_train, rf_lst[i].predict(X_train)))
    cl_rep = classification_report(y_test, nn_lst[i].predict(X_test), digits=4, output_dict=True)
    f1_sc = cl_rep['macro avg']['f1-score']
    
    if f1_sc > best_f1:
        best_f1 = f1_sc
        best_idx = i+1
    print('Classification report on test data:')
    print(f"f1-score of macro average: {f1_sc:.5f}\n", )
    print("----------------------------------------------------------------------------------")
    
print(f"\nThe best Neural network classifier is classifier {best_idx} with Micro-Average F1_score {best_f1:.5f}")

Neural network classifier 1
Classification report on test data:
f1-score of macro average: 0.76833

----------------------------------------------------------------------------------
Neural network classifier 2
Classification report on test data:
f1-score of macro average: 0.75980

----------------------------------------------------------------------------------
Neural network classifier 3
Classification report on test data:
f1-score of macro average: 0.75564

----------------------------------------------------------------------------------
Neural network classifier 4
Classification report on test data:
f1-score of macro average: 0.77085

----------------------------------------------------------------------------------
Neural network classifier 5
Classification report on test data:
f1-score of macro average: 0.77819

----------------------------------------------------------------------------------
Neural network classifier 6
Classification report on test data:
f1-score of macro ave

In [57]:
clf_nn6

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f11ae4d1b20>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.3, hidden_layer_sizes=(20, 4),
                               max_iter=600))])

In [58]:
import joblib
joblib.dump(clf_nn6, "best_neural network_model_one_hot_balanced_data.sav")
clf_load = joblib.load("best_neural network_model_one_hot_balanced_data.sav")

In [59]:
classification_report(y_test, clf_load.predict(X_test), output_dict=True)

{'0': {'precision': 0.9358832904254015,
  'recall': 0.9541307336582927,
  'f1-score': 0.9449189256096051,
  'support': 8001},
 '1': {'precision': 0.6292929292929293,
  'recall': 0.543630017452007,
  'f1-score': 0.5833333333333334,
  'support': 1146},
 'accuracy': 0.9027003389089319,
 'macro avg': {'precision': 0.7825881098591654,
  'recall': 0.7488803755551499,
  'f1-score': 0.7641261294714692,
  'support': 9147},
 'weighted avg': {'precision': 0.8974715101851246,
  'recall': 0.9027003389089319,
  'f1-score': 0.8996169589813545,
  'support': 9147}}

### Neural Network Model 6 performed the best based on macro average f1-scores.

In [60]:
best_auc = 0
best_idx = 999

for i in range(len(nn_lst)):
    print(f"Neural network classifier {i+1}")
    
    auc_sc = roc_auc_score(y_test, nn_lst[i].predict_proba(X_test)[:,1])
    
    if auc_sc > best_auc:
        best_auc = auc_sc
        best_idx = i+1
    
    print(f"roc_auc_score on test data: {auc_sc:.5f}\n")
    print("----------------------------------------------------------------------------------")
    
print(f"The best Neural network roc_auc_score {best_auc:.5f} is from classifier {best_idx} ")

Neural network classifier 1
roc_auc_score on test data: 0.93167

----------------------------------------------------------------------------------
Neural network classifier 2
roc_auc_score on test data: 0.92982

----------------------------------------------------------------------------------
Neural network classifier 3
roc_auc_score on test data: 0.92772

----------------------------------------------------------------------------------
Neural network classifier 4
roc_auc_score on test data: 0.93221

----------------------------------------------------------------------------------
Neural network classifier 5
roc_auc_score on test data: 0.93586

----------------------------------------------------------------------------------
Neural network classifier 6
roc_auc_score on test data: 0.93740

----------------------------------------------------------------------------------
Neural network classifier 7
roc_auc_score on test data: 0.93497

-----------------------------------------------

### Neural network model 15 has the best roc_auc_score on the cross-validation data.

In [61]:
from sklearn.svm import SVC

In [62]:
clf_svc1 = make_pipeline(one_hot_encoder, SVC(C=1.0, probability=True)).fit(X_train, y_train)

In [63]:
clf_svc2 = make_pipeline(one_hot_encoder, SVC(C=3.0, probability=True)).fit(X_train, y_train)

In [64]:
clf_svc3 = make_pipeline(one_hot_encoder, SVC(C=10.0, probability=True)).fit(X_train, y_train)

In [65]:
clf_svc4 = make_pipeline(one_hot_encoder, SVC(C=30.0, probability=True)).fit(X_train, y_train)

In [66]:
clf_svc5 = make_pipeline(one_hot_encoder, SVC(C=100.0, probability=True)).fit(X_train, y_train)

In [67]:
clf_svc6 = make_pipeline(one_hot_encoder, SVC(C=300.0, probability=True)).fit(X_train, y_train)

In [68]:
clf_svc7 = make_pipeline(one_hot_encoder, SVC(C=1000.0, probability=True)).fit(X_train, y_train)

In [69]:
svc_lst = [clf_svc1, clf_svc2, clf_svc3, clf_svc4, clf_svc5, clf_svc6, clf_svc7]
svc_c = [1, 3, 10, 30, 100, 300, 1000]

In [70]:
best_f1 = 0
best_idx = 999

for i in range(len(svc_lst)):
    print(f"SVM classifier {i+1}, C={svc_c[i]}")
    #print("Classification report on trainng data: \n", classification_report(y_train, rf_lst[i].predict(X_train)))
    cl_rep = classification_report(y_test, svc_lst[i].predict(X_test), digits=4, output_dict=True)
    f1_sc = cl_rep['macro avg']['f1-score']
    
    if f1_sc > best_f1:
        best_f1 = f1_sc
        best_idx = i+1
    print('Classification report on test data:')
    print(f"f1-score of macro average: {f1_sc:.5f}\n", )
    print("----------------------------------------------------------------------------------")
    
print(f"\nThe best SVM classifier is classifier {best_idx} with Micro-Average F1_score {best_f1:.5f}")

SVM classifier 1, C=1
Classification report on test data:
f1-score of macro average: 0.73051

----------------------------------------------------------------------------------
SVM classifier 2, C=3
Classification report on test data:
f1-score of macro average: 0.74330

----------------------------------------------------------------------------------
SVM classifier 3, C=10
Classification report on test data:
f1-score of macro average: 0.73836

----------------------------------------------------------------------------------
SVM classifier 4, C=30
Classification report on test data:
f1-score of macro average: 0.72930

----------------------------------------------------------------------------------
SVM classifier 5, C=100
Classification report on test data:
f1-score of macro average: 0.72401

----------------------------------------------------------------------------------
SVM classifier 6, C=300
Classification report on test data:
f1-score of macro average: 0.72209

---------------

In [71]:
best_auc = 0
best_idx = 999

for i in range(len(svc_lst)):
    print(f"SVM classifier {i+1}, C={svc_c[i]}")    
    auc_sc = roc_auc_score(y_test, svc_lst[i].predict_proba(X_test)[:,1])    
    if auc_sc > best_auc:
        best_auc = auc_sc
        best_idx = i+1
    
    print(f"roc_auc_score on test data: {auc_sc:.5f}\n")
    print("----------------------------------------------------------------------------------")
    
print(f"The best SVM roc_auc_score {best_auc:.5f} is from classifier {best_idx} ")

SVM classifier 1, C=1
roc_auc_score on test data: 0.90488

----------------------------------------------------------------------------------
SVM classifier 2, C=3
roc_auc_score on test data: 0.91379

----------------------------------------------------------------------------------
SVM classifier 3, C=10
roc_auc_score on test data: 0.91276

----------------------------------------------------------------------------------
SVM classifier 4, C=30
roc_auc_score on test data: 0.90753

----------------------------------------------------------------------------------
SVM classifier 5, C=100
roc_auc_score on test data: 0.89756

----------------------------------------------------------------------------------
SVM classifier 6, C=300
roc_auc_score on test data: 0.89241

----------------------------------------------------------------------------------
SVM classifier 7, C=1000
roc_auc_score on test data: 0.88972

--------------------------------------------------------------------------------

In [72]:
joblib.dump(clf_svc2, "best_SVM_model_one_hot_C3_balanced_data.sav")
clf_load = joblib.load("best_SVM_model_one_hot_C3_balanced_data.sav")

### SVC model 2 (C=3) is the best among the SVC models. 

In [73]:
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y):
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.05, step=0.02),
        "num_leaves": trial.suggest_int("num_leaves", 30, 210, step=30),
        "max_depth": trial.suggest_int("max_depth", 3, 12, step=1),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 120, step=20),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=20),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=20),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 4.0, step=1.0),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.25, 0.95, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.25, 0.95, step=0.1
        ),
    }

#     cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1121218)

#     cv_scores = np.empty(5)
#     for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
#         X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#         y_train, y_test = y[train_idx], y[test_idx]

    model = lgbm.LGBMClassifier(objective="binary", **param_grid)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        eval_metric="binary_logloss",
        early_stopping_rounds=100,
        callbacks=[
            LightGBMPruningCallback(trial, "binary_logloss")
        ],  # Add a pruning callback
    )
    preds = model.predict_proba(X_test)
    cv_scores = log_loss(y_test, preds)
    mean_loss = np.mean(cv_scores)
    print(f"mean_loss {mean_loss:.5f}")
    return mean_loss

In [None]:
study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=2000)