In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_curve, roc_auc_score, auc
from sklearn.metrics import confusion_matrix, classification_report

# Importing the models

import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

# Importing sampling methods

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks

!pip install optuna
import optuna
from optuna.samplers import TPESampler
optuna.logging.set_verbosity(optuna.logging.WARNING)

TRIALS = 5
MODEL_NUM = 5
NUM_ITER = 1000
SEED = 2024
SEED_OBJ = SEED
SAMPLER = TPESampler(seed=SEED)
max_metrics = []
max_val_auc_score = 0
avg_accuracy, avg_auc, avg_f1, avg_precision, avg_recall = 0, 0, 0, 0, 0

kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)

import time
import warnings
warnings.filterwarnings("ignore")



In [2]:
htru2_data = pd.read_csv('https://raw.githubusercontent.com/szbela87/ml_22_elteik/main/data/HTRU_2.csv', header=None)
htru2_data.columns = ['mean_ip', 'std_ip', 'excess_kurt_ip', 'skewness_ip', 'mean_DMSNR', 'std_DMSNR', 'excess_kurt_DMSNR', 'skewness_DMSNR', 'class']

In [3]:
htru2_data

Unnamed: 0,mean_ip,std_ip,excess_kurt_ip,skewness_ip,mean_DMSNR,std_DMSNR,excess_kurt_DMSNR,skewness_DMSNR,class
0,140.562500,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.882430,0.465318,-0.515088,1.677258,14.860146,10.576487,127.393580,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.750000,57.178449,-0.068415,-0.636238,3.642977,20.959280,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.178930,11.468720,14.269573,252.567306,0
...,...,...,...,...,...,...,...,...,...
17893,136.429688,59.847421,-0.187846,-0.738123,1.296823,12.166062,15.450260,285.931022,0
17894,122.554688,49.485605,0.127978,0.323061,16.409699,44.626893,2.945244,8.297092,0
17895,119.335938,59.935939,0.159363,-0.743025,21.430602,58.872000,2.499517,4.595173,0
17896,114.507812,53.902400,0.201161,-0.024789,1.946488,13.381731,10.007967,134.238910,0


In [4]:
htru2_data.describe()

Unnamed: 0,mean_ip,std_ip,excess_kurt_ip,skewness_ip,mean_DMSNR,std_DMSNR,excess_kurt_DMSNR,skewness_DMSNR,class
count,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0
mean,111.079968,46.549532,0.477857,1.770279,12.6144,26.326515,8.303556,104.857709,0.091574
std,25.652935,6.843189,1.06404,6.167913,29.472897,19.470572,4.506092,106.51454,0.288432
min,5.8125,24.772042,-1.876011,-1.791886,0.213211,7.370432,-3.13927,-1.976976,0.0
25%,100.929688,42.376018,0.027098,-0.188572,1.923077,14.437332,5.781506,34.960504,0.0
50%,115.078125,46.947479,0.22324,0.19871,2.801839,18.461316,8.433515,83.064556,0.0
75%,127.085938,51.023202,0.473325,0.927783,5.464256,28.428104,10.702959,139.30933,0.0
max,192.617188,98.778911,8.069522,68.101622,223.392141,110.642211,34.539844,1191.000837,1.0


In [5]:
htru2_data.shape

(17898, 9)

In [6]:
htru2_data["class"].value_counts()

class
0    16259
1     1639
Name: count, dtype: int64

In [7]:
# Splitting the data (80/10/10% - train/validation/test)

X = htru2_data.drop("class", axis=1)
y = htru2_data["class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=SEED)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((16108, 8), (16108,), (1790, 8), (1790,))

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1/0.9, random_state=SEED)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((14318, 8), (14318,), (1790, 8), (1790,))

In [9]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((14318, 8), (14318,), (1790, 8), (1790,), (1790, 8), (1790,))

In [10]:
y_train.value_counts(), y_val.value_counts(), y_test.value_counts()

(class
 0    13014
 1     1304
 Name: count, dtype: int64,
 class
 0    1627
 1     163
 Name: count, dtype: int64,
 class
 0    1618
 1     172
 Name: count, dtype: int64)

# LightGBM StackingClassifier fine tuning with Optuna

In [11]:
max_metrics = []
max_val_auc_score = 0
avg_accuracy, avg_auc, avg_f1, avg_precision, avg_recall = 0, 0, 0, 0, 0

for i in range(TRIALS):

  t1 = time.perf_counter()
  SEED = SEED + 1

  def objective(trial):

    # LightGBM parameters

    base_models = []
    for j in range(MODEL_NUM):

      SEED_OBJ + 1

      params = {
          f"num_leaves_{j}" : trial.suggest_int(f"num_leaves_{j}", 20, 50),
          f"learning_rate_{j}" : trial.suggest_loguniform(f"learning_rate_{j}", 0.01, 0.2),
          f"n_estimators_{j}" : trial.suggest_int(f"n_estimators_{j}", 100, 1000),
          f"min_split_gain_{j}" : trial.suggest_uniform(f"min_split_gain_{j}", 0.0, 0.5),
          f"min_child_weight_{j}" : trial.suggest_loguniform(f"min_child_weight_{j}", 1e-3, 1.0),
          f"subsample_{j}" : trial.suggest_uniform(f"subsample_{j}", 0.5, 1.0),
          f"colsample_bytree_{j}" : trial.suggest_uniform(f"colsample_bytree_{j}", 0.5, 1.0),
          f"lambda_{j}" : trial.suggest_loguniform(f"lambda_{j}", 1e-8, 10.0)
      }

      model_params = {
          "num_leaves" : params[f"num_leaves_{j}"],
          "learning_rate" : params[f"learning_rate_{j}"],
          "n_estimators" : params[f"n_estimators_{j}"],
          "min_split_gain" : params[f"min_split_gain_{j}"],
          "min_child_weight" : params[f"min_child_weight_{j}"],
          "subsample" : params[f"subsample_{j}"],
          "colsample_bytree" : params[f"colsample_bytree_{j}"],
          "lambda" : params[f"lambda_{j}"]
      }
      base_models.append((f"lgb{j}", lgb.LGBMClassifier(random_state=SEED_OBJ, verbosity=-1, **model_params)))

    # Parameters of LogisticRegression

    lr_C = trial.suggest_loguniform("C", 0.001, 10)
    lr_max_iter = trial.suggest_int("max_iter", 100, 1000)

    stack_clf = StackingClassifier(estimators=base_models,
                                  final_estimator=LogisticRegression(random_state=SEED_OBJ, C=lr_C, max_iter=lr_max_iter),
                                  cv=kfold,
                                  stack_method="predict_proba")


    # Fitting the model and evaluating with auc metric

    stack_clf.fit(X_train, y_train)
    y_pred = stack_clf.predict(X_val)

    auc_score = roc_auc_score(y_val, y_pred)

    return auc_score

  # Optuna sampler

  SAMPLER = TPESampler(seed=SEED)

  # Optuna study

  lgb_study = optuna.create_study(direction="maximize", sampler=SAMPLER)
  lgb_study.optimize(objective, n_trials=NUM_ITER)

  # Optuna best parameters

  best_params = lgb_study.best_trial.params

  # LightGBM StackingClassifier best parameters

  base_models = []
  for k in range(MODEL_NUM):

    model_params = {
        "num_leaves" : best_params[f"num_leaves_{k}"],
        "learning_rate" : best_params[f"learning_rate_{k}"],
        "n_estimators" : best_params[f"n_estimators_{k}"],
        "min_split_gain" : best_params[f"min_split_gain_{k}"],
        "min_child_weight" : best_params[f"min_child_weight_{k}"],
        "subsample" : best_params[f"subsample_{k}"],
        "colsample_bytree" : best_params[f"colsample_bytree_{k}"],
        "lambda" : best_params[f"lambda_{k}"]
    }
    base_models.append((f"lgb{k}", lgb.LGBMClassifier(random_state=SEED, verbosity=-1, **model_params)))

  # LogisticRegression best parameters

  lr_C = best_params["C"]
  lr_max_iter = best_params["max_iter"]

  # Fine tuned LightGBM StackingClassifier

  stack_clf = StackingClassifier(estimators=base_models,
                                final_estimator=LogisticRegression(random_state=SEED, C=lr_C, max_iter=lr_max_iter),
                                cv=kfold,
                                stack_method="predict_proba")

  stack_clf.fit(X_train, y_train)

  y_pred = stack_clf.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  auc = roc_auc_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  conf_matrix = confusion_matrix(y_test, y_pred)

  # Max AUC validation scores

  if max_val_auc_score < lgb_study.best_trial.value:
    max_val_auc_score = lgb_study.best_trial.value
    max_metrics = [accuracy, auc, f1, precision, recall]

  # Average scores

  avg_accuracy += accuracy
  avg_auc += auc
  avg_f1 += f1
  avg_precision += precision
  avg_recall += recall

  print("\n" + 10*"/" + "*")
  print(f"Trial {i+1}" + 3*" " + "*")
  print(10*"/" + "*")

  print(f"\nAccuracy: {accuracy:.4f}")
  print(f"AUC: {auc:.4f}")
  print(f"F1-score: {f1:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall score: {recall:.4f}")
  print(f"Confusion matrix:\n {conf_matrix}")

  t2 = time.perf_counter()
  print("Runtime:", t2-t1)

# Best validation auc scores

print("\n" + 50*"-")
print("\n" + 42*"/" + "*")
print("Scores with the best validation AUC score *")
print(42*"/" + "*")

print(f"\nBest validation AUC: {(max_val_auc_score):.4f}")
print(f"Accuracy: {(max_metrics[0]):.4f}")
print(f"AUC: {(max_metrics[1]):.4f}")
print(f"F1-score: {(max_metrics[2]):.4f}")
print(f"Precision: {(max_metrics[3]):.4f}")
print(f"Recall score: {(max_metrics[4]):.4f}")

# Printing average scores

print("\n" + 50*"-")
print("\n" + 15*"/" + "*")
print("Average scores *")
print(15*"/" + "*")

print(f"\nAverage accuracy: {(avg_accuracy / TRIALS):.4f}")
print(f"Average AUC: {(avg_auc / TRIALS):.4f}")
print(f"Average f1-score: {(avg_f1 / TRIALS):.4f}")
print(f"Average precision: {(avg_precision / TRIALS):.4f}")
print(f"Average recall score: {(avg_recall / TRIALS):.4f}")

# Saving the model

pickle.dump(stack_clf, open("stacked_lightgbm_without_smote.pkl","wb"))


//////////*
Trial 1   *
//////////*

Accuracy: 0.9849
AUC: 0.9449
F1-score: 0.9194
Precision: 0.9448
Recall score: 0.8953
Confusion matrix:
 [[1609    9]
 [  18  154]]
Runtime: 13786.799222428817

//////////*
Trial 2   *
//////////*

Accuracy: 0.9832
AUC: 0.9440
F1-score: 0.9112
Precision: 0.9277
Recall score: 0.8953
Confusion matrix:
 [[1606   12]
 [  18  154]]
Runtime: 8649.777214934118

//////////*
Trial 3   *
//////////*

Accuracy: 0.9855
AUC: 0.9452
F1-score: 0.9222
Precision: 0.9506
Recall score: 0.8953
Confusion matrix:
 [[1610    8]
 [  18  154]]
Runtime: 12314.726405731868

//////////*
Trial 4   *
//////////*

Accuracy: 0.9844
AUC: 0.9420
F1-score: 0.9162
Precision: 0.9444
Recall score: 0.8895
Confusion matrix:
 [[1609    9]
 [  19  153]]
Runtime: 15022.434128265362

//////////*
Trial 5   *
//////////*

Accuracy: 0.9855
AUC: 0.9426
F1-score: 0.9217
Precision: 0.9563
Recall score: 0.8895
Confusion matrix:
 [[1611    7]
 [  19  153]]
Runtime: 13405.079136505723

---------------

# LightGBM StackingClassifier with SMOTE and fine tuning with Optuna

In [12]:
max_metrics = []
max_val_auc_score = 0
avg_accuracy, avg_auc, avg_f1, avg_precision, avg_recall = 0, 0, 0, 0, 0

for i in range(TRIALS):

  t1 = time.perf_counter()
  SEED = SEED + 1

  def objective(trial):

    # LightGBM parameters

    base_models = []
    for j in range(MODEL_NUM):

      SEED_OBJ + 1

      params = {
          f"num_leaves_{j}" : trial.suggest_int(f"num_leaves_{j}", 20, 50),
          f"learning_rate_{j}" : trial.suggest_loguniform(f"learning_rate_{j}", 0.01, 0.2),
          f"n_estimators_{j}" : trial.suggest_int(f"n_estimators_{j}", 100, 1000),
          f"min_split_gain_{j}" : trial.suggest_uniform(f"min_split_gain_{j}", 0.0, 0.5),
          f"min_child_weight_{j}" : trial.suggest_loguniform(f"min_child_weight_{j}", 1e-3, 1.0),
          f"subsample_{j}" : trial.suggest_uniform(f"subsample_{j}", 0.5, 1.0),
          f"colsample_bytree_{j}" : trial.suggest_uniform(f"colsample_bytree_{j}", 0.5, 1.0),
          f"lambda_{j}" : trial.suggest_loguniform(f"lambda_{j}", 1e-8, 10.0)
      }
      model_params = {
          "num_leaves" : params[f"num_leaves_{j}"],
          "learning_rate" : params[f"learning_rate_{j}"],
          "n_estimators" : params[f"n_estimators_{j}"],
          "min_split_gain" : params[f"min_split_gain_{j}"],
          "min_child_weight" : params[f"min_child_weight_{j}"],
          "subsample" : params[f"subsample_{j}"],
          "colsample_bytree" : params[f"colsample_bytree_{j}"],
          "lambda" : params[f"lambda_{j}"]
      }
      base_models.append((f"lgb{j}", lgb.LGBMClassifier(random_state=SEED_OBJ, verbosity=-1, **model_params)))

    # SMOTE parameter

    smote_kn = trial.suggest_int("k_neighbors", 10, 230)

    smote_train = SMOTE(sampling_strategy="minority", k_neighbors=smote_kn, random_state=SEED_OBJ)
    X_smote_train, y_smote_train = smote_train.fit_resample(X_train, y_train)

    # LogisticRegression parameters

    lr_C = trial.suggest_loguniform("C", 0.001, 10)
    lr_max_iter = trial.suggest_int("max_iter", 100, 1000)

    stack_clf = StackingClassifier(estimators=base_models,
                                  final_estimator=LogisticRegression(random_state=SEED_OBJ, C=lr_C, max_iter=lr_max_iter),
                                  cv=kfold,
                                  stack_method="predict_proba")


    # Fitting the model and evaluating with auc metric

    stack_clf.fit(X_smote_train, y_smote_train)
    y_pred = stack_clf.predict(X_val)

    auc_score = roc_auc_score(y_val, y_pred)

    return auc_score

  # Optuna sampler

  SAMPLER = TPESampler(seed=SEED)

  # Optuna study

  lgb_study = optuna.create_study(direction="maximize", sampler=SAMPLER)
  lgb_study.optimize(objective, n_trials=NUM_ITER)

  # Optuna best parameters

  best_params = lgb_study.best_trial.params

  # LightGBM StackingClassifier best parameters

  base_models = []
  for k in range(MODEL_NUM):

    model_params = {
        "num_leaves" : best_params[f"num_leaves_{k}"],
        "learning_rate" : best_params[f"learning_rate_{k}"],
        "n_estimators" : best_params[f"n_estimators_{k}"],
        "min_split_gain" : best_params[f"min_split_gain_{k}"],
        "min_child_weight" : best_params[f"min_child_weight_{k}"],
        "subsample" : best_params[f"subsample_{k}"],
        "colsample_bytree" : best_params[f"colsample_bytree_{k}"],
        "lambda" : best_params[f"lambda_{k}"]
    }
    base_models.append((f"lgb{k}", lgb.LGBMClassifier(random_state=SEED, verbosity=-1, **model_params)))

  # SMOTE best parameter

  smote_kn = best_params["k_neighbors"]

  smote_train = SMOTE(sampling_strategy="minority", k_neighbors=smote_kn, random_state=SEED)
  X_smote_train, y_smote_train = smote_train.fit_resample(X_train, y_train)

  # LogisticRegression best parameters

  lr_C = best_params["C"]
  lr_max_iter = best_params["max_iter"]

  # Fine tuned LightGBM StackingClassifier

  stack_clf = StackingClassifier(estimators=base_models,
                                final_estimator=LogisticRegression(random_state=SEED, C=lr_C, max_iter=lr_max_iter),
                                cv=kfold,
                                stack_method="predict_proba")

  stack_clf.fit(X_smote_train, y_smote_train)

  y_pred = stack_clf.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  auc = roc_auc_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  conf_matrix = confusion_matrix(y_test, y_pred)

  # Max AUC validation scores

  if max_val_auc_score < lgb_study.best_trial.value:
    max_val_auc_score = lgb_study.best_trial.value
    max_metrics = [accuracy, auc, f1, precision, recall]

  # Average scores

  avg_accuracy += accuracy
  avg_auc += auc
  avg_f1 += f1
  avg_precision += precision
  avg_recall += recall

  print("\n" + 10*"/" + "*")
  print(f"Trial {i+1}" + 3*" " + "*")
  print(10*"/" + "*")

  print(f"\nAccuracy: {accuracy:.4f}")
  print(f"AUC: {auc:.4f}")
  print(f"F1-score: {f1:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall score: {recall:.4f}")
  print(f"Confusion matrix:\n {conf_matrix}")

  t2 = time.perf_counter()
  print("Runtime:", t2-t1)

# Best validation auc scores

print("\n" + 50*"-")
print("\n" + 42*"/" + "*")
print("Scores with the best validation AUC score *")
print(42*"/" + "*")

print(f"\nBest validation AUC: {(max_val_auc_score):.4f}")
print(f"Accuracy: {(max_metrics[0]):.4f}")
print(f"AUC: {(max_metrics[1]):.4f}")
print(f"F1-score: {(max_metrics[2]):.4f}")
print(f"Precision: {(max_metrics[3]):.4f}")
print(f"Recall score: {(max_metrics[4]):.4f}")

# Printing average scores

print("\n" + 50*"-")
print("\n" + 15*"/" + "*")
print("Average scores *")
print(15*"/" + "*")

print(f"\nAverage accuracy: {(avg_accuracy / TRIALS):.4f}")
print(f"Average AUC: {(avg_auc / TRIALS):.4f}")
print(f"Average f1-score: {(avg_f1 / TRIALS):.4f}")
print(f"Average precision: {(avg_precision / TRIALS):.4f}")
print(f"Average recall score: {(avg_recall / TRIALS):.4f}")

# Saving the model

pickle.dump(stack_clf, open("stacked_lightgbm_with_smote.pkl","wb"))


//////////*
Trial 1   *
//////////*

Accuracy: 0.9721
AUC: 0.9508
F1-score: 0.8641
Precision: 0.8112
Recall score: 0.9244
Confusion matrix:
 [[1581   37]
 [  13  159]]
Runtime: 24239.443389546126

//////////*
Trial 2   *
//////////*

Accuracy: 0.9704
AUC: 0.9498
F1-score: 0.8571
Precision: 0.7990
Recall score: 0.9244
Confusion matrix:
 [[1578   40]
 [  13  159]]
Runtime: 19142.18049670197

//////////*
Trial 3   *
//////////*

Accuracy: 0.9704
AUC: 0.9498
F1-score: 0.8571
Precision: 0.7990
Recall score: 0.9244
Confusion matrix:
 [[1578   40]
 [  13  159]]
Runtime: 23106.19133951422

//////////*
Trial 4   *
//////////*

Accuracy: 0.9732
AUC: 0.9436
F1-score: 0.8667
Precision: 0.8298
Recall score: 0.9070
Confusion matrix:
 [[1586   32]
 [  16  156]]
Runtime: 19219.90137112001

//////////*
Trial 5   *
//////////*

Accuracy: 0.9721
AUC: 0.9508
F1-score: 0.8641
Precision: 0.8112
Recall score: 0.9244
Confusion matrix:
 [[1581   37]
 [  13  159]]
Runtime: 22853.536697069183

-----------------

# LightGBM StackingClassifier with Oversampling (ADASYN) and fine tuning with Optuna

In [13]:
max_metrics = []
max_val_auc_score = 0
avg_accuracy, avg_auc, avg_f1, avg_precision, avg_recall = 0, 0, 0, 0, 0

for i in range(TRIALS):

  t1 = time.perf_counter()
  SEED = SEED + 1

  def objective(trial):

    # LightGBM parameters

    base_models = []
    for j in range(MODEL_NUM):

      SEED_OBJ + 1

      params = {
          f"num_leaves_{j}" : trial.suggest_int(f"num_leaves_{j}", 20, 50),
          f"learning_rate_{j}" : trial.suggest_loguniform(f"learning_rate_{j}", 0.01, 0.2),
          f"n_estimators_{j}" : trial.suggest_int(f"n_estimators_{j}", 100, 1000),
          f"min_split_gain_{j}" : trial.suggest_uniform(f"min_split_gain_{j}", 0.0, 0.5),
          f"min_child_weight_{j}" : trial.suggest_loguniform(f"min_child_weight_{j}", 1e-3, 1.0),
          f"subsample_{j}" : trial.suggest_uniform(f"subsample_{j}", 0.5, 1.0),
          f"colsample_bytree_{j}" : trial.suggest_uniform(f"colsample_bytree_{j}", 0.5, 1.0),
          f"lambda_{j}" : trial.suggest_loguniform(f"lambda_{j}", 1e-8, 10.0)
      }
      model_params = {
          "num_leaves" : params[f"num_leaves_{j}"],
          "learning_rate" : params[f"learning_rate_{j}"],
          "n_estimators" : params[f"n_estimators_{j}"],
          "min_split_gain" : params[f"min_split_gain_{j}"],
          "min_child_weight" : params[f"min_child_weight_{j}"],
          "subsample" : params[f"subsample_{j}"],
          "colsample_bytree" : params[f"colsample_bytree_{j}"],
          "lambda" : params[f"lambda_{j}"]
      }
      base_models.append((f"lgb{j}", lgb.LGBMClassifier(random_state=SEED_OBJ, verbosity=-1, **model_params)))

    # ADASYN parameter

    adasyn_nn = trial.suggest_int("n_neighbors", 10, 230)

    adasyn_train = ADASYN(sampling_strategy="minority", n_neighbors=adasyn_nn, random_state=SEED_OBJ)
    X_adasyn_train, y_adasyn_train = adasyn_train.fit_resample(X_train, y_train)

    # LogisticRegression parameters

    lr_C = trial.suggest_loguniform("C", 0.001, 10)
    lr_max_iter = trial.suggest_int("max_iter", 100, 1000)

    stack_clf = StackingClassifier(estimators=base_models,
                                  final_estimator=LogisticRegression(random_state=SEED_OBJ, C=lr_C, max_iter=lr_max_iter),
                                  cv=kfold,
                                  stack_method="predict_proba")


    # Fitting the model and evaluating with auc metric

    stack_clf.fit(X_adasyn_train, y_adasyn_train)
    y_pred = stack_clf.predict(X_val)

    auc_score = roc_auc_score(y_val, y_pred)

    return auc_score

  # Optuna sampler

  SAMPLER = TPESampler(seed=SEED)

  # Optuna study

  lgb_study = optuna.create_study(direction="maximize", sampler=SAMPLER)
  lgb_study.optimize(objective, n_trials=NUM_ITER)

  # Optuna best parameters

  best_params = lgb_study.best_trial.params

  # LightGBM StackingClassifier best parameters

  base_models = []
  for k in range(MODEL_NUM):

    model_params = {
        "num_leaves" : best_params[f"num_leaves_{k}"],
        "learning_rate" : best_params[f"learning_rate_{k}"],
        "n_estimators" : best_params[f"n_estimators_{k}"],
        "min_split_gain" : best_params[f"min_split_gain_{k}"],
        "min_child_weight" : best_params[f"min_child_weight_{k}"],
        "subsample" : best_params[f"subsample_{k}"],
        "colsample_bytree" : best_params[f"colsample_bytree_{k}"],
        "lambda" : best_params[f"lambda_{k}"]
    }
    base_models.append((f"lgb{k}", lgb.LGBMClassifier(random_state=SEED, verbosity=-1, **model_params)))

  # ADASYN best parameter

  adasyn_nn = best_params["n_neighbors"]

  adasyn_train = ADASYN(sampling_strategy="minority", n_neighbors=adasyn_nn, random_state=SEED)
  X_adasyn_train, y_adasyn_train = adasyn_train.fit_resample(X_train, y_train)

  # LogisticRegression best parameters

  lr_C = best_params["C"]
  lr_max_iter = best_params["max_iter"]

  # Fine tuned LightGBM StackingClassifier

  stack_clf = StackingClassifier(estimators=base_models,
                                final_estimator=LogisticRegression(random_state=SEED, C=lr_C, max_iter=lr_max_iter),
                                cv=kfold,
                                stack_method="predict_proba")

  stack_clf.fit(X_adasyn_train, y_adasyn_train)

  y_pred = stack_clf.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  auc = roc_auc_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  conf_matrix = confusion_matrix(y_test, y_pred)

  # Max AUC validation scores

  if max_val_auc_score < lgb_study.best_trial.value:
    max_val_auc_score = lgb_study.best_trial.value
    max_metrics = [accuracy, auc, f1, precision, recall]

  # Average scores

  avg_accuracy += accuracy
  avg_auc += auc
  avg_f1 += f1
  avg_precision += precision
  avg_recall += recall

  print("\n" + 10*"/" + "*")
  print(f"Trial {i+1}" + 3*" " + "*")
  print(10*"/" + "*")

  print(f"\nAccuracy: {accuracy:.4f}")
  print(f"AUC: {auc:.4f}")
  print(f"F1-score: {f1:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall score: {recall:.4f}")
  print(f"Confusion matrix:\n {conf_matrix}")

  t2 = time.perf_counter()
  print("Runtime:", t2-t1)

# Test scores with best validation AUC score

print("\n" + 50*"-")
print("\n" + 42*"/" + "*")
print("Scores with the best validation AUC score *")
print(42*"/" + "*")

print(f"\nBest validation AUC: {(max_val_auc_score):.4f}")
print(f"Accuracy: {(max_metrics[0]):.4f}")
print(f"AUC: {(max_metrics[1]):.4f}")
print(f"F1-score: {(max_metrics[2]):.4f}")
print(f"Precision: {(max_metrics[3]):.4f}")
print(f"Recall score: {(max_metrics[4]):.4f}")

# Printing average scores

print("\n" + 50*"-")
print("\n" + 15*"/" + "*")
print("Average scores *")
print(15*"/" + "*")

print(f"\nAverage accuracy: {(avg_accuracy / TRIALS):.4f}")
print(f"Average AUC: {(avg_auc / TRIALS):.4f}")
print(f"Average f1-score: {(avg_f1 / TRIALS):.4f}")
print(f"Average precision: {(avg_precision / TRIALS):.4f}")
print(f"Average recall score: {(avg_recall / TRIALS):.4f}")

# Saving the model

pickle.dump(stack_clf, open("stacked_lightgbm_with_adasyn.pkl","wb"))


//////////*
Trial 1   *
//////////*

Accuracy: 0.9547
AUC: 0.9464
F1-score: 0.7990
Precision: 0.6970
Recall score: 0.9360
Confusion matrix:
 [[1548   70]
 [  11  161]]
Runtime: 21878.201284334995

//////////*
Trial 2   *
//////////*

Accuracy: 0.9553
AUC: 0.9441
F1-score: 0.8000
Precision: 0.7018
Recall score: 0.9302
Confusion matrix:
 [[1550   68]
 [  12  160]]
Runtime: 30560.640570339747

//////////*
Trial 3   *
//////////*

Accuracy: 0.9570
AUC: 0.9398
F1-score: 0.8041
Precision: 0.7149
Recall score: 0.9186
Confusion matrix:
 [[1555   63]
 [  14  158]]
Runtime: 22543.926722519565

//////////*
Trial 4   *
//////////*

Accuracy: 0.9553
AUC: 0.9467
F1-score: 0.8010
Precision: 0.7000
Recall score: 0.9360
Confusion matrix:
 [[1549   69]
 [  11  161]]
Runtime: 23914.558225593995

//////////*
Trial 5   *
//////////*

Accuracy: 0.9570
AUC: 0.9528
F1-score: 0.8089
Precision: 0.7056
Recall score: 0.9477
Confusion matrix:
 [[1550   68]
 [   9  163]]
Runtime: 20959.33299438283

---------------

# LightGBM StackingClassifier with Undersampling (TomekLinks) and fine tuning with Optuna

In [14]:
max_metrics = []
max_val_auc_score = 0
avg_accuracy, avg_auc, avg_f1, avg_precision, avg_recall = 0, 0, 0, 0, 0

for i in range(TRIALS):

  t1 = time.perf_counter()
  SEED = SEED + 1

  def objective(trial):

    # LightGBM parameters

    base_models = []
    for j in range(MODEL_NUM):

      SEED_OBJ + 1

      params = {
          f"num_leaves_{j}" : trial.suggest_int(f"num_leaves_{j}", 20, 50),
          f"learning_rate_{j}" : trial.suggest_loguniform(f"learning_rate_{j}", 0.01, 0.2),
          f"n_estimators_{j}" : trial.suggest_int(f"n_estimators_{j}", 100, 1000),
          f"min_split_gain_{j}" : trial.suggest_uniform(f"min_split_gain_{j}", 0.0, 0.5),
          f"min_child_weight_{j}" : trial.suggest_loguniform(f"min_child_weight_{j}", 1e-3, 1.0),
          f"subsample_{j}" : trial.suggest_uniform(f"subsample_{j}", 0.5, 1.0),
          f"colsample_bytree_{j}" : trial.suggest_uniform(f"colsample_bytree_{j}", 0.5, 1.0),
          f"lambda_{j}" : trial.suggest_loguniform(f"lambda_{j}", 1e-8, 10.0)
      }
      model_params = {
          "num_leaves" : params[f"num_leaves_{j}"],
          "learning_rate" : params[f"learning_rate_{j}"],
          "n_estimators" : params[f"n_estimators_{j}"],
          "min_split_gain" : params[f"min_split_gain_{j}"],
          "min_child_weight" : params[f"min_child_weight_{j}"],
          "subsample" : params[f"subsample_{j}"],
          "colsample_bytree" : params[f"colsample_bytree_{j}"],
          "lambda" : params[f"lambda_{j}"]
      }
      base_models.append((f"lgb{j}", lgb.LGBMClassifier(random_state=SEED_OBJ, verbosity=-1, **model_params)))

    # TomekLinks

    tomek_train = TomekLinks(sampling_strategy="majority")
    X_tomek_train, y_tomek_train = tomek_train.fit_resample(X_train, y_train)

    # LogisticRegression parameters

    lr_C = trial.suggest_loguniform("C", 0.001, 10)
    lr_max_iter = trial.suggest_int("max_iter", 100, 1000)

    stack_clf = StackingClassifier(estimators=base_models,
                                  final_estimator=LogisticRegression(random_state=SEED_OBJ, C=lr_C, max_iter=lr_max_iter),
                                  cv=kfold,
                                  stack_method="predict_proba")


    # Fitting the model and evaluating with auc metric

    stack_clf.fit(X_tomek_train, y_tomek_train)
    y_pred = stack_clf.predict(X_val)

    auc_score = roc_auc_score(y_val, y_pred)

    return auc_score

  # Optuna sampler

  SAMPLER = TPESampler(seed=SEED)

  # Optuna study

  lgb_study = optuna.create_study(direction="maximize", sampler=SAMPLER)
  lgb_study.optimize(objective, n_trials=NUM_ITER)

  # Optuna best parameters

  best_params = lgb_study.best_trial.params

  # LightGBM StackingClassifier best parameters

  base_models = []
  for k in range(MODEL_NUM):

    model_params = {
        "num_leaves" : best_params[f"num_leaves_{k}"],
        "learning_rate" : best_params[f"learning_rate_{k}"],
        "n_estimators" : best_params[f"n_estimators_{k}"],
        "min_split_gain" : best_params[f"min_split_gain_{k}"],
        "min_child_weight" : best_params[f"min_child_weight_{k}"],
        "subsample" : best_params[f"subsample_{k}"],
        "colsample_bytree" : best_params[f"colsample_bytree_{k}"],
        "lambda" : best_params[f"lambda_{k}"]
    }
    base_models.append((f"lgb{k}", lgb.LGBMClassifier(random_state=SEED, verbosity=-1, **model_params)))

  # TomekLinks

  tomek_train = TomekLinks(sampling_strategy="majority")
  X_tomek_train, y_tomek_train = tomek_train.fit_resample(X_train, y_train)

  # LogisticRegression best parameters

  lr_C = best_params["C"]
  lr_max_iter = best_params["max_iter"]

  # Fine tuned LightGBM StackingClassifier

  stack_clf = StackingClassifier(estimators=base_models,
                                final_estimator=LogisticRegression(random_state=SEED, C=lr_C, max_iter=lr_max_iter),
                                cv=kfold,
                                stack_method="predict_proba")

  stack_clf.fit(X_tomek_train, y_tomek_train)

  y_pred = stack_clf.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  auc = roc_auc_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  conf_matrix = confusion_matrix(y_test, y_pred)

  # Max AUC validation scores

  if max_val_auc_score < lgb_study.best_trial.value:
    max_val_auc_score = lgb_study.best_trial.value
    max_metrics = [accuracy, auc, f1, precision, recall]

  # Average scores

  avg_accuracy += accuracy
  avg_auc += auc
  avg_f1 += f1
  avg_precision += precision
  avg_recall += recall

  print("\n" + 10*"/" + "*")
  print(f"Trial {i+1}" + 3*" " + "*")
  print(10*"/" + "*")

  print(f"\nAccuracy: {accuracy:.4f}")
  print(f"AUC: {auc:.4f}")
  print(f"F1-score: {f1:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall score: {recall:.4f}")
  print(f"Confusion matrix:\n {conf_matrix}")

  t2 = time.perf_counter()
  print("Runtime:", t2-t1)

# Test scores with best validation AUC score

print("\n" + 50*"-")
print("\n" + 42*"/" + "*")
print("Scores with the best validation AUC score *")
print(42*"/" + "*")

print(f"\nBest validation AUC: {(max_val_auc_score):.4f}")
print(f"Accuracy: {(max_metrics[0]):.4f}")
print(f"AUC: {(max_metrics[1]):.4f}")
print(f"F1-score: {(max_metrics[2]):.4f}")
print(f"Precision: {(max_metrics[3]):.4f}")
print(f"Recall score: {(max_metrics[4]):.4f}")

# Printing average scores

print("\n" + 50*"-")
print("\n" + 15*"/" + "*")
print("Average scores *")
print(15*"/" + "*")

print(f"\nAverage accuracy: {(avg_accuracy / TRIALS):.4f}")
print(f"Average AUC: {(avg_auc / TRIALS):.4f}")
print(f"Average f1-score: {(avg_f1 / TRIALS):.4f}")
print(f"Average precision: {(avg_precision / TRIALS):.4f}")
print(f"Average recall score: {(avg_recall / TRIALS):.4f}")

# Saving the model

pickle.dump(stack_clf, open("stacked_lightgbm_with_tomeklinks.pkl","wb"))


//////////*
Trial 1   *
//////////*

Accuracy: 0.9827
AUC: 0.9411
F1-score: 0.9080
Precision: 0.9273
Recall score: 0.8895
Confusion matrix:
 [[1606   12]
 [  19  153]]
Runtime: 10010.837662018836

//////////*
Trial 2   *
//////////*

Accuracy: 0.9827
AUC: 0.9411
F1-score: 0.9080
Precision: 0.9273
Recall score: 0.8895
Confusion matrix:
 [[1606   12]
 [  19  153]]
Runtime: 12514.837733316235

//////////*
Trial 3   *
//////////*

Accuracy: 0.9816
AUC: 0.9404
F1-score: 0.9027
Precision: 0.9162
Recall score: 0.8895
Confusion matrix:
 [[1604   14]
 [  19  153]]
Runtime: 10312.28130711522

//////////*
Trial 4   *
//////////*

Accuracy: 0.9827
AUC: 0.9385
F1-score: 0.9075
Precision: 0.9325
Recall score: 0.8837
Confusion matrix:
 [[1607   11]
 [  20  152]]
Runtime: 11510.53915929515

//////////*
Trial 5   *
//////////*

Accuracy: 0.9821
AUC: 0.9382
F1-score: 0.9048
Precision: 0.9268
Recall score: 0.8837
Confusion matrix:
 [[1606   12]
 [  20  152]]
Runtime: 9926.112030077726

-----------------