In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_curve, roc_auc_score, auc
from sklearn.metrics import confusion_matrix, classification_report

# Importing the models

import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

# Importing sampling methods

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks

!pip install optuna
import optuna
from optuna.samplers import TPESampler
optuna.logging.set_verbosity(optuna.logging.WARNING)

TRIALS = 5
MODEL_NUM = 5
NUM_ITER = 1000
SEED = 2024
SEED_OBJ = SEED
SAMPLER = TPESampler(seed=SEED)
max_metrics = []
max_val_auc_score = 0
avg_accuracy, avg_auc, avg_f1, avg_precision, avg_recall = 0, 0, 0, 0, 0

kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)

import time
import warnings
warnings.filterwarnings("ignore")



In [2]:
htru2_data = pd.read_csv('https://raw.githubusercontent.com/szbela87/ml_22_elteik/main/data/HTRU_2.csv', header=None)
htru2_data.columns = ['mean_ip', 'std_ip', 'excess_kurt_ip', 'skewness_ip', 'mean_DMSNR', 'std_DMSNR', 'excess_kurt_DMSNR', 'skewness_DMSNR', 'class']

In [3]:
htru2_data

Unnamed: 0,mean_ip,std_ip,excess_kurt_ip,skewness_ip,mean_DMSNR,std_DMSNR,excess_kurt_DMSNR,skewness_DMSNR,class
0,140.562500,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.882430,0.465318,-0.515088,1.677258,14.860146,10.576487,127.393580,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.750000,57.178449,-0.068415,-0.636238,3.642977,20.959280,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.178930,11.468720,14.269573,252.567306,0
...,...,...,...,...,...,...,...,...,...
17893,136.429688,59.847421,-0.187846,-0.738123,1.296823,12.166062,15.450260,285.931022,0
17894,122.554688,49.485605,0.127978,0.323061,16.409699,44.626893,2.945244,8.297092,0
17895,119.335938,59.935939,0.159363,-0.743025,21.430602,58.872000,2.499517,4.595173,0
17896,114.507812,53.902400,0.201161,-0.024789,1.946488,13.381731,10.007967,134.238910,0


In [4]:
htru2_data.describe()

Unnamed: 0,mean_ip,std_ip,excess_kurt_ip,skewness_ip,mean_DMSNR,std_DMSNR,excess_kurt_DMSNR,skewness_DMSNR,class
count,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0
mean,111.079968,46.549532,0.477857,1.770279,12.6144,26.326515,8.303556,104.857709,0.091574
std,25.652935,6.843189,1.06404,6.167913,29.472897,19.470572,4.506092,106.51454,0.288432
min,5.8125,24.772042,-1.876011,-1.791886,0.213211,7.370432,-3.13927,-1.976976,0.0
25%,100.929688,42.376018,0.027098,-0.188572,1.923077,14.437332,5.781506,34.960504,0.0
50%,115.078125,46.947479,0.22324,0.19871,2.801839,18.461316,8.433515,83.064556,0.0
75%,127.085938,51.023202,0.473325,0.927783,5.464256,28.428104,10.702959,139.30933,0.0
max,192.617188,98.778911,8.069522,68.101622,223.392141,110.642211,34.539844,1191.000837,1.0


In [5]:
htru2_data.shape

(17898, 9)

In [6]:
htru2_data["class"].value_counts()

class
0    16259
1     1639
Name: count, dtype: int64

In [7]:
# Splitting the data (80/10/10% - train/validation/test)

X = htru2_data.drop("class", axis=1)
y = htru2_data["class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=SEED)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((16108, 8), (16108,), (1790, 8), (1790,))

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1/0.9, random_state=SEED)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((14318, 8), (14318,), (1790, 8), (1790,))

In [9]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((14318, 8), (14318,), (1790, 8), (1790,), (1790, 8), (1790,))

In [10]:
y_train.value_counts(), y_val.value_counts(), y_test.value_counts()

(class
 0    13014
 1     1304
 Name: count, dtype: int64,
 class
 0    1627
 1     163
 Name: count, dtype: int64,
 class
 0    1618
 1     172
 Name: count, dtype: int64)

# Baseline LightGBM

In [11]:
lgb_clf = lgb.LGBMClassifier(random_state=SEED, verbosity=-1)
lgb_clf.fit(X_train, y_train)

y_pred = lgb_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall score: {recall:.4f}")
print(f"Confusion matrix:\n {conf_matrix}")

Accuracy: 0.9810
AUC: 0.9427
F1-score: 0.9006
Precision: 0.9059
Recall score: 0.8953
Confusion matrix:
 [[1602   16]
 [  18  154]]


# Baseline LightGBM fine tuning with Optuna

In [12]:
for i in range(TRIALS):

  t1 = time.perf_counter()
  SEED = SEED + 1

  def objective(trial):

    # LightGBM parameters

    params = {
      "num_leaves": trial.suggest_int("num_leaves", 20, 50),
      "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
      "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
      "min_split_gain": trial.suggest_uniform("min_split_gain", 0.0, 0.5),
      "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-3, 1.0),
      "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
      "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
      "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0)
    }


    lgb_clf = lgb.LGBMClassifier(random_state=SEED, verbosity=-1, **params)

    # Fitting the model and evaluating with AUC metric

    lgb_clf.fit(X_train, y_train)
    y_pred = lgb_clf.predict(X_val)

    auc_score = roc_auc_score(y_val, y_pred)

    return auc_score

  # Optuna sampler

  SAMPLER = TPESampler(seed=SEED)

  # Optuna study

  lgb_study = optuna.create_study(direction="maximize", sampler=SAMPLER)
  lgb_study.optimize(objective, n_trials=NUM_ITER)

  # Optuna best parameters

  best_params = lgb_study.best_trial.params

  # Fine tuned LightGBM

  lgb_clf_optuna = lgb.LGBMClassifier(random_state=SEED, verbosity=-1, **best_params)
  lgb_clf_optuna.fit(X_train, y_train)

  y_pred = lgb_clf_optuna.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  auc = roc_auc_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  conf_matrix = confusion_matrix(y_test, y_pred)

  # Max AUC validation scores

  if max_val_auc_score < lgb_study.best_trial.value:
    max_val_auc_score = lgb_study.best_trial.value
    max_metrics = [accuracy, auc, f1, precision, recall]

  # Average scores

  avg_accuracy += accuracy
  avg_auc += auc
  avg_f1 += f1
  avg_precision += precision
  avg_recall += recall

  print("\n" + 10*"/" + "*")
  print(f"Trial {i+1}" + 3*" " + "*")
  print(10*"/" + "*")

  print(f"\nAccuracy: {accuracy:.4f}")
  print(f"AUC: {auc:.4f}")
  print(f"F1-score: {f1:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall score: {recall:.4f}")
  print(f"Confusion matrix:\n {conf_matrix}")

  t2 = time.perf_counter()
  print("Runtime:", t2-t1)

# Best validation auc scores

print("\n" + 50*"-")
print("\n" + 42*"/" + "*")
print("Scores with the best validation AUC score *")
print(42*"/" + "*")

print(f"\nBest validation AUC: {(max_val_auc_score):.4f}")
print(f"Accuracy: {(max_metrics[0]):.4f}")
print(f"AUC: {(max_metrics[1]):.4f}")
print(f"F1-score: {(max_metrics[2]):.4f}")
print(f"Precision: {(max_metrics[3]):.4f}")
print(f"Recall score: {(max_metrics[4]):.4f}")

# Printing average scores

print("\n" + 50*"-")
print("\n" + 15*"/" + "*")
print("Average scores *")
print(15*"/" + "*")

print(f"\nAverage accuracy: {(avg_accuracy / TRIALS):.4f}")
print(f"Average AUC: {(avg_auc / TRIALS):.4f}")
print(f"Average f1-score: {(avg_f1 / TRIALS):.4f}")
print(f"Average precision: {(avg_precision / TRIALS):.4f}")
print(f"Average recall score: {(avg_recall / TRIALS):.4f}")

# Saving the model

pickle.dump(lgb_clf_optuna, open("baseline_lightgbm_without_smote.pkl","wb"))


//////////*
Trial 1   *
//////////*

Accuracy: 0.9821
AUC: 0.9408
F1-score: 0.9053
Precision: 0.9217
Recall score: 0.8895
Confusion matrix:
 [[1605   13]
 [  19  153]]
Runtime: 275.2178612211719

//////////*
Trial 2   *
//////////*

Accuracy: 0.9832
AUC: 0.9440
F1-score: 0.9112
Precision: 0.9277
Recall score: 0.8953
Confusion matrix:
 [[1606   12]
 [  18  154]]
Runtime: 366.54901589639485

//////////*
Trial 3   *
//////////*

Accuracy: 0.9799
AUC: 0.9395
F1-score: 0.8947
Precision: 0.9000
Recall score: 0.8895
Confusion matrix:
 [[1601   17]
 [  19  153]]
Runtime: 405.6334013706073

//////////*
Trial 4   *
//////////*

Accuracy: 0.9844
AUC: 0.9446
F1-score: 0.9167
Precision: 0.9390
Recall score: 0.8953
Confusion matrix:
 [[1608   10]
 [  18  154]]
Runtime: 337.4589594723657

//////////*
Trial 5   *
//////////*

Accuracy: 0.9832
AUC: 0.9440
F1-score: 0.9112
Precision: 0.9277
Recall score: 0.8953
Confusion matrix:
 [[1606   12]
 [  18  154]]
Runtime: 371.2524773394689

------------------

# Baseline LightGBM with SMOTE and fine tuning with Optuna

In [13]:
from re import L
max_metrics = []
max_val_auc_score = 0
avg_accuracy, avg_auc, avg_f1, avg_precision, avg_recall = 0, 0, 0, 0, 0

for i in range(TRIALS):

  t1 = time.perf_counter()
  SEED = SEED + 1

  def objective(trial):

    # LightGBM parameters

    params = {
      "num_leaves": trial.suggest_int("num_leaves", 20, 50),
      "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
      "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
      "min_split_gain": trial.suggest_uniform("min_split_gain", 0.0, 0.5),
      "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-3, 1.0),
      "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
      "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
      "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0)
    }

    lgb_clf = lgb.LGBMClassifier(random_state=SEED, verbosity=-1, **params)

    # SMOTE parameter

    smote_kn = trial.suggest_int("k_neighbors", 5, 50)

    smote_train = SMOTE(sampling_strategy="minority", k_neighbors=smote_kn, random_state=SEED)
    X_smote_train, y_smote_train = smote_train.fit_resample(X_train, y_train)

    # Fitting the model and evaluating with auc metric

    lgb_clf.fit(X_smote_train, y_smote_train)
    y_pred = lgb_clf.predict(X_val)

    auc_score = roc_auc_score(y_val, y_pred)

    return auc_score

  # Optuna sampler

  SAMPLER = TPESampler(seed=SEED)

  # Optuna study

  lgb_study = optuna.create_study(direction="maximize", sampler=SAMPLER)
  lgb_study.optimize(objective, n_trials=NUM_ITER)

  # Optuna best parameters

  best_params = lgb_study.best_trial.params

  # SMOTE best parameter

  smote_kn = best_params["k_neighbors"]

  smote_train = SMOTE(sampling_strategy="minority", k_neighbors=smote_kn, random_state=SEED)
  X_smote_train, y_smote_train = smote_train.fit_resample(X_train, y_train)

  del best_params["k_neighbors"]

  # Fine tuned LightGBM

  lgb_clf_optuna = lgb.LGBMClassifier(random_state=SEED, verbosity=-1, **best_params)
  lgb_clf_optuna.fit(X_smote_train, y_smote_train)

  y_pred = lgb_clf_optuna.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  auc = roc_auc_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  conf_matrix = confusion_matrix(y_test, y_pred)

  # Max AUC validation scores

  if max_val_auc_score < lgb_study.best_trial.value:
    max_val_auc_score = lgb_study.best_trial.value
    max_metrics = [accuracy, auc, f1, precision, recall]

  # Average scores

  avg_accuracy += accuracy
  avg_auc += auc
  avg_f1 += f1
  avg_precision += precision
  avg_recall += recall

  print("\n" + 10*"/" + "*")
  print(f"Trial {i+1}" + 3*" " + "*")
  print(10*"/" + "*")

  print(f"\nAccuracy: {accuracy:.4f}")
  print(f"AUC: {auc:.4f}")
  print(f"F1-score: {f1:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall score: {recall:.4f}")
  print(f"Confusion matrix:\n {conf_matrix}")

  t2 = time.perf_counter()
  print("Runtime:", t2-t1)

# Best validation auc scores

print("\n" + 50*"-")
print("\n" + 42*"/" + "*")
print("Scores with the best validation AUC score *")
print(42*"/" + "*")

print(f"\nBest validation AUC: {(max_val_auc_score):.4f}")
print(f"Accuracy: {(max_metrics[0]):.4f}")
print(f"AUC: {(max_metrics[1]):.4f}")
print(f"F1-score: {(max_metrics[2]):.4f}")
print(f"Precision: {(max_metrics[3]):.4f}")
print(f"Recall score: {(max_metrics[4]):.4f}")

# Printing average scores

print("\n" + 50*"-")
print("\n" + 15*"/" + "*")
print("Average scores *")
print(15*"/" + "*")

print(f"\nAverage accuracy: {(avg_accuracy / TRIALS):.4f}")
print(f"Average AUC: {(avg_auc / TRIALS):.4f}")
print(f"Average f1-score: {(avg_f1 / TRIALS):.4f}")
print(f"Average precision: {(avg_precision / TRIALS):.4f}")
print(f"Average recall score: {(avg_recall / TRIALS):.4f}")

# Saving the model

pickle.dump(lgb_clf_optuna, open("baseline_lightgbm_with_smote.pkl","wb"))


//////////*
Trial 1   *
//////////*

Accuracy: 0.9670
AUC: 0.9428
F1-score: 0.8418
Precision: 0.7811
Recall score: 0.9128
Confusion matrix:
 [[1574   44]
 [  15  157]]
Runtime: 580.2759282924235

//////////*
Trial 2   *
//////////*

Accuracy: 0.9676
AUC: 0.9457
F1-score: 0.8449
Precision: 0.7822
Recall score: 0.9186
Confusion matrix:
 [[1574   44]
 [  14  158]]
Runtime: 758.0350985508412

//////////*
Trial 3   *
//////////*

Accuracy: 0.9687
AUC: 0.9463
F1-score: 0.8495
Precision: 0.7900
Recall score: 0.9186
Confusion matrix:
 [[1576   42]
 [  14  158]]
Runtime: 1214.4653470572084

//////////*
Trial 4   *
//////////*

Accuracy: 0.9687
AUC: 0.9463
F1-score: 0.8495
Precision: 0.7900
Recall score: 0.9186
Confusion matrix:
 [[1576   42]
 [  14  158]]
Runtime: 921.561550824903

//////////*
Trial 5   *
//////////*

Accuracy: 0.9704
AUC: 0.9473
F1-score: 0.8564
Precision: 0.8020
Recall score: 0.9186
Confusion matrix:
 [[1579   39]
 [  14  158]]
Runtime: 1579.7546774055809

------------------

# Baseline LightGBM with SMOTE and fine tuning with Optuna

In [14]:
max_metrics = []
max_val_auc_score = 0
avg_accuracy, avg_auc, avg_f1, avg_precision, avg_recall = 0, 0, 0, 0, 0

for i in range(TRIALS):

  t1 = time.perf_counter()
  SEED = SEED + 1

  def objective(trial):

    # LightGBM parameters

    params = {
      "num_leaves": trial.suggest_int("num_leaves", 20, 50),
      "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
      "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
      "min_split_gain": trial.suggest_uniform("min_split_gain", 0.0, 0.5),
      "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-3, 1.0),
      "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
      "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
      "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0)
    }

    lgb_clf = lgb.LGBMClassifier(random_state=SEED, verbosity=-1, **params)

    # SMOTE parameter

    smote_kn = trial.suggest_int("k_neighbors", 5, 50)

    smote_train = SMOTE(sampling_strategy="minority", k_neighbors=smote_kn, random_state=SEED)
    X_smote_train, y_smote_train = smote_train.fit_resample(X_train, y_train)

    # Fitting the model and evaluating with AUC metric

    lgb_clf.fit(X_smote_train, y_smote_train)
    y_pred = lgb_clf.predict(X_val)

    auc_score = roc_auc_score(y_val, y_pred)

    return auc_score

  # Optuna sampler

  SAMPLER = TPESampler(seed=SEED)

  # Optuna study

  lgb_study = optuna.create_study(direction="maximize", sampler=SAMPLER)
  lgb_study.optimize(objective, n_trials=NUM_ITER)

  # Optuna best parameters

  best_params = lgb_study.best_trial.params

  # SMOTE best parameter

  smote_kn = best_params["k_neighbors"]

  smote_train = SMOTE(sampling_strategy="minority", k_neighbors=smote_kn, random_state=SEED)
  X_smote_train, y_smote_train = smote_train.fit_resample(X_train, y_train)

  del best_params["k_neighbors"]

  # Fine tuned LightGBM

  lgb_clf_optuna = lgb.LGBMClassifier(random_state=SEED, verbosity=-1, **best_params)
  lgb_clf_optuna.fit(X_smote_train, y_smote_train)

  y_pred = lgb_clf_optuna.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  auc = roc_auc_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  conf_matrix = confusion_matrix(y_test, y_pred)

  # Max AUC validation scores

  if max_val_auc_score < lgb_study.best_trial.value:
    max_val_auc_score = lgb_study.best_trial.value
    max_metrics = [accuracy, auc, f1, precision, recall]

  # Average scores

  avg_accuracy += accuracy
  avg_auc += auc
  avg_f1 += f1
  avg_precision += precision
  avg_recall += recall

  print("\n" + 10*"/" + "*")
  print(f"Trial {i+1}" + 3*" " + "*")
  print(10*"/" + "*")

  print(f"\nAccuracy: {accuracy:.4f}")
  print(f"AUC: {auc:.4f}")
  print(f"F1-score: {f1:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall score: {recall:.4f}")
  print(f"Confusion matrix:\n {conf_matrix}")

  t2 = time.perf_counter()
  print("Runtime:", t2-t1)

# Best validation AUC scores

print("\n" + 50*"-")
print("\n" + 42*"/" + "*")
print("Scores with the best validation AUC score *")
print(42*"/" + "*")

print(f"\nBest validation AUC: {(max_val_auc_score):.4f}")
print(f"Accuracy: {(max_metrics[0]):.4f}")
print(f"AUC: {(max_metrics[1]):.4f}")
print(f"F1-score: {(max_metrics[2]):.4f}")
print(f"Precision: {(max_metrics[3]):.4f}")
print(f"Recall score: {(max_metrics[4]):.4f}")

# Printing average scores

print("\n" + 50*"-")
print("\n" + 15*"/" + "*")
print("Average scores *")
print(15*"/" + "*")

print(f"\nAverage accuracy: {(avg_accuracy / TRIALS):.4f}")
print(f"Average AUC: {(avg_auc / TRIALS):.4f}")
print(f"Average f1-score: {(avg_f1 / TRIALS):.4f}")
print(f"Average precision: {(avg_precision / TRIALS):.4f}")
print(f"Average recall score: {(avg_recall / TRIALS):.4f}")

# Saving the model

pickle.dump(lgb_clf_optuna, open("baseline_lightgbm_with_smote.pkl","wb"))


//////////*
Trial 1   *
//////////*

Accuracy: 0.9715
AUC: 0.9479
F1-score: 0.8610
Precision: 0.8103
Recall score: 0.9186
Confusion matrix:
 [[1581   37]
 [  14  158]]
Runtime: 747.9793854225427

//////////*
Trial 2   *
//////////*

Accuracy: 0.9654
AUC: 0.9419
F1-score: 0.8351
Precision: 0.7696
Recall score: 0.9128
Confusion matrix:
 [[1571   47]
 [  15  157]]
Runtime: 1235.8564876830205

//////////*
Trial 3   *
//////////*

Accuracy: 0.9670
AUC: 0.9402
F1-score: 0.8410
Precision: 0.7839
Recall score: 0.9070
Confusion matrix:
 [[1575   43]
 [  16  156]]
Runtime: 974.1252677151933

//////////*
Trial 4   *
//////////*

Accuracy: 0.9665
AUC: 0.9399
F1-score: 0.8387
Precision: 0.7800
Recall score: 0.9070
Confusion matrix:
 [[1574   44]
 [  16  156]]
Runtime: 882.2660342641175

//////////*
Trial 5   *
//////////*

Accuracy: 0.9659
AUC: 0.9396
F1-score: 0.8365
Precision: 0.7761
Recall score: 0.9070
Confusion matrix:
 [[1573   45]
 [  16  156]]
Runtime: 902.254444221966

-------------------

# Baseline LightGBM with Oversampling (ADASYN) and fine tuning with Optuna

In [15]:
max_metrics = []
max_val_auc_score = 0
avg_accuracy, avg_auc, avg_f1, avg_precision, avg_recall = 0, 0, 0, 0, 0

for i in range(TRIALS):

  t1 = time.perf_counter()
  SEED = SEED + 1

  def objective(trial):

    # LightGBM parameters

    params = {
      "num_leaves": trial.suggest_int("num_leaves", 20, 50),
      "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
      "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
      "min_split_gain": trial.suggest_uniform("min_split_gain", 0.0, 0.5),
      "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-3, 1.0),
      "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
      "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
      "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0)
    }

    lgb_clf = lgb.LGBMClassifier(random_state=SEED, verbosity=-1, **params)

    # ADASYN parameter

    adasyn_nn = trial.suggest_int("n_neighbors", 5, 50)

    adasyn_train = ADASYN(sampling_strategy="minority", n_neighbors=adasyn_nn, random_state=SEED_OBJ)
    X_adasyn_train, y_adasyn_train = adasyn_train.fit_resample(X_train, y_train)

    # Fitting the model and evaluating with AUC metric

    lgb_clf.fit(X_adasyn_train, y_adasyn_train)
    y_pred = lgb_clf.predict(X_val)

    auc_score = roc_auc_score(y_val, y_pred)

    return auc_score

  # Optuna sampler

  SAMPLER = TPESampler(seed=SEED)

  # Optuna study

  lgb_study = optuna.create_study(direction="maximize", sampler=SAMPLER)
  lgb_study.optimize(objective, n_trials=NUM_ITER)

  # Optuna best parameters

  best_params = lgb_study.best_trial.params

  # ADASYN best parameter

  adasyn_nn = best_params["n_neighbors"]

  adasyn_train = ADASYN(sampling_strategy="minority", n_neighbors=adasyn_nn, random_state=SEED)
  X_adasyn_train, y_adasyn_train = adasyn_train.fit_resample(X_train, y_train)

  del best_params["n_neighbors"]

  # Fine tuned LightGBM

  lgb_clf_optuna = lgb.LGBMClassifier(random_state=SEED, verbosity=-1, **best_params)
  lgb_clf_optuna.fit(X_adasyn_train, y_adasyn_train)

  y_pred = lgb_clf_optuna.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  auc = roc_auc_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  conf_matrix = confusion_matrix(y_test, y_pred)

  # Max AUC validation scores

  if max_val_auc_score < lgb_study.best_trial.value:
    max_val_auc_score = lgb_study.best_trial.value
    max_metrics = [accuracy, auc, f1, precision, recall]

  # Average scores

  avg_accuracy += accuracy
  avg_auc += auc
  avg_f1 += f1
  avg_precision += precision
  avg_recall += recall

  print("\n" + 10*"/" + "*")
  print(f"Trial {i+1}" + 3*" " + "*")
  print(10*"/" + "*")

  print(f"\nAccuracy: {accuracy:.4f}")
  print(f"AUC: {auc:.4f}")
  print(f"F1-score: {f1:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall score: {recall:.4f}")
  print(f"Confusion matrix:\n {conf_matrix}")

  t2 = time.perf_counter()
  print("Runtime:", t2-t1)

# Best validation AUC scores

print("\n" + 50*"-")
print("\n" + 42*"/" + "*")
print("Scores with the best validation AUC score *")
print(42*"/" + "*")

print(f"\nBest validation AUC: {(max_val_auc_score):.4f}")
print(f"Accuracy: {(max_metrics[0]):.4f}")
print(f"AUC: {(max_metrics[1]):.4f}")
print(f"F1-score: {(max_metrics[2]):.4f}")
print(f"Precision: {(max_metrics[3]):.4f}")
print(f"Recall score: {(max_metrics[4]):.4f}")

# Printing average scores

print("\n" + 50*"-")
print("\n" + 15*"/" + "*")
print("Average scores *")
print(15*"/" + "*")

print(f"\nAverage accuracy: {(avg_accuracy / TRIALS):.4f}")
print(f"Average AUC: {(avg_auc / TRIALS):.4f}")
print(f"Average f1-score: {(avg_f1 / TRIALS):.4f}")
print(f"Average precision: {(avg_precision / TRIALS):.4f}")
print(f"Average recall score: {(avg_recall / TRIALS):.4f}")

# Saving the model

pickle.dump(lgb_clf_optuna, open("baseline_lightgbm_with_adasyn.pkl","wb"))


//////////*
Trial 1   *
//////////*

Accuracy: 0.9475
AUC: 0.9320
F1-score: 0.7696
Precision: 0.6653
Recall score: 0.9128
Confusion matrix:
 [[1539   79]
 [  15  157]]
Runtime: 1393.1418150467798

//////////*
Trial 2   *
//////////*

Accuracy: 0.9302
AUC: 0.9328
F1-score: 0.7204
Precision: 0.5855
Recall score: 0.9360
Confusion matrix:
 [[1504  114]
 [  11  161]]
Runtime: 1183.5515688043088

//////////*
Trial 3   *
//////////*

Accuracy: 0.9536
AUC: 0.9380
F1-score: 0.7920
Precision: 0.6960
Recall score: 0.9186
Confusion matrix:
 [[1549   69]
 [  14  158]]
Runtime: 925.8327027745545

//////////*
Trial 4   *
//////////*

Accuracy: 0.9503
AUC: 0.9361
F1-score: 0.7802
Precision: 0.6781
Recall score: 0.9186
Confusion matrix:
 [[1543   75]
 [  14  158]]
Runtime: 950.4837110023946

//////////*
Trial 5   *
//////////*

Accuracy: 0.9453
AUC: 0.9333
F1-score: 0.7633
Precision: 0.6529
Recall score: 0.9186
Confusion matrix:
 [[1534   84]
 [  14  158]]
Runtime: 1147.727321634069

-----------------

# Baseline LightGBM with Undersampling (TomekLinks) and fine tuning with Optuna

In [16]:
max_metrics = []
max_val_auc_score = 0
avg_accuracy, avg_auc, avg_f1, avg_precision, avg_recall = 0, 0, 0, 0, 0

for i in range(TRIALS):

  t1 = time.perf_counter()
  SEED = SEED + 1

  def objective(trial):

    # LightGBM parameters

    params = {
      "num_leaves": trial.suggest_int("num_leaves", 20, 50),
      "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
      "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
      "min_split_gain": trial.suggest_uniform("min_split_gain", 0.0, 0.5),
      "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-3, 1.0),
      "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
      "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
      "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0)
    }

    lgb_clf = lgb.LGBMClassifier(random_state=SEED, verbosity=-1, **params)

    # TomekLinks

    tomek_train = TomekLinks(sampling_strategy="majority")
    X_tomek_train, y_tomek_train = tomek_train.fit_resample(X_train, y_train)

    # Fitting the model and evaluating with AUC metric

    lgb_clf.fit(X_tomek_train, y_tomek_train)
    y_pred = lgb_clf.predict(X_val)

    auc_score = roc_auc_score(y_val, y_pred)

    return auc_score

  # Optuna sampler

  SAMPLER = TPESampler(seed=SEED)

  # Optuna study

  lgb_study = optuna.create_study(direction="maximize", sampler=SAMPLER)
  lgb_study.optimize(objective, n_trials=NUM_ITER)

  # Optuna best parameters

  best_params = lgb_study.best_trial.params

  # TomekLinks

  tomek_train = TomekLinks(sampling_strategy="majority")
  X_tomek_train, y_tomek_train = tomek_train.fit_resample(X_train, y_train)

  # Fine tuned LightGBM

  lgb_clf_optuna = lgb.LGBMClassifier(random_state=SEED, verbosity=-1, **best_params)
  lgb_clf_optuna.fit(X_tomek_train, y_tomek_train)

  y_pred = lgb_clf_optuna.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  auc = roc_auc_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  conf_matrix = confusion_matrix(y_test, y_pred)

  # Max AUC validation scores

  if max_val_auc_score < lgb_study.best_trial.value:
    max_val_auc_score = lgb_study.best_trial.value
    max_metrics = [accuracy, auc, f1, precision, recall]

  # Average scores

  avg_accuracy += accuracy
  avg_auc += auc
  avg_f1 += f1
  avg_precision += precision
  avg_recall += recall

  print("\n" + 10*"/" + "*")
  print(f"Trial {i+1}" + 3*" " + "*")
  print(10*"/" + "*")

  print(f"\nAccuracy: {accuracy:.4f}")
  print(f"AUC: {auc:.4f}")
  print(f"F1-score: {f1:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall score: {recall:.4f}")
  print(f"Confusion matrix:\n {conf_matrix}")

  t2 = time.perf_counter()
  print("Runtime:", t2-t1)

# Best validation AUC scores

print("\n" + 50*"-")
print("\n" + 42*"/" + "*")
print("Scores with the best validation AUC score *")
print(42*"/" + "*")

print(f"\nBest validation AUC: {(max_val_auc_score):.4f}")
print(f"Accuracy: {(max_metrics[0]):.4f}")
print(f"AUC: {(max_metrics[1]):.4f}")
print(f"F1-score: {(max_metrics[2]):.4f}")
print(f"Precision: {(max_metrics[3]):.4f}")
print(f"Recall score: {(max_metrics[4]):.4f}")

# Printing average scores

print("\n" + 50*"-")
print("\n" + 15*"/" + "*")
print("Average scores *")
print(15*"/" + "*")

print(f"\nAverage accuracy: {(avg_accuracy / TRIALS):.4f}")
print(f"Average AUC: {(avg_auc / TRIALS):.4f}")
print(f"Average f1-score: {(avg_f1 / TRIALS):.4f}")
print(f"Average precision: {(avg_precision / TRIALS):.4f}")
print(f"Average recall score: {(avg_recall / TRIALS):.4f}")

# Saving the model

pickle.dump(lgb_clf_optuna, open("baseline_lightgbm_with_tomeklinks.pkl","wb"))


//////////*
Trial 1   *
//////////*

Accuracy: 0.9816
AUC: 0.9404
F1-score: 0.9027
Precision: 0.9162
Recall score: 0.8895
Confusion matrix:
 [[1604   14]
 [  19  153]]
Runtime: 573.2941641341895

//////////*
Trial 2   *
//////////*

Accuracy: 0.9799
AUC: 0.9343
F1-score: 0.8935
Precision: 0.9096
Recall score: 0.8779
Confusion matrix:
 [[1603   15]
 [  21  151]]
Runtime: 517.5136222532019

//////////*
Trial 3   *
//////////*

Accuracy: 0.9810
AUC: 0.9401
F1-score: 0.9000
Precision: 0.9107
Recall score: 0.8895
Confusion matrix:
 [[1603   15]
 [  19  153]]
Runtime: 845.2987708635628

//////////*
Trial 4   *
//////////*

Accuracy: 0.9804
AUC: 0.9398
F1-score: 0.8974
Precision: 0.9053
Recall score: 0.8895
Confusion matrix:
 [[1602   16]
 [  19  153]]
Runtime: 1023.7811617953703

//////////*
Trial 5   *
//////////*

Accuracy: 0.9804
AUC: 0.9424
F1-score: 0.8980
Precision: 0.9006
Recall score: 0.8953
Confusion matrix:
 [[1601   17]
 [  18  154]]
Runtime: 562.6753405267373

------------------