# Notebook to Implement Model Training - LGBM

---

### 1) Setup

In [34]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

from scipy import stats
from scipy.stats import wilcoxon

from sklearn.metrics import balanced_accuracy_score, make_scorer, confusion_matrix, classification_report
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

import optuna
import lightgbm as lgb

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer


from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

In [128]:
SEED = 42

In [129]:
TRAIN_CLINICAL_FILENAME = "train_set2.csv"
TEST_CLINICAL_FILENAME = "test_set2.csv"

---

### 2) Read and Preprocess Data

In [151]:
train = pd.read_csv(TRAIN_CLINICAL_FILENAME, sep=",")
train = train.iloc[: , 1:]

In [152]:
train.shape

(132, 547)

In [153]:
train.head()

Unnamed: 0,Freq.1198.26559877594,Freq.1204.07148226988,Freq.1211.13623593336,Freq.1217.73683596229,Freq.1223.16958659828,Freq.1234.45996741201,Freq.1239.41043347512,Freq.1244.8230732813,Freq.1254.64710066874,Freq.1261.3335177338,...,Freq.9059.04784969308,Freq.9098.3102509794,Freq.9436.045671808,Freq.9594.78353572215,Freq.9799.45047805923,Freq.10431.4344537929,Freq.11007.128608859,Freq.11035.3843513154,Freq.11160.836582373,Group
0,0.000251,0.000345,7.4e-05,3.541783e-05,7.9e-05,0.000245,6e-06,6.1e-05,0.0,5.2e-05,...,1.8e-05,7.8e-05,7e-05,0.000116,7.8e-05,6.1e-05,0.000102,2.1e-05,0.0001,MILD
1,0.0,0.0,1.5e-05,0.0008515219,0.000107,0.001334,0.005597,0.001388,0.0005144433,0.001511,...,3.2e-05,4e-06,1.2e-05,7e-06,0.000152,8e-06,0.000397,0.000112,3e-06,MILD
2,0.00021,6e-06,1.8e-05,8.649685e-09,8.4e-05,0.000487,0.000185,9.2e-05,0.0001294411,0.0,...,0.000119,5.2e-05,0.000205,0.00042,6e-05,9.6e-05,0.000243,3.6e-05,0.000253,MILD
3,0.000171,0.000233,7.6e-05,4.854921e-05,0.000122,0.000333,3.8e-05,7.5e-05,1.764229e-07,6.4e-05,...,0.000134,3.8e-05,0.000153,0.000124,2.6e-05,1.2e-05,0.000147,5.2e-05,8.6e-05,MILD
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.8e-05,7.4e-05,2.7e-05,7.9e-05,1.8e-05,4.4e-05,7.4e-05,4.2e-05,4e-05,MILD


In [154]:
##### Preprocessing

train_mod = train.copy()
# Imputation using mean method
#train_mod = train_mod.replace(0,np.nan)
#train_mod_imp = train_mod.transform(lambda x: x.fillna(x.mean()))

In [156]:
train_mod.shape

(132, 547)

In [157]:
train_mod.head()

Unnamed: 0,Freq.1198.26559877594,Freq.1204.07148226988,Freq.1211.13623593336,Freq.1217.73683596229,Freq.1223.16958659828,Freq.1234.45996741201,Freq.1239.41043347512,Freq.1244.8230732813,Freq.1254.64710066874,Freq.1261.3335177338,...,Freq.9059.04784969308,Freq.9098.3102509794,Freq.9436.045671808,Freq.9594.78353572215,Freq.9799.45047805923,Freq.10431.4344537929,Freq.11007.128608859,Freq.11035.3843513154,Freq.11160.836582373,Group
0,0.000251,0.000345,7.4e-05,3.541783e-05,7.9e-05,0.000245,6e-06,6.1e-05,0.0,5.2e-05,...,1.8e-05,7.8e-05,7e-05,0.000116,7.8e-05,6.1e-05,0.000102,2.1e-05,0.0001,MILD
1,0.0,0.0,1.5e-05,0.0008515219,0.000107,0.001334,0.005597,0.001388,0.0005144433,0.001511,...,3.2e-05,4e-06,1.2e-05,7e-06,0.000152,8e-06,0.000397,0.000112,3e-06,MILD
2,0.00021,6e-06,1.8e-05,8.649685e-09,8.4e-05,0.000487,0.000185,9.2e-05,0.0001294411,0.0,...,0.000119,5.2e-05,0.000205,0.00042,6e-05,9.6e-05,0.000243,3.6e-05,0.000253,MILD
3,0.000171,0.000233,7.6e-05,4.854921e-05,0.000122,0.000333,3.8e-05,7.5e-05,1.764229e-07,6.4e-05,...,0.000134,3.8e-05,0.000153,0.000124,2.6e-05,1.2e-05,0.000147,5.2e-05,8.6e-05,MILD
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.8e-05,7.4e-05,2.7e-05,7.9e-05,1.8e-05,4.4e-05,7.4e-05,4.2e-05,4e-05,MILD


In [158]:
test = pd.read_csv(TEST_CLINICAL_FILENAME, sep=",")
test = test.iloc[: , 1:]
test_mod = test.copy()
test_mod.head()

Unnamed: 0,Freq.1198.26559877594,Freq.1204.07148226988,Freq.1211.13623593336,Freq.1217.73683596229,Freq.1223.16958659828,Freq.1234.45996741201,Freq.1239.41043347512,Freq.1244.8230732813,Freq.1254.64710066874,Freq.1261.3335177338,...,Freq.9059.04784969308,Freq.9098.3102509794,Freq.9436.045671808,Freq.9594.78353572215,Freq.9799.45047805923,Freq.10431.4344537929,Freq.11007.128608859,Freq.11035.3843513154,Freq.11160.836582373,Group
0,0.0001363486,0.000205,0.000115,7.7e-05,0.000199,6.6e-05,2.4e-05,0.000406,7.2e-05,0.000154,...,5.5e-05,3.2e-05,2.8e-05,5.1e-05,2.5e-05,5.6e-05,8.3e-05,7.4e-05,3.5e-05,MILD
1,8.938546e-07,5.1e-05,1.7e-05,2.4e-05,1.7e-05,7.2e-05,6.2e-05,0.000202,0.000217,3.6e-05,...,2.6e-05,3.7e-05,5.4e-05,0.0005,0.000125,0.000128,4.9e-05,4e-06,0.000192,MILD
2,0.0002061797,0.000237,2e-06,0.000109,0.000136,2.9e-05,0.000629,0.00015,0.000105,0.000229,...,0.000135,7.9e-05,5.3e-05,0.000232,7.6e-05,0.000187,0.00012,3.1e-05,0.000156,MILD
3,0.002256595,0.001221,0.000174,0.000109,0.001292,0.000142,0.000579,0.001619,3e-06,0.002916,...,1.6e-05,1e-05,9e-06,2e-05,3e-05,6e-06,1.5e-05,1.8e-05,2.1e-05,MILD
4,1.023898e-05,0.000314,7.3e-05,2.3e-05,7.1e-05,0.0,0.00043,0.000378,9.6e-05,2.6e-05,...,7.2e-05,1.9e-05,7.2e-05,0.000166,6e-06,0.000204,7.9e-05,2.6e-05,9.4e-05,MILD


In [159]:
X_test, y_test = test_mod.drop("Group", axis=1), test_mod["Group"]

---

### 3) Baseline Model Training and CV

In [160]:
# Define Classifier (or pipeline)
clf = lgb.LGBMClassifier(random_state=SEED)

In [161]:
# Get Features and Target
X, y = train_mod.drop("Group", axis=1), train_mod["Group"]

In [162]:
# Defining RepeatedKFold Cross Validator
rkf = RepeatedKFold(n_splits=5, n_repeats=20, random_state=SEED)

In [163]:
# Define metric scorer
metric_scorer = make_scorer(balanced_accuracy_score)
metric_scorer

make_scorer(balanced_accuracy_score)

In [164]:
# Cross validate model
scores = cross_val_score(clf, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1)

In [165]:
# Score from each CV Iteration
scores

array([0.54605263, 0.94444444, 0.70676692, 0.61538462, 0.7797619 ,
       0.66666667, 0.875     , 0.67857143, 0.75625   , 0.83333333,
       0.64204545, 0.9       , 0.65413534, 0.81944444, 0.6875    ,
       0.80555556, 0.84117647, 0.71428571, 0.85947712, 0.71568627,
       0.725     , 0.63486842, 0.73030303, 0.66875   , 0.8875    ,
       0.76666667, 0.76923077, 0.65972222, 0.7745098 , 0.66013072,
       0.61111111, 0.73235294, 0.87593985, 0.71875   , 0.91176471,
       0.77272727, 0.66118421, 0.7202381 , 0.625     , 0.85714286,
       0.77777778, 0.875     , 0.67272727, 0.81818182, 0.69444444,
       0.89285714, 0.68181818, 0.74509804, 0.76875   , 0.89473684,
       0.84117647, 0.68333333, 0.8875    , 0.68954248, 0.65625   ,
       0.78693182, 0.76176471, 0.57638889, 0.79761905, 0.90625   ,
       0.81176471, 0.75      , 0.7202381 , 0.63194444, 0.72619048,
       0.725     , 0.8       , 0.90972222, 0.65972222, 0.73939394,
       0.84659091, 0.75      , 0.9       , 0.74183007, 0.75151

In [166]:
# Mean Metric Value
np.mean(scores)

0.7530414906515912

# Feature reduction

In [167]:
train_mod_stats = train_mod.drop('Group', axis = 1)


In [168]:
df = train_mod.copy()
peaks_list = []
pval_list = []
df2 = pd.melt(df, id_vars = ["Group"])
mild = df.where(df.Group == "MILD").dropna()
sev = df.where(df.Group == "SEVERE").dropna()
for i in range(1,(len(df.columns)-1)):
    s, p1 = stats.mannwhitneyu(mild.iloc[:,i], sev.iloc[:,i], alternative = 'two-sided')
    if p1 <= 0.05:
        pval_list.append(p1)
        peaks_list.append(df.columns[i])
peaks_list.append("Group")

In [169]:
df_filtered = df.drop(columns=[col for col in df if col not in peaks_list])
df_test_filtered = test.drop(columns=[col for col in test if col not in peaks_list])

In [170]:
df_test_filtered.shape

(64, 119)

---

In [171]:
# Get Features and Target
X, y = df_filtered.drop("Group", axis=1), df_filtered["Group"]
X_test, y_test = df_test_filtered.drop("Group", axis=1), df_test_filtered["Group"]
# Cross validate model
scores = cross_val_score(clf, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1)
# Score from each CV Iteration
scores

array([0.63486842, 0.97222222, 0.70676692, 0.73076923, 0.66666667,
       0.74166667, 1.        , 0.72619048, 0.80625   , 0.83333333,
       0.6875    , 1.        , 0.77819549, 0.76388889, 0.76875   ,
       0.94444444, 0.84117647, 0.78571429, 0.85947712, 0.74509804,
       0.88333333, 0.72368421, 0.77575758, 0.81875   , 0.8875    ,
       0.84166667, 0.92307692, 0.63194444, 0.77124183, 0.63398693,
       0.69444444, 0.75294118, 0.70676692, 0.6875    , 0.88235294,
       0.83238636, 0.69736842, 0.76190476, 0.75833333, 0.92857143,
       0.77777778, 0.875     , 0.69393939, 0.83030303, 0.78472222,
       0.85714286, 0.74147727, 0.74509804, 0.8375    , 0.89473684,
       0.78235294, 0.80833333, 0.8875    , 0.74509804, 0.65625   ,
       0.81534091, 0.76176471, 0.63888889, 0.76190476, 0.96875   ,
       0.9       , 0.77777778, 0.76190476, 0.78472222, 0.76785714,
       0.69166667, 0.8       , 0.9375    , 0.75694444, 0.87575758,
       0.87784091, 0.8       , 0.7875    , 0.74183007, 0.78484

In [172]:
# Mean Metric Value
np.mean(scores)

0.7912739915743012

### 4) Experiments

##### 4.1) Hyper Parameter Optimization with Optuna

In [173]:
# Define objectiva funtion to maximize metric
def objective(trial):
    
    # Hyper Parameter Grid
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "n_estimators": trial.suggest_int("max_depth", 100, 1000)
    }
    # Create model with param trial
    clf = lgb.LGBMClassifier(random_state=SEED, **param)
    # Get CV Metric we want to maximize
    balanced_accuracy = np.mean(cross_val_score(clf, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1))
    
    return balanced_accuracy

In [174]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-06-23 23:34:05,756][0m A new study created in memory with name: no-name-266a551f-14d7-4c9e-bae0-eb3eeb77289c[0m


In [175]:
study.optimize(objective, n_trials=500)

[32m[I 2022-06-23 23:34:06,946][0m Trial 0 finished with value: 0.5 and parameters: {'lambda_l1': 2.3595784575679856e-05, 'lambda_l2': 1.1012063752489056e-06, 'num_leaves': 184, 'feature_fraction': 0.6866384687759279, 'bagging_fraction': 0.6995098509208947, 'bagging_freq': 7, 'min_child_samples': 75, 'max_depth': 294}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-06-23 23:34:08,520][0m Trial 1 finished with value: 0.7823621184027532 and parameters: {'lambda_l1': 3.8693967443574304, 'lambda_l2': 9.689284707975446e-08, 'num_leaves': 180, 'feature_fraction': 0.8189516075958034, 'bagging_fraction': 0.9077794004028834, 'bagging_freq': 1, 'min_child_samples': 48, 'max_depth': 906}. Best is trial 1 with value: 0.7823621184027532.[0m
[32m[I 2022-06-23 23:34:09,175][0m Trial 2 finished with value: 0.5 and parameters: {'lambda_l1': 0.04583950873771014, 'lambda_l2': 0.030289944481967472, 'num_leaves': 158, 'feature_fraction': 0.4396819945927838, 'bagging_fraction': 0.6935136647082449, 

[32m[I 2022-06-23 23:34:32,041][0m Trial 22 finished with value: 0.8458864038653124 and parameters: {'lambda_l1': 0.0049848585689571435, 'lambda_l2': 0.4350407031813399, 'num_leaves': 190, 'feature_fraction': 0.40232254850151916, 'bagging_fraction': 0.7686444240928251, 'bagging_freq': 5, 'min_child_samples': 20, 'max_depth': 191}. Best is trial 20 with value: 0.8514517096680408.[0m
[32m[I 2022-06-23 23:34:32,944][0m Trial 23 finished with value: 0.8483214730960473 and parameters: {'lambda_l1': 0.00014041839015897656, 'lambda_l2': 0.024737071573333977, 'num_leaves': 141, 'feature_fraction': 0.5309675943408307, 'bagging_fraction': 0.6007980823669852, 'bagging_freq': 4, 'min_child_samples': 32, 'max_depth': 324}. Best is trial 20 with value: 0.8514517096680408.[0m
[32m[I 2022-06-23 23:34:34,448][0m Trial 24 finished with value: 0.8261507213546686 and parameters: {'lambda_l1': 0.3387155633074678, 'lambda_l2': 0.00013554409961060858, 'num_leaves': 103, 'feature_fraction': 0.51480443

[32m[I 2022-06-23 23:34:56,458][0m Trial 44 finished with value: 0.8309216784462528 and parameters: {'lambda_l1': 0.12805272165280263, 'lambda_l2': 1.1096030783653459e-08, 'num_leaves': 146, 'feature_fraction': 0.564379479523821, 'bagging_fraction': 0.4740626746323659, 'bagging_freq': 7, 'min_child_samples': 14, 'max_depth': 160}. Best is trial 34 with value: 0.8587185535482749.[0m
[32m[I 2022-06-23 23:34:58,260][0m Trial 45 finished with value: 0.847702398521671 and parameters: {'lambda_l1': 0.0042033404046229175, 'lambda_l2': 3.45172810622433e-08, 'num_leaves': 82, 'feature_fraction': 0.5986836272784102, 'bagging_fraction': 0.5004357529230422, 'bagging_freq': 3, 'min_child_samples': 7, 'max_depth': 295}. Best is trial 34 with value: 0.8587185535482749.[0m
[32m[I 2022-06-23 23:34:59,397][0m Trial 46 finished with value: 0.8450055008416075 and parameters: {'lambda_l1': 0.6769662413006188, 'lambda_l2': 1.2202155135645877e-07, 'num_leaves': 128, 'feature_fraction': 0.657740684977

[32m[I 2022-06-23 23:35:23,777][0m Trial 66 finished with value: 0.8624388564222155 and parameters: {'lambda_l1': 3.332194359867164e-08, 'lambda_l2': 2.8748091577579024e-07, 'num_leaves': 99, 'feature_fraction': 0.5202959488420069, 'bagging_fraction': 0.6412478137487818, 'bagging_freq': 4, 'min_child_samples': 31, 'max_depth': 890}. Best is trial 62 with value: 0.8627685659283957.[0m
[32m[I 2022-06-23 23:35:25,717][0m Trial 67 finished with value: 0.8590302445018003 and parameters: {'lambda_l1': 6.486994021115984e-08, 'lambda_l2': 3.3498734276185855e-07, 'num_leaves': 80, 'feature_fraction': 0.5212812170695712, 'bagging_fraction': 0.6395867937595023, 'bagging_freq': 4, 'min_child_samples': 18, 'max_depth': 875}. Best is trial 62 with value: 0.8627685659283957.[0m
[32m[I 2022-06-23 23:35:27,604][0m Trial 68 finished with value: 0.861898707029125 and parameters: {'lambda_l1': 4.5346043038858475e-08, 'lambda_l2': 2.3372156525239048e-07, 'num_leaves': 71, 'feature_fraction': 0.5274

[32m[I 2022-06-23 23:36:01,954][0m Trial 88 finished with value: 0.8526266152578073 and parameters: {'lambda_l1': 4.818120766385874e-08, 'lambda_l2': 1.2497593771243688e-05, 'num_leaves': 47, 'feature_fraction': 0.48639136083604334, 'bagging_fraction': 0.555751654860862, 'bagging_freq': 4, 'min_child_samples': 32, 'max_depth': 914}. Best is trial 62 with value: 0.8627685659283957.[0m
[32m[I 2022-06-23 23:36:02,743][0m Trial 89 finished with value: 0.5 and parameters: {'lambda_l1': 1.600771207270657e-08, 'lambda_l2': 2.466229149104665e-06, 'num_leaves': 102, 'feature_fraction': 0.5074234648852856, 'bagging_fraction': 0.7060544958260057, 'bagging_freq': 3, 'min_child_samples': 51, 'max_depth': 745}. Best is trial 62 with value: 0.8627685659283957.[0m
[32m[I 2022-06-23 23:36:03,568][0m Trial 90 finished with value: 0.652139272814389 and parameters: {'lambda_l1': 0.0007686671648022217, 'lambda_l2': 5.509232029833948e-06, 'num_leaves': 21, 'feature_fraction': 0.4410034475085249, 'ba

[32m[I 2022-06-23 23:36:35,669][0m Trial 110 finished with value: 0.8507242804495126 and parameters: {'lambda_l1': 1.0159467181529021e-05, 'lambda_l2': 6.213213697908753e-08, 'num_leaves': 119, 'feature_fraction': 0.5973770740931523, 'bagging_fraction': 0.5564618457768707, 'bagging_freq': 7, 'min_child_samples': 16, 'max_depth': 811}. Best is trial 93 with value: 0.8669831659783285.[0m
[32m[I 2022-06-23 23:36:37,271][0m Trial 111 finished with value: 0.8632371132049924 and parameters: {'lambda_l1': 4.95036111816296e-07, 'lambda_l2': 5.77697730476053e-07, 'num_leaves': 115, 'feature_fraction': 0.5596063700980043, 'bagging_fraction': 0.514225138782351, 'bagging_freq': 7, 'min_child_samples': 24, 'max_depth': 675}. Best is trial 93 with value: 0.8669831659783285.[0m
[32m[I 2022-06-23 23:36:39,441][0m Trial 112 finished with value: 0.8613373396684154 and parameters: {'lambda_l1': 1.0505812770458955e-06, 'lambda_l2': 5.433539435674912e-07, 'num_leaves': 109, 'feature_fraction': 0.55

[32m[I 2022-06-23 23:37:14,648][0m Trial 131 finished with value: 0.8612691074620252 and parameters: {'lambda_l1': 1.4447298520837669e-06, 'lambda_l2': 3.972040831436347e-07, 'num_leaves': 107, 'feature_fraction': 0.5090066986371035, 'bagging_fraction': 0.5617362531838136, 'bagging_freq': 7, 'min_child_samples': 26, 'max_depth': 879}. Best is trial 93 with value: 0.8669831659783285.[0m
[32m[I 2022-06-23 23:37:16,561][0m Trial 132 finished with value: 0.8642255075720634 and parameters: {'lambda_l1': 3.639981739520953e-05, 'lambda_l2': 1.6309334424409136e-07, 'num_leaves': 77, 'feature_fraction': 0.5447445262144984, 'bagging_fraction': 0.5257647253495119, 'bagging_freq': 7, 'min_child_samples': 23, 'max_depth': 912}. Best is trial 93 with value: 0.8669831659783285.[0m
[32m[I 2022-06-23 23:37:17,967][0m Trial 133 finished with value: 0.861188004934522 and parameters: {'lambda_l1': 3.0364323160031347e-05, 'lambda_l2': 1.646446088067987e-07, 'num_leaves': 45, 'feature_fraction': 0.4

[32m[I 2022-06-23 23:37:56,341][0m Trial 153 finished with value: 0.8527526709831276 and parameters: {'lambda_l1': 6.160326146954555e-05, 'lambda_l2': 9.683751181348257e-07, 'num_leaves': 52, 'feature_fraction': 0.4771417795844353, 'bagging_fraction': 0.5874330405425392, 'bagging_freq': 7, 'min_child_samples': 29, 'max_depth': 788}. Best is trial 139 with value: 0.867258153868835.[0m
[32m[I 2022-06-23 23:37:58,443][0m Trial 154 finished with value: 0.859382634865994 and parameters: {'lambda_l1': 3.280300841432966e-05, 'lambda_l2': 1.3801506762654663e-07, 'num_leaves': 62, 'feature_fraction': 0.46065415840021795, 'bagging_fraction': 0.4942941749492299, 'bagging_freq': 7, 'min_child_samples': 27, 'max_depth': 909}. Best is trial 139 with value: 0.867258153868835.[0m
[32m[I 2022-06-23 23:38:00,711][0m Trial 155 finished with value: 0.8602038302040236 and parameters: {'lambda_l1': 0.0001390538871014217, 'lambda_l2': 3.0694065297350307e-07, 'num_leaves': 74, 'feature_fraction': 0.79

[32m[I 2022-06-23 23:38:33,121][0m Trial 174 finished with value: 0.8644614868705426 and parameters: {'lambda_l1': 1.992813575788047e-05, 'lambda_l2': 2.9737669071130354e-08, 'num_leaves': 43, 'feature_fraction': 0.4181771462743229, 'bagging_fraction': 0.5632852406039218, 'bagging_freq': 7, 'min_child_samples': 25, 'max_depth': 849}. Best is trial 139 with value: 0.867258153868835.[0m
[32m[I 2022-06-23 23:38:35,083][0m Trial 175 finished with value: 0.8605554749043526 and parameters: {'lambda_l1': 5.313869915439392e-05, 'lambda_l2': 7.644804592458342e-08, 'num_leaves': 39, 'feature_fraction': 0.41610508633309434, 'bagging_fraction': 0.5673298232976314, 'bagging_freq': 7, 'min_child_samples': 25, 'max_depth': 847}. Best is trial 139 with value: 0.867258153868835.[0m
[32m[I 2022-06-23 23:38:37,219][0m Trial 176 finished with value: 0.8590831013456787 and parameters: {'lambda_l1': 3.1573112506855846e-05, 'lambda_l2': 3.491879012284155e-07, 'num_leaves': 45, 'feature_fraction': 0.5

[32m[I 2022-06-23 23:39:11,767][0m Trial 195 finished with value: 0.8598642040578958 and parameters: {'lambda_l1': 2.9885149481594142e-05, 'lambda_l2': 4.916188634469289e-07, 'num_leaves': 231, 'feature_fraction': 0.8987861013454304, 'bagging_fraction': 0.5746642052695575, 'bagging_freq': 7, 'min_child_samples': 27, 'max_depth': 766}. Best is trial 139 with value: 0.867258153868835.[0m
[32m[I 2022-06-23 23:39:13,581][0m Trial 196 finished with value: 0.8627065093059673 and parameters: {'lambda_l1': 1.1386823416540284e-06, 'lambda_l2': 2.2451851361478862e-07, 'num_leaves': 91, 'feature_fraction': 0.47656871975998566, 'bagging_fraction': 0.5123133660949802, 'bagging_freq': 7, 'min_child_samples': 24, 'max_depth': 820}. Best is trial 139 with value: 0.867258153868835.[0m
[32m[I 2022-06-23 23:39:14,822][0m Trial 197 finished with value: 0.8569075719172469 and parameters: {'lambda_l1': 4.693603665232704e-06, 'lambda_l2': 3.6213107143618475e-07, 'num_leaves': 47, 'feature_fraction': 

[32m[I 2022-06-23 23:39:53,252][0m Trial 217 finished with value: 0.8687246830182201 and parameters: {'lambda_l1': 0.0012186619598463103, 'lambda_l2': 0.010621422784060185, 'num_leaves': 91, 'feature_fraction': 0.41026908475219803, 'bagging_fraction': 0.538812846851669, 'bagging_freq': 7, 'min_child_samples': 21, 'max_depth': 846}. Best is trial 217 with value: 0.8687246830182201.[0m
[32m[I 2022-06-23 23:39:55,171][0m Trial 218 finished with value: 0.8604815054441601 and parameters: {'lambda_l1': 0.006115342657051539, 'lambda_l2': 0.032586685379544005, 'num_leaves': 93, 'feature_fraction': 0.4004934751756876, 'bagging_fraction': 0.5425119414960488, 'bagging_freq': 7, 'min_child_samples': 21, 'max_depth': 847}. Best is trial 217 with value: 0.8687246830182201.[0m
[32m[I 2022-06-23 23:39:57,393][0m Trial 219 finished with value: 0.8633436895277065 and parameters: {'lambda_l1': 0.002099628661688207, 'lambda_l2': 0.012878897352011542, 'num_leaves': 99, 'feature_fraction': 0.4212193

[32m[I 2022-06-23 23:40:40,244][0m Trial 239 finished with value: 0.8570742702215225 and parameters: {'lambda_l1': 0.003407753744998582, 'lambda_l2': 0.0004829150728734601, 'num_leaves': 74, 'feature_fraction': 0.41271730742862905, 'bagging_fraction': 0.5834663545190748, 'bagging_freq': 7, 'min_child_samples': 23, 'max_depth': 829}. Best is trial 217 with value: 0.8687246830182201.[0m
[32m[I 2022-06-23 23:40:42,509][0m Trial 240 finished with value: 0.8578470547340515 and parameters: {'lambda_l1': 0.0018567023036396508, 'lambda_l2': 0.006783336498717448, 'num_leaves': 72, 'feature_fraction': 0.4359513805905052, 'bagging_fraction': 0.5932072573261093, 'bagging_freq': 7, 'min_child_samples': 19, 'max_depth': 869}. Best is trial 217 with value: 0.8687246830182201.[0m
[32m[I 2022-06-23 23:40:44,549][0m Trial 241 finished with value: 0.8648182820361613 and parameters: {'lambda_l1': 0.0008996885138093709, 'lambda_l2': 0.00402858107500018, 'num_leaves': 82, 'feature_fraction': 0.40023

[32m[I 2022-06-23 23:41:22,642][0m Trial 261 finished with value: 0.8606191635377006 and parameters: {'lambda_l1': 0.00019073427580550075, 'lambda_l2': 0.056330374846064714, 'num_leaves': 85, 'feature_fraction': 0.41881802125967516, 'bagging_fraction': 0.5485985929754897, 'bagging_freq': 7, 'min_child_samples': 23, 'max_depth': 755}. Best is trial 217 with value: 0.8687246830182201.[0m
[32m[I 2022-06-23 23:41:24,768][0m Trial 262 finished with value: 0.8543616865375765 and parameters: {'lambda_l1': 0.0020098973369900163, 'lambda_l2': 0.00012110619127255987, 'num_leaves': 89, 'feature_fraction': 0.5543993402685228, 'bagging_fraction': 0.5722723820344939, 'bagging_freq': 7, 'min_child_samples': 17, 'max_depth': 840}. Best is trial 217 with value: 0.8687246830182201.[0m
[32m[I 2022-06-23 23:41:26,531][0m Trial 263 finished with value: 0.8612881434655815 and parameters: {'lambda_l1': 0.0006064282363193419, 'lambda_l2': 0.006422929091744252, 'num_leaves': 73, 'feature_fraction': 0.4

[32m[I 2022-06-23 23:42:01,687][0m Trial 282 finished with value: 0.862293431392309 and parameters: {'lambda_l1': 0.0003222299057201958, 'lambda_l2': 0.009968675665249609, 'num_leaves': 125, 'feature_fraction': 0.4525041252483267, 'bagging_fraction': 0.5303749787214656, 'bagging_freq': 7, 'min_child_samples': 26, 'max_depth': 733}. Best is trial 217 with value: 0.8687246830182201.[0m
[32m[I 2022-06-23 23:42:03,887][0m Trial 283 finished with value: 0.8535479988922325 and parameters: {'lambda_l1': 0.010161786385994208, 'lambda_l2': 0.023821569762222806, 'num_leaves': 78, 'feature_fraction': 0.6633804834656041, 'bagging_fraction': 0.5372874500773479, 'bagging_freq': 7, 'min_child_samples': 15, 'max_depth': 772}. Best is trial 217 with value: 0.8687246830182201.[0m
[32m[I 2022-06-23 23:42:05,484][0m Trial 284 finished with value: 0.8626389157368258 and parameters: {'lambda_l1': 0.002347697085999192, 'lambda_l2': 0.01020009205122552, 'num_leaves': 87, 'feature_fraction': 0.46222371

[32m[I 2022-06-23 23:42:38,175][0m Trial 304 finished with value: 0.8602551586914977 and parameters: {'lambda_l1': 0.0015497999131978209, 'lambda_l2': 0.005108269337080783, 'num_leaves': 73, 'feature_fraction': 0.4852500586431378, 'bagging_fraction': 0.49651833170337667, 'bagging_freq': 7, 'min_child_samples': 25, 'max_depth': 917}. Best is trial 217 with value: 0.8687246830182201.[0m
[32m[I 2022-06-23 23:42:40,307][0m Trial 305 finished with value: 0.8622846610518823 and parameters: {'lambda_l1': 0.0005859550425546332, 'lambda_l2': 0.012318197143969744, 'num_leaves': 101, 'feature_fraction': 0.5059871424473631, 'bagging_fraction': 0.56020503402158, 'bagging_freq': 7, 'min_child_samples': 21, 'max_depth': 974}. Best is trial 217 with value: 0.8687246830182201.[0m
[32m[I 2022-06-23 23:42:42,045][0m Trial 306 finished with value: 0.8633898118994867 and parameters: {'lambda_l1': 9.241020663959357e-08, 'lambda_l2': 0.07522733004311231, 'num_leaves': 154, 'feature_fraction': 0.53684

[32m[I 2022-06-23 23:43:23,234][0m Trial 325 finished with value: 0.8578223068211457 and parameters: {'lambda_l1': 0.00025096486808194596, 'lambda_l2': 0.001570835679880564, 'num_leaves': 85, 'feature_fraction': 0.7794277181682177, 'bagging_fraction': 0.5488731257982931, 'bagging_freq': 7, 'min_child_samples': 26, 'max_depth': 806}. Best is trial 314 with value: 0.8692806059609466.[0m
[32m[I 2022-06-23 23:43:25,317][0m Trial 326 finished with value: 0.8588471341180319 and parameters: {'lambda_l1': 0.0003103410687211104, 'lambda_l2': 7.505080496073651e-08, 'num_leaves': 136, 'feature_fraction': 0.4210518289704224, 'bagging_fraction': 0.5743215995516393, 'bagging_freq': 7, 'min_child_samples': 29, 'max_depth': 859}. Best is trial 314 with value: 0.8692806059609466.[0m
[32m[I 2022-06-23 23:43:27,605][0m Trial 327 finished with value: 0.8621488113924258 and parameters: {'lambda_l1': 0.0004551025855082303, 'lambda_l2': 0.003075894258533625, 'num_leaves': 110, 'feature_fraction': 0.9

[32m[I 2022-06-23 23:44:04,883][0m Trial 347 finished with value: 0.8481726023374629 and parameters: {'lambda_l1': 0.630887588496574, 'lambda_l2': 6.003688132312409e-08, 'num_leaves': 195, 'feature_fraction': 0.5353282323522818, 'bagging_fraction': 0.556684312534875, 'bagging_freq': 7, 'min_child_samples': 24, 'max_depth': 522}. Best is trial 314 with value: 0.8692806059609466.[0m
[32m[I 2022-06-23 23:44:06,769][0m Trial 348 finished with value: 0.8666485605253019 and parameters: {'lambda_l1': 5.218685746662984e-07, 'lambda_l2': 0.014915915120576504, 'num_leaves': 149, 'feature_fraction': 0.5761036132554095, 'bagging_fraction': 0.5220605779995002, 'bagging_freq': 7, 'min_child_samples': 28, 'max_depth': 838}. Best is trial 314 with value: 0.8692806059609466.[0m
[32m[I 2022-06-23 23:44:08,157][0m Trial 349 finished with value: 0.8496795661983357 and parameters: {'lambda_l1': 4.2357057267967383e-07, 'lambda_l2': 0.00017216070444441708, 'num_leaves': 190, 'feature_fraction': 0.577

[32m[I 2022-06-23 23:44:48,463][0m Trial 369 finished with value: 0.8638034359860985 and parameters: {'lambda_l1': 5.441755742344828e-07, 'lambda_l2': 0.1826744513897969, 'num_leaves': 209, 'feature_fraction': 0.42453658394820515, 'bagging_fraction': 0.5349531027865656, 'bagging_freq': 7, 'min_child_samples': 28, 'max_depth': 924}. Best is trial 314 with value: 0.8692806059609466.[0m
[32m[I 2022-06-23 23:44:49,812][0m Trial 370 finished with value: 0.8594715465562989 and parameters: {'lambda_l1': 0.013642825056322413, 'lambda_l2': 0.00623738419612872, 'num_leaves': 147, 'feature_fraction': 0.5114270979783081, 'bagging_fraction': 0.5451786034181727, 'bagging_freq': 7, 'min_child_samples': 25, 'max_depth': 549}. Best is trial 314 with value: 0.8692806059609466.[0m
[32m[I 2022-06-23 23:44:51,642][0m Trial 371 finished with value: 0.8580650440890377 and parameters: {'lambda_l1': 0.0009473148386408692, 'lambda_l2': 4.978059548390485e-08, 'num_leaves': 253, 'feature_fraction': 0.5444

[32m[I 2022-06-23 23:45:23,762][0m Trial 390 finished with value: 0.8326813908915303 and parameters: {'lambda_l1': 1.406645375598954e-06, 'lambda_l2': 3.3818791600288737e-07, 'num_leaves': 143, 'feature_fraction': 0.47387325973299066, 'bagging_fraction': 0.5115302451331699, 'bagging_freq': 7, 'min_child_samples': 32, 'max_depth': 993}. Best is trial 389 with value: 0.8707509203687609.[0m
[32m[I 2022-06-23 23:45:26,080][0m Trial 391 finished with value: 0.867966289451003 and parameters: {'lambda_l1': 3.6055919093473432e-06, 'lambda_l2': 3.234812940229759e-07, 'num_leaves': 148, 'feature_fraction': 0.4615413056785933, 'bagging_fraction': 0.5220472425680726, 'bagging_freq': 7, 'min_child_samples': 27, 'max_depth': 977}. Best is trial 389 with value: 0.8707509203687609.[0m
[32m[I 2022-06-23 23:45:28,311][0m Trial 392 finished with value: 0.8669040362587189 and parameters: {'lambda_l1': 8.88292186759587e-07, 'lambda_l2': 3.328163649830276e-07, 'num_leaves': 144, 'feature_fraction': 

[32m[I 2022-06-23 23:46:01,082][0m Trial 411 finished with value: 0.8621527785295199 and parameters: {'lambda_l1': 7.862166665867306e-07, 'lambda_l2': 9.375295112829582e-07, 'num_leaves': 149, 'feature_fraction': 0.5195885221499709, 'bagging_fraction': 0.5074669680301662, 'bagging_freq': 7, 'min_child_samples': 24, 'max_depth': 1000}. Best is trial 389 with value: 0.8707509203687609.[0m
[32m[I 2022-06-23 23:46:03,744][0m Trial 412 finished with value: 0.8485060990136453 and parameters: {'lambda_l1': 4.4308282282747516e-07, 'lambda_l2': 1.2717635615583436e-07, 'num_leaves': 185, 'feature_fraction': 0.5131652890535878, 'bagging_fraction': 0.8314598201254372, 'bagging_freq': 7, 'min_child_samples': 25, 'max_depth': 955}. Best is trial 389 with value: 0.8707509203687609.[0m
[32m[I 2022-06-23 23:46:06,098][0m Trial 413 finished with value: 0.8655736146378561 and parameters: {'lambda_l1': 1.2132822998094689e-06, 'lambda_l2': 1.972375816184608e-06, 'num_leaves': 152, 'feature_fraction

[32m[I 2022-06-23 23:46:42,814][0m Trial 432 finished with value: 0.8587951116006457 and parameters: {'lambda_l1': 1.4618734850486864e-06, 'lambda_l2': 2.785306771823844e-07, 'num_leaves': 203, 'feature_fraction': 0.4398351627047236, 'bagging_fraction': 0.5587211703943287, 'bagging_freq': 7, 'min_child_samples': 30, 'max_depth': 885}. Best is trial 389 with value: 0.8707509203687609.[0m
[32m[I 2022-06-23 23:46:45,113][0m Trial 433 finished with value: 0.85830191340737 and parameters: {'lambda_l1': 2.51236434437743e-07, 'lambda_l2': 1.8500154137947161e-07, 'num_leaves': 124, 'feature_fraction': 0.4258237180209296, 'bagging_fraction': 0.5485874266513936, 'bagging_freq': 3, 'min_child_samples': 27, 'max_depth': 999}. Best is trial 389 with value: 0.8707509203687609.[0m
[32m[I 2022-06-23 23:46:45,677][0m Trial 434 finished with value: 0.5 and parameters: {'lambda_l1': 1.0636700836231468e-06, 'lambda_l2': 4.381405214615813e-07, 'num_leaves': 17, 'feature_fraction': 0.734279495952620

[32m[I 2022-06-23 23:47:33,652][0m Trial 454 finished with value: 0.8635215630602627 and parameters: {'lambda_l1': 4.3002192248139256e-07, 'lambda_l2': 0.028597780748577294, 'num_leaves': 110, 'feature_fraction': 0.48550049305664245, 'bagging_fraction': 0.525828965467082, 'bagging_freq': 7, 'min_child_samples': 28, 'max_depth': 956}. Best is trial 389 with value: 0.8707509203687609.[0m
[32m[I 2022-06-23 23:47:36,845][0m Trial 455 finished with value: 0.8533881371251725 and parameters: {'lambda_l1': 6.370676102521838e-08, 'lambda_l2': 0.027683053584809363, 'num_leaves': 98, 'feature_fraction': 0.44052853721318935, 'bagging_fraction': 0.7429723414731342, 'bagging_freq': 7, 'min_child_samples': 23, 'max_depth': 977}. Best is trial 389 with value: 0.8707509203687609.[0m
[32m[I 2022-06-23 23:47:39,288][0m Trial 456 finished with value: 0.8604125516511353 and parameters: {'lambda_l1': 9.679893940325626e-07, 'lambda_l2': 0.0798298504459157, 'num_leaves': 128, 'feature_fraction': 0.462

[32m[I 2022-06-23 23:48:21,140][0m Trial 475 finished with value: 0.8584953768926291 and parameters: {'lambda_l1': 2.4852837200631695e-07, 'lambda_l2': 2.3422306931688404e-06, 'num_leaves': 90, 'feature_fraction': 0.41176162863497945, 'bagging_fraction': 0.5597154304705471, 'bagging_freq': 6, 'min_child_samples': 22, 'max_depth': 850}. Best is trial 389 with value: 0.8707509203687609.[0m
[32m[I 2022-06-23 23:48:23,256][0m Trial 476 finished with value: 0.8573552238310365 and parameters: {'lambda_l1': 6.555638964960354e-07, 'lambda_l2': 3.945930538524973e-06, 'num_leaves': 95, 'feature_fraction': 0.42314432709406696, 'bagging_fraction': 0.5729497265985805, 'bagging_freq': 7, 'min_child_samples': 18, 'max_depth': 801}. Best is trial 389 with value: 0.8707509203687609.[0m
[32m[I 2022-06-23 23:48:25,359][0m Trial 477 finished with value: 0.8652563879404049 and parameters: {'lambda_l1': 8.007413607892195e-07, 'lambda_l2': 8.305079310786386e-07, 'num_leaves': 106, 'feature_fraction':

[32m[I 2022-06-23 23:49:05,827][0m Trial 496 finished with value: 0.8651252549050535 and parameters: {'lambda_l1': 0.00021724973622386085, 'lambda_l2': 3.213181290608168e-07, 'num_leaves': 89, 'feature_fraction': 0.43212384421408406, 'bagging_fraction': 0.5437777658203073, 'bagging_freq': 7, 'min_child_samples': 29, 'max_depth': 766}. Best is trial 389 with value: 0.8707509203687609.[0m
[32m[I 2022-06-23 23:49:08,503][0m Trial 497 finished with value: 0.8631671972248597 and parameters: {'lambda_l1': 0.00936132840023022, 'lambda_l2': 1.9074403363786395e-07, 'num_leaves': 148, 'feature_fraction': 0.46194213370316994, 'bagging_fraction': 0.5589369795428327, 'bagging_freq': 7, 'min_child_samples': 26, 'max_depth': 848}. Best is trial 389 with value: 0.8707509203687609.[0m
[32m[I 2022-06-23 23:49:09,761][0m Trial 498 finished with value: 0.6010329444162649 and parameters: {'lambda_l1': 4.6622755268778554e-07, 'lambda_l2': 1.0941142690987427e-06, 'num_leaves': 105, 'feature_fraction'

In [176]:
# Get best trial based on metric score
trial = study.best_trial

In [177]:
# Print params for best trial
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

    lambda_l1: 1.2537265730755262e-06
    lambda_l2: 3.451074192111965e-07
    num_leaves: 141
    feature_fraction: 0.4703062413129908
    bagging_fraction: 0.5227902554474249
    bagging_freq: 7
    min_child_samples: 27
    max_depth: 996


In [178]:
# Best Score from HP Opt
trial.values[0]

0.8707509203687609

- Aumento de performance considerável em relação ao valor baseline para um LGBM

##### 4.2) PCA Dimension Reduction + Hyper Parameter Optimization with Optuna

In [181]:
# Define objectiva funtion to maximize metric
def objective(trial):
    
    # PCA Parameter Grid
    pca_param = {
        "n_components": trial.suggest_int("n_components", 5, 100)
    }
    # Hyper Parameter Grid
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "n_estimators": trial.suggest_int("max_depth", 100, 1000)
    }
    # Create model pipeline with param trial
    clf = Pipeline([("pca", PCA(**pca_param)),
                    ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])
    # Get CV Metric we want to maximize
    balanced_accuracy = np.mean(cross_val_score(clf, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1))
    
    return balanced_accuracy

In [182]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-06-17 15:30:06,080][0m A new study created in memory with name: no-name-adfd880b-812e-41f4-828a-f1b968db75c6[0m


In [None]:
study.optimize(objective, n_trials=500)

In [26]:
# Get best trial based on metric score
trial = study.best_trial

In [27]:
# Print params for best trial
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

    n_components: 98
    lambda_l1: 4.997984938308335e-06
    lambda_l2: 9.51579276153768e-06
    num_leaves: 44
    feature_fraction: 0.5421454819677247
    bagging_fraction: 0.48235662397332973
    bagging_freq: 4
    min_child_samples: 8
    max_depth: 563


In [28]:
# Best Score from HP Opt
trial.values[0]

0.7134464057614213

- Utilizar redução de dimensionalidade via PCA piorou bastante a performance do modelo

##### 4.3) Features Scaler + Hyper Parameter Optimization with Optuna

In [29]:
# Define objectiva funtion to maximize metric
def objective(trial):
    
    # List scalers to chose from
    scalers = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    # Define scalers
    if scalers == "minmax":
        scaler = MinMaxScaler()
    elif scalers == "standard":
        scaler = StandardScaler()
    else:
        scaler = RobustScaler()
    # LGBM Hyper Parameter Grid
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "n_estimators": trial.suggest_int("max_depth", 100, 1000)
    }
    # Create model pipeline with param trial
    clf = Pipeline([("scaler", scaler),
                    ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])
    # Get CV Metric we want to maximize
    balanced_accuracy = np.mean(cross_val_score(clf, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1))
    
    return balanced_accuracy

In [30]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-06-07 22:15:09,043][0m A new study created in memory with name: no-name-9d027bd4-11fc-4b2d-8846-da6f6ffb30f6[0m


In [None]:
study.optimize(objective, n_trials=500)

In [32]:
# Get best trial based on metric score
trial = study.best_trial

In [33]:
# Print params for best trial
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

    scalers: robust
    lambda_l1: 4.3223282482835764e-07
    lambda_l2: 3.3796186878776776e-05
    num_leaves: 188
    feature_fraction: 0.650978565866076
    bagging_fraction: 0.9212536490874126
    bagging_freq: 7
    min_child_samples: 36
    max_depth: 956


In [34]:
# Best Score from HP Opt
trial.values[0]

0.8521054248039541

- A adição de um estágio de feature scaling antes do treinamento parece ter ajudado o modelo

##### 4.4) Boruta Feature Selection + Features Scaler + Hyper Parameter Optimization with Optuna

In [217]:
# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=SEED, )

In [218]:
feat_selector.fit(np.array(X), y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	546
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	546
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	546
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	546
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	546
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	546
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	546
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	70
Rejected: 	476
Iteration: 	9 / 100
Confirmed: 	22
Tentative: 	48
Rejected: 	476
Iteration: 	10 / 100
Confirmed: 	22
Tentative: 	48
Rejected: 	476
Iteration: 	11 / 100
Confirmed: 	22
Tentative: 	48
Rejected: 	476
Iteration: 	12 / 100
Confirmed: 	25
Tentative: 	36
Rejected: 	485
Iteration: 	13 / 100
Confirmed: 	25
Tentative: 	36
Rejected: 	485
Iteration: 	14 / 100
Confirmed: 	25
Tentative: 	36
Rejected: 	485
Iteration: 	15 / 100
Confirmed: 	25
Tentative: 	36
Rejected: 	485
Iteration: 	16 / 100
Confirmed: 	2

In [219]:
print("Number of selected features: ", feat_selector.n_features_)

Number of selected features:  33


In [220]:
# Best Features (accord to Boruta)
X.columns[feat_selector.support_]

Index(['Freq.1399.46591504505', 'Freq.1522.99914751846',
       'Freq.1715.83254187774', 'Freq.1740.12061901788',
       'Freq.1794.21713030157', 'Freq.2032.98713905056',
       'Freq.2100.44990262345', 'Freq.2182.54012190969',
       'Freq.2187.26929655148', 'Freq.2241.01398322552',
       'Freq.2461.57721259156', 'Freq.2822.17822957638',
       'Freq.2981.05105455515', 'Freq.3083.87231952593',
       'Freq.3795.45160708473', 'Freq.3983.61553339652',
       'Freq.4266.97846896688', 'Freq.4283.95182164633',
       'Freq.4307.03317519015', 'Freq.4395.11277752994',
       'Freq.4495.09063766933', 'Freq.4659.55667096198',
       'Freq.4802.98802524845', 'Freq.4823.08140765752',
       'Freq.5084.14952111257', 'Freq.5224.84719303067',
       'Freq.5272.6322523475', 'Freq.5433.53206707083',
       'Freq.5485.69282171011', 'Freq.5720.79450801948',
       'Freq.7738.2889532685', 'Freq.8943.8000787644', 'Freq.9098.3102509794'],
      dtype='object')

In [221]:
# Filter most importante features
X_transform = feat_selector.transform(np.array(X))

In [222]:
# Define objectiva funtion to maximize metric
def objective(trial):
    
    # List scalers to chose from
    scalers = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    # Define scalers
    if scalers == "minmax":
        scaler = MinMaxScaler()
    elif scalers == "standard":
        scaler = StandardScaler()
    else:
        scaler = RobustScaler()
    # LGBM Hyper Parameter Grid
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "n_estimators": trial.suggest_int("max_depth", 100, 1000)
    }
    # Create model pipeline with param trial
    clf = Pipeline([("scaler", scaler),
                    ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])
    # Get CV Metric we want to maximize
    balanced_accuracy = np.mean(cross_val_score(clf, X_transform, y, scoring=metric_scorer, cv=rkf, n_jobs=-1))
    
    return balanced_accuracy

In [223]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-06-24 00:05:09,001][0m A new study created in memory with name: no-name-7e03f0db-a45d-43fa-aeb6-3eaeb5815ff6[0m


In [224]:
study.optimize(objective, n_trials=500)

[32m[I 2022-06-24 00:07:07,780][0m Trial 0 finished with value: 0.5 and parameters: {'scalers': 'standard', 'lambda_l1': 1.736164714022951e-05, 'lambda_l2': 0.0008744806520391302, 'num_leaves': 186, 'feature_fraction': 0.8593730467230957, 'bagging_fraction': 0.7039252573471362, 'bagging_freq': 5, 'min_child_samples': 67, 'max_depth': 291}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-06-24 00:07:08,583][0m Trial 1 finished with value: 0.8603665093506424 and parameters: {'scalers': 'standard', 'lambda_l1': 0.008622697977459544, 'lambda_l2': 6.208314622454566e-08, 'num_leaves': 184, 'feature_fraction': 0.701051773780067, 'bagging_fraction': 0.4008763757819356, 'bagging_freq': 1, 'min_child_samples': 20, 'max_depth': 476}. Best is trial 1 with value: 0.8603665093506424.[0m
[32m[I 2022-06-24 00:07:09,649][0m Trial 2 finished with value: 0.8380926025728658 and parameters: {'scalers': 'robust', 'lambda_l1': 0.17266781673396256, 'lambda_l2': 1.059988995841295e-07, 'num_leaves': 30,

[32m[I 2022-06-24 00:07:27,143][0m Trial 21 finished with value: 0.8489316654063171 and parameters: {'scalers': 'standard', 'lambda_l1': 1.402975481383046e-08, 'lambda_l2': 9.991701764028775e-05, 'num_leaves': 225, 'feature_fraction': 0.9238790545409372, 'bagging_fraction': 0.47278969400458437, 'bagging_freq': 3, 'min_child_samples': 18, 'max_depth': 562}. Best is trial 18 with value: 0.8606703089007656.[0m
[32m[I 2022-06-24 00:07:28,404][0m Trial 22 finished with value: 0.8557324019289964 and parameters: {'scalers': 'standard', 'lambda_l1': 1.9395373152078576e-07, 'lambda_l2': 2.458075416311321e-07, 'num_leaves': 227, 'feature_fraction': 0.9393163921111374, 'bagging_fraction': 0.45858041879466954, 'bagging_freq': 2, 'min_child_samples': 18, 'max_depth': 556}. Best is trial 18 with value: 0.8606703089007656.[0m
[32m[I 2022-06-24 00:07:29,803][0m Trial 23 finished with value: 0.8553611502231858 and parameters: {'scalers': 'standard', 'lambda_l1': 1.023051122687271e-07, 'lambda_l

[32m[I 2022-06-24 00:07:43,881][0m Trial 42 finished with value: 0.5 and parameters: {'scalers': 'minmax', 'lambda_l1': 4.016506692881616e-08, 'lambda_l2': 8.445820809561303e-06, 'num_leaves': 159, 'feature_fraction': 0.9554397718072419, 'bagging_fraction': 0.7313497446825487, 'bagging_freq': 1, 'min_child_samples': 46, 'max_depth': 401}. Best is trial 24 with value: 0.870197741929553.[0m
[32m[I 2022-06-24 00:07:44,462][0m Trial 43 finished with value: 0.8532307688656373 and parameters: {'scalers': 'robust', 'lambda_l1': 3.3006575930054615e-07, 'lambda_l2': 4.0587916862520554e-05, 'num_leaves': 144, 'feature_fraction': 0.8524526137899114, 'bagging_fraction': 0.6827111778722371, 'bagging_freq': 2, 'min_child_samples': 31, 'max_depth': 138}. Best is trial 24 with value: 0.870197741929553.[0m
[32m[I 2022-06-24 00:07:45,844][0m Trial 44 finished with value: 0.8565074724252356 and parameters: {'scalers': 'robust', 'lambda_l1': 1.718341135731164e-08, 'lambda_l2': 1.010455807137174e-0

[32m[I 2022-06-24 00:08:01,232][0m Trial 63 finished with value: 0.8597860226074237 and parameters: {'scalers': 'standard', 'lambda_l1': 2.640054204056451e-08, 'lambda_l2': 2.10337760151054e-08, 'num_leaves': 184, 'feature_fraction': 0.7129508948932183, 'bagging_fraction': 0.4052925742915207, 'bagging_freq': 1, 'min_child_samples': 22, 'max_depth': 665}. Best is trial 24 with value: 0.870197741929553.[0m
[32m[I 2022-06-24 00:08:01,819][0m Trial 64 finished with value: 0.5 and parameters: {'scalers': 'standard', 'lambda_l1': 5.72724282074693e-08, 'lambda_l2': 1.6165174186927202e-07, 'num_leaves': 157, 'feature_fraction': 0.7290473533412141, 'bagging_fraction': 0.4385790711428784, 'bagging_freq': 1, 'min_child_samples': 32, 'max_depth': 750}. Best is trial 24 with value: 0.870197741929553.[0m
[32m[I 2022-06-24 00:08:02,303][0m Trial 65 finished with value: 0.5 and parameters: {'scalers': 'standard', 'lambda_l1': 2.358237908871582e-08, 'lambda_l2': 3.9888316362284076e-07, 'num_lea

[32m[I 2022-06-24 00:08:18,053][0m Trial 84 finished with value: 0.8640526208182941 and parameters: {'scalers': 'robust', 'lambda_l1': 8.569444484836788e-08, 'lambda_l2': 0.003779939269437846, 'num_leaves': 175, 'feature_fraction': 0.48316712380251475, 'bagging_fraction': 0.5208280746556682, 'bagging_freq': 1, 'min_child_samples': 19, 'max_depth': 309}. Best is trial 24 with value: 0.870197741929553.[0m
[32m[I 2022-06-24 00:08:18,612][0m Trial 85 finished with value: 0.7416672635500035 and parameters: {'scalers': 'robust', 'lambda_l1': 1.5643004554794204e-08, 'lambda_l2': 0.001078940432489709, 'num_leaves': 176, 'feature_fraction': 0.9055578204624273, 'bagging_fraction': 0.40177537631439764, 'bagging_freq': 1, 'min_child_samples': 26, 'max_depth': 311}. Best is trial 24 with value: 0.870197741929553.[0m
[32m[I 2022-06-24 00:08:19,400][0m Trial 86 finished with value: 0.8526485367702473 and parameters: {'scalers': 'robust', 'lambda_l1': 4.340889648922245e-08, 'lambda_l2': 0.0516

[32m[I 2022-06-24 00:08:38,348][0m Trial 105 finished with value: 0.8575949885812502 and parameters: {'scalers': 'robust', 'lambda_l1': 6.143209318456959e-08, 'lambda_l2': 0.023437625666072194, 'num_leaves': 123, 'feature_fraction': 0.8342478263367732, 'bagging_fraction': 0.4031627616780694, 'bagging_freq': 1, 'min_child_samples': 11, 'max_depth': 225}. Best is trial 24 with value: 0.870197741929553.[0m
[32m[I 2022-06-24 00:08:39,315][0m Trial 106 finished with value: 0.8489890327680576 and parameters: {'scalers': 'robust', 'lambda_l1': 1.0030812723247223e-08, 'lambda_l2': 0.0053953079813989545, 'num_leaves': 114, 'feature_fraction': 0.7877723157014915, 'bagging_fraction': 0.6931044440652246, 'bagging_freq': 1, 'min_child_samples': 15, 'max_depth': 163}. Best is trial 24 with value: 0.870197741929553.[0m
[32m[I 2022-06-24 00:08:40,201][0m Trial 107 finished with value: 0.8627445574506332 and parameters: {'scalers': 'robust', 'lambda_l1': 3.112841148044115e-08, 'lambda_l2': 0.00

[32m[I 2022-06-24 00:08:59,205][0m Trial 126 finished with value: 0.8629881753523395 and parameters: {'scalers': 'robust', 'lambda_l1': 8.643101790839507e-08, 'lambda_l2': 2.2214959529008746, 'num_leaves': 105, 'feature_fraction': 0.90879926145945, 'bagging_fraction': 0.5139137783824822, 'bagging_freq': 1, 'min_child_samples': 23, 'max_depth': 273}. Best is trial 24 with value: 0.870197741929553.[0m
[32m[I 2022-06-24 00:09:00,511][0m Trial 127 finished with value: 0.8615623654596177 and parameters: {'scalers': 'robust', 'lambda_l1': 2.2011745070006225e-07, 'lambda_l2': 6.220165955241483, 'num_leaves': 93, 'feature_fraction': 0.897997434720152, 'bagging_fraction': 0.45311565439632295, 'bagging_freq': 1, 'min_child_samples': 17, 'max_depth': 374}. Best is trial 24 with value: 0.870197741929553.[0m
[32m[I 2022-06-24 00:09:01,282][0m Trial 128 finished with value: 0.8196744114588544 and parameters: {'scalers': 'robust', 'lambda_l1': 6.18218828432302e-08, 'lambda_l2': 2.604218242805

[32m[I 2022-06-24 00:09:20,110][0m Trial 147 finished with value: 0.861736772987934 and parameters: {'scalers': 'robust', 'lambda_l1': 1.4471247505814906e-07, 'lambda_l2': 0.16275253742081833, 'num_leaves': 45, 'feature_fraction': 0.9862832556493635, 'bagging_fraction': 0.47441964847629536, 'bagging_freq': 1, 'min_child_samples': 14, 'max_depth': 359}. Best is trial 24 with value: 0.870197741929553.[0m
[32m[I 2022-06-24 00:09:20,992][0m Trial 148 finished with value: 0.8637761573866449 and parameters: {'scalers': 'robust', 'lambda_l1': 7.347934666962333e-07, 'lambda_l2': 0.5300633048438841, 'num_leaves': 32, 'feature_fraction': 0.9320757435649505, 'bagging_fraction': 0.42683254299012763, 'bagging_freq': 1, 'min_child_samples': 19, 'max_depth': 325}. Best is trial 24 with value: 0.870197741929553.[0m
[32m[I 2022-06-24 00:09:22,608][0m Trial 149 finished with value: 0.8615939927590469 and parameters: {'scalers': 'robust', 'lambda_l1': 3.927723673126104e-07, 'lambda_l2': 0.2903275

[32m[I 2022-06-24 00:09:47,756][0m Trial 168 finished with value: 0.8632264818944385 and parameters: {'scalers': 'minmax', 'lambda_l1': 3.5195009750714974e-06, 'lambda_l2': 0.470287007833321, 'num_leaves': 59, 'feature_fraction': 0.9362316772976995, 'bagging_fraction': 0.4529041697785809, 'bagging_freq': 1, 'min_child_samples': 13, 'max_depth': 408}. Best is trial 24 with value: 0.870197741929553.[0m
[32m[I 2022-06-24 00:09:48,876][0m Trial 169 finished with value: 0.8614011064829563 and parameters: {'scalers': 'minmax', 'lambda_l1': 4.1529020090961666e-07, 'lambda_l2': 3.4336462323520487, 'num_leaves': 23, 'feature_fraction': 0.9257608926455617, 'bagging_fraction': 0.42746927798405987, 'bagging_freq': 1, 'min_child_samples': 16, 'max_depth': 435}. Best is trial 24 with value: 0.870197741929553.[0m
[32m[I 2022-06-24 00:09:50,259][0m Trial 170 finished with value: 0.8613305787564688 and parameters: {'scalers': 'minmax', 'lambda_l1': 5.127571689478989e-07, 'lambda_l2': 0.99278697

[32m[I 2022-06-24 00:10:12,029][0m Trial 189 finished with value: 0.862092874273989 and parameters: {'scalers': 'robust', 'lambda_l1': 6.858374700285733e-07, 'lambda_l2': 0.5353781031028702, 'num_leaves': 43, 'feature_fraction': 0.8649218241500364, 'bagging_fraction': 0.4614168140430328, 'bagging_freq': 1, 'min_child_samples': 22, 'max_depth': 452}. Best is trial 179 with value: 0.8708616194177339.[0m
[32m[I 2022-06-24 00:10:13,268][0m Trial 190 finished with value: 0.8685445969793879 and parameters: {'scalers': 'robust', 'lambda_l1': 9.510654927268586e-07, 'lambda_l2': 1.0786820250163967, 'num_leaves': 96, 'feature_fraction': 0.9231085103495782, 'bagging_fraction': 0.4887119362360643, 'bagging_freq': 1, 'min_child_samples': 19, 'max_depth': 342}. Best is trial 179 with value: 0.8708616194177339.[0m
[32m[I 2022-06-24 00:10:14,442][0m Trial 191 finished with value: 0.8681475439550131 and parameters: {'scalers': 'robust', 'lambda_l1': 1.1270261093593196e-06, 'lambda_l2': 1.044218

[32m[I 2022-06-24 00:10:31,330][0m Trial 210 finished with value: 0.8668631846309864 and parameters: {'scalers': 'robust', 'lambda_l1': 2.624478904117859e-07, 'lambda_l2': 0.31126828018582475, 'num_leaves': 28, 'feature_fraction': 0.891797805435693, 'bagging_fraction': 0.4805578193530731, 'bagging_freq': 1, 'min_child_samples': 23, 'max_depth': 260}. Best is trial 179 with value: 0.8708616194177339.[0m
[32m[I 2022-06-24 00:10:31,956][0m Trial 211 finished with value: 0.8678655838915127 and parameters: {'scalers': 'robust', 'lambda_l1': 3.3230502013634666e-07, 'lambda_l2': 0.28131679794067693, 'num_leaves': 27, 'feature_fraction': 0.8920777604008847, 'bagging_fraction': 0.4654767351370562, 'bagging_freq': 1, 'min_child_samples': 23, 'max_depth': 263}. Best is trial 179 with value: 0.8708616194177339.[0m
[32m[I 2022-06-24 00:10:32,974][0m Trial 212 finished with value: 0.8668979360422859 and parameters: {'scalers': 'robust', 'lambda_l1': 2.9363425527970954e-07, 'lambda_l2': 0.270

[32m[I 2022-06-24 00:10:46,943][0m Trial 230 finished with value: 0.8640739634462545 and parameters: {'scalers': 'standard', 'lambda_l1': 1.0234060503834778e-07, 'lambda_l2': 0.39248313282187064, 'num_leaves': 3, 'feature_fraction': 0.9278944855234093, 'bagging_fraction': 0.548527774331145, 'bagging_freq': 1, 'min_child_samples': 27, 'max_depth': 269}. Best is trial 179 with value: 0.8708616194177339.[0m
[32m[I 2022-06-24 00:10:47,736][0m Trial 231 finished with value: 0.8601418256236368 and parameters: {'scalers': 'standard', 'lambda_l1': 1.3703318663773002e-06, 'lambda_l2': 1.1938522594962715, 'num_leaves': 7, 'feature_fraction': 0.9367080054552612, 'bagging_fraction': 0.5350553914871468, 'bagging_freq': 1, 'min_child_samples': 18, 'max_depth': 286}. Best is trial 179 with value: 0.8708616194177339.[0m
[32m[I 2022-06-24 00:10:48,532][0m Trial 232 finished with value: 0.865513608660861 and parameters: {'scalers': 'standard', 'lambda_l1': 5.64824423796149e-07, 'lambda_l2': 0.63

[32m[I 2022-06-24 00:11:02,290][0m Trial 250 finished with value: 0.8623456175317631 and parameters: {'scalers': 'robust', 'lambda_l1': 1.5975178750812e-06, 'lambda_l2': 2.5272808757425342, 'num_leaves': 15, 'feature_fraction': 0.9872437031542555, 'bagging_fraction': 0.5005900645768526, 'bagging_freq': 1, 'min_child_samples': 20, 'max_depth': 254}. Best is trial 179 with value: 0.8708616194177339.[0m
[32m[I 2022-06-24 00:11:03,194][0m Trial 251 finished with value: 0.860433962000332 and parameters: {'scalers': 'robust', 'lambda_l1': 1.6414879572661718e-07, 'lambda_l2': 1.236618683326059, 'num_leaves': 82, 'feature_fraction': 0.9148140377067255, 'bagging_fraction': 0.4843364265105953, 'bagging_freq': 1, 'min_child_samples': 18, 'max_depth': 224}. Best is trial 179 with value: 0.8708616194177339.[0m
[32m[I 2022-06-24 00:11:04,161][0m Trial 252 finished with value: 0.865285840814672 and parameters: {'scalers': 'robust', 'lambda_l1': 8.263003779796827e-07, 'lambda_l2': 0.5704910098

[32m[I 2022-06-24 00:11:24,926][0m Trial 271 finished with value: 0.8408414216932018 and parameters: {'scalers': 'robust', 'lambda_l1': 9.086737892505517e-08, 'lambda_l2': 1.3070902135284155, 'num_leaves': 104, 'feature_fraction': 0.9147162449502293, 'bagging_fraction': 0.9031516903236207, 'bagging_freq': 1, 'min_child_samples': 19, 'max_depth': 358}. Best is trial 179 with value: 0.8708616194177339.[0m
[32m[I 2022-06-24 00:11:26,034][0m Trial 272 finished with value: 0.8664586433088755 and parameters: {'scalers': 'robust', 'lambda_l1': 2.1591344692230616e-07, 'lambda_l2': 0.5299663145730305, 'num_leaves': 81, 'feature_fraction': 0.8855130225824943, 'bagging_fraction': 0.4514243655075606, 'bagging_freq': 1, 'min_child_samples': 22, 'max_depth': 340}. Best is trial 179 with value: 0.8708616194177339.[0m
[32m[I 2022-06-24 00:11:28,171][0m Trial 273 finished with value: 0.8411208536475563 and parameters: {'scalers': 'robust', 'lambda_l1': 1.3940322832540772e-07, 'lambda_l2': 3.194

[32m[I 2022-06-24 00:11:45,863][0m Trial 292 finished with value: 0.8718675463897988 and parameters: {'scalers': 'standard', 'lambda_l1': 1.580293918334291e-07, 'lambda_l2': 0.0004709721620374175, 'num_leaves': 31, 'feature_fraction': 0.9644780321403139, 'bagging_fraction': 0.5249470583723804, 'bagging_freq': 1, 'min_child_samples': 27, 'max_depth': 319}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:11:46,855][0m Trial 293 finished with value: 0.8546698008733614 and parameters: {'scalers': 'standard', 'lambda_l1': 1.7137391502496216e-07, 'lambda_l2': 1.3169074715553096e-05, 'num_leaves': 38, 'feature_fraction': 0.9685050066153338, 'bagging_fraction': 0.5239927500614322, 'bagging_freq': 2, 'min_child_samples': 28, 'max_depth': 323}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:11:48,661][0m Trial 294 finished with value: 0.8634514357546478 and parameters: {'scalers': 'standard', 'lambda_l1': 6.593597006993443e-08, 'lambda

[32m[I 2022-06-24 00:12:06,550][0m Trial 312 finished with value: 0.8620518711809346 and parameters: {'scalers': 'robust', 'lambda_l1': 1.644670048768301e-07, 'lambda_l2': 0.66863426034767, 'num_leaves': 158, 'feature_fraction': 0.9872411131964395, 'bagging_fraction': 0.48990818863487373, 'bagging_freq': 6, 'min_child_samples': 19, 'max_depth': 294}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:12:07,900][0m Trial 313 finished with value: 0.8675546363466257 and parameters: {'scalers': 'robust', 'lambda_l1': 1.2198260037203145e-07, 'lambda_l2': 1.0365299537595376, 'num_leaves': 165, 'feature_fraction': 0.9777471435592027, 'bagging_fraction': 0.4016350586933666, 'bagging_freq': 1, 'min_child_samples': 16, 'max_depth': 340}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:12:09,103][0m Trial 314 finished with value: 0.8577826238733748 and parameters: {'scalers': 'standard', 'lambda_l1': 8.187350207711399e-08, 'lambda_l2': 0.37

[32m[I 2022-06-24 00:12:29,484][0m Trial 332 finished with value: 0.8623160994707049 and parameters: {'scalers': 'standard', 'lambda_l1': 7.068771634985539e-07, 'lambda_l2': 1.3523643056162904, 'num_leaves': 170, 'feature_fraction': 0.9139050334088719, 'bagging_fraction': 0.5241071631597108, 'bagging_freq': 1, 'min_child_samples': 25, 'max_depth': 248}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:12:30,291][0m Trial 333 finished with value: 0.8632743947444026 and parameters: {'scalers': 'robust', 'lambda_l1': 7.013502454932728e-08, 'lambda_l2': 0.47425314398120366, 'num_leaves': 161, 'feature_fraction': 0.9520793005106826, 'bagging_fraction': 0.4214718071689629, 'bagging_freq': 1, 'min_child_samples': 19, 'max_depth': 162}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:12:31,330][0m Trial 334 finished with value: 0.8682510807690761 and parameters: {'scalers': 'standard', 'lambda_l1': 1.5023658937013433e-06, 'lambda_l2': 

[32m[I 2022-06-24 00:12:47,649][0m Trial 353 finished with value: 0.8631558994840729 and parameters: {'scalers': 'standard', 'lambda_l1': 1.8205765905125908e-07, 'lambda_l2': 0.0007372418925814933, 'num_leaves': 90, 'feature_fraction': 0.8376719312222847, 'bagging_fraction': 0.5568756151913118, 'bagging_freq': 1, 'min_child_samples': 20, 'max_depth': 292}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:12:48,251][0m Trial 354 finished with value: 0.8533309867591988 and parameters: {'scalers': 'standard', 'lambda_l1': 4.0596774474084667e-07, 'lambda_l2': 0.00021841899703054217, 'num_leaves': 32, 'feature_fraction': 0.89740358629734, 'bagging_fraction': 0.4598091184133699, 'bagging_freq': 4, 'min_child_samples': 24, 'max_depth': 195}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:12:49,043][0m Trial 355 finished with value: 0.8644394941573735 and parameters: {'scalers': 'standard', 'lambda_l1': 2.8058151824573615e-08, 'lambda

[32m[I 2022-06-24 00:13:08,943][0m Trial 373 finished with value: 0.5 and parameters: {'scalers': 'robust', 'lambda_l1': 0.016244786250354862, 'lambda_l2': 0.0029009009819546895, 'num_leaves': 6, 'feature_fraction': 0.8618044160572558, 'bagging_fraction': 0.44613000958437565, 'bagging_freq': 2, 'min_child_samples': 56, 'max_depth': 316}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:13:09,943][0m Trial 374 finished with value: 0.8622742288885865 and parameters: {'scalers': 'standard', 'lambda_l1': 2.892298325229975e-07, 'lambda_l2': 0.00038459295462934504, 'num_leaves': 36, 'feature_fraction': 0.9445273601355417, 'bagging_fraction': 0.4911159126062896, 'bagging_freq': 1, 'min_child_samples': 15, 'max_depth': 345}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:13:10,849][0m Trial 375 finished with value: 0.8583604401274679 and parameters: {'scalers': 'robust', 'lambda_l1': 6.239342574450905e-07, 'lambda_l2': 0.8376314820318

[32m[I 2022-06-24 00:13:26,415][0m Trial 394 finished with value: 0.8571937065772205 and parameters: {'scalers': 'standard', 'lambda_l1': 1.6174278803529076e-08, 'lambda_l2': 0.4027776854564583, 'num_leaves': 11, 'feature_fraction': 0.9869466448439408, 'bagging_fraction': 0.5355462795727463, 'bagging_freq': 3, 'min_child_samples': 23, 'max_depth': 338}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:13:26,709][0m Trial 395 finished with value: 0.5 and parameters: {'scalers': 'standard', 'lambda_l1': 3.56985533133441e-07, 'lambda_l2': 0.5688748947725494, 'num_leaves': 71, 'feature_fraction': 0.962153386120586, 'bagging_fraction': 0.5799087196566228, 'bagging_freq': 1, 'min_child_samples': 61, 'max_depth': 187}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:13:27,516][0m Trial 396 finished with value: 0.8650798282814536 and parameters: {'scalers': 'robust', 'lambda_l1': 1.0038854204229754e-07, 'lambda_l2': 1.9428666382821536,

[32m[I 2022-06-24 00:13:44,203][0m Trial 415 finished with value: 0.867273502361931 and parameters: {'scalers': 'robust', 'lambda_l1': 4.567564657823748e-07, 'lambda_l2': 1.0309303053758623, 'num_leaves': 136, 'feature_fraction': 0.8559907488727889, 'bagging_fraction': 0.4532189792511736, 'bagging_freq': 1, 'min_child_samples': 18, 'max_depth': 312}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:13:45,144][0m Trial 416 finished with value: 0.855779762033632 and parameters: {'scalers': 'standard', 'lambda_l1': 1.3866430297754027e-06, 'lambda_l2': 0.49330349439851795, 'num_leaves': 27, 'feature_fraction': 0.9454132984200015, 'bagging_fraction': 0.5544379225831841, 'bagging_freq': 2, 'min_child_samples': 12, 'max_depth': 274}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:13:46,041][0m Trial 417 finished with value: 0.8415174342556309 and parameters: {'scalers': 'robust', 'lambda_l1': 6.727195594522539e-07, 'lambda_l2': 0.289

[32m[I 2022-06-24 00:14:03,030][0m Trial 436 finished with value: 0.8601204843273342 and parameters: {'scalers': 'robust', 'lambda_l1': 1.1174549342978452e-07, 'lambda_l2': 2.2714201909080423, 'num_leaves': 233, 'feature_fraction': 0.9135453525812575, 'bagging_fraction': 0.6408961740140573, 'bagging_freq': 2, 'min_child_samples': 21, 'max_depth': 381}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:14:03,844][0m Trial 437 finished with value: 0.8657390188715653 and parameters: {'scalers': 'standard', 'lambda_l1': 4.812127031401299e-06, 'lambda_l2': 0.7689851018072759, 'num_leaves': 30, 'feature_fraction': 0.9371356197225286, 'bagging_fraction': 0.4872149732209391, 'bagging_freq': 1, 'min_child_samples': 19, 'max_depth': 347}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:14:04,668][0m Trial 438 finished with value: 0.8667566041199943 and parameters: {'scalers': 'robust', 'lambda_l1': 1.6186092503713917e-06, 'lambda_l2': 0.4

[32m[I 2022-06-24 00:14:19,392][0m Trial 456 finished with value: 0.8628238760938065 and parameters: {'scalers': 'robust', 'lambda_l1': 5.669313556366672e-07, 'lambda_l2': 1.0848805461650514e-07, 'num_leaves': 24, 'feature_fraction': 0.9298998657760505, 'bagging_fraction': 0.48393097305090405, 'bagging_freq': 1, 'min_child_samples': 22, 'max_depth': 351}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:14:19,823][0m Trial 457 finished with value: 0.7569378375752216 and parameters: {'scalers': 'standard', 'lambda_l1': 1.2873625751923197e-07, 'lambda_l2': 0.06055678155514418, 'num_leaves': 108, 'feature_fraction': 0.9165123407076817, 'bagging_fraction': 0.4007256886105837, 'bagging_freq': 2, 'min_child_samples': 26, 'max_depth': 294}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:14:20,774][0m Trial 458 finished with value: 0.8656200601084503 and parameters: {'scalers': 'robust', 'lambda_l1': 1.728333673601931e-06, 'lambda_l2'

[32m[I 2022-06-24 00:14:35,313][0m Trial 476 finished with value: 0.8525593227506926 and parameters: {'scalers': 'standard', 'lambda_l1': 3.655023538203521e-07, 'lambda_l2': 1.8925273785925903e-05, 'num_leaves': 35, 'feature_fraction': 0.9491278433284144, 'bagging_fraction': 0.5270579409131134, 'bagging_freq': 1, 'min_child_samples': 5, 'max_depth': 276}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:14:36,156][0m Trial 477 finished with value: 0.8645600994507728 and parameters: {'scalers': 'robust', 'lambda_l1': 1.2159032131384854e-07, 'lambda_l2': 0.6569944434810124, 'num_leaves': 96, 'feature_fraction': 0.8606045208298949, 'bagging_fraction': 0.5022703561071765, 'bagging_freq': 2, 'min_child_samples': 23, 'max_depth': 333}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:14:36,808][0m Trial 478 finished with value: 0.8663079462676986 and parameters: {'scalers': 'standard', 'lambda_l1': 3.3678297103803053e-06, 'lambda_l2':

[32m[I 2022-06-24 00:14:54,935][0m Trial 497 finished with value: 0.8663799573749266 and parameters: {'scalers': 'robust', 'lambda_l1': 5.427335872538109e-07, 'lambda_l2': 1.7704879606874941, 'num_leaves': 57, 'feature_fraction': 0.8610745154107482, 'bagging_fraction': 0.4319905870341003, 'bagging_freq': 1, 'min_child_samples': 12, 'max_depth': 480}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:14:55,999][0m Trial 498 finished with value: 0.8557544824886156 and parameters: {'scalers': 'robust', 'lambda_l1': 8.556695614332493e-07, 'lambda_l2': 0.00021356760518056062, 'num_leaves': 51, 'feature_fraction': 0.8284412766724575, 'bagging_fraction': 0.448774866217654, 'bagging_freq': 1, 'min_child_samples': 16, 'max_depth': 406}. Best is trial 292 with value: 0.8718675463897988.[0m
[32m[I 2022-06-24 00:14:56,990][0m Trial 499 finished with value: 0.8582455141178054 and parameters: {'scalers': 'robust', 'lambda_l1': 1.3118950510214943e-06, 'lambda_l2': 4.66

In [225]:
# Get best trial based on metric score
trial = study.best_trial

In [226]:
# Print params for best trial
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

    scalers: standard
    lambda_l1: 1.580293918334291e-07
    lambda_l2: 0.0004709721620374175
    num_leaves: 31
    feature_fraction: 0.9644780321403139
    bagging_fraction: 0.5249470583723804
    bagging_freq: 1
    min_child_samples: 27
    max_depth: 319


In [227]:
# Best Score from HP Opt
trial.values[0]

0.8718675463897988

In [228]:
gbm_opt = lgb.LGBMClassifier(random_state=SEED, **trial.params)

- Pré selecionar as features mais importantes auxiliou a obter uma performance ainda melhor a partir do pipeline com Robust Scaler e LightGBM

In [229]:
X_transform_test = feat_selector.transform(np.array(X_test))

In [230]:
X_transform_test.shape

(64, 33)

In [231]:
gbm_opt.fit(X_transform,y)



In [232]:
y_pred = gbm_opt.predict(X_transform_test)

In [233]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        MILD       0.72      0.81      0.76        42
      SEVERE       0.53      0.41      0.46        22

    accuracy                           0.67        64
   macro avg       0.63      0.61      0.61        64
weighted avg       0.66      0.67      0.66        64



### Testing other algorithms 

In [179]:
X_test, y_test = test_mod.drop("Group", axis=1), test_mod["Group"]
X, y = train_mod.drop("Group", axis=1), train_mod["Group"]

###### Get most important features from boruta

In [180]:
# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=SEED, )
feat_selector.fit(np.array(X), y)
print("Number of selected features: ", feat_selector.n_features_)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	546
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	546
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	546
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	546
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	546
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	546
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	546
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	70
Rejected: 	476
Iteration: 	9 / 100
Confirmed: 	22
Tentative: 	48
Rejected: 	476
Iteration: 	10 / 100
Confirmed: 	22
Tentative: 	48
Rejected: 	476
Iteration: 	11 / 100
Confirmed: 	22
Tentative: 	48
Rejected: 	476
Iteration: 	12 / 100
Confirmed: 	25
Tentative: 	36
Rejected: 	485
Iteration: 	13 / 100
Confirmed: 	25
Tentative: 	36
Rejected: 	485
Iteration: 	14 / 100
Confirmed: 	25
Tentative: 	36
Rejected: 	485
Iteration: 	15 / 100
Confirmed: 	25
Tentative: 	36
Rejected: 	485
Iteration: 	16 / 100
Confirmed: 	2

In [181]:
# Filter most importante features
X_transform = feat_selector.transform(np.array(X))
X_transform_test = feat_selector.transform(np.array(X_test))

In [182]:
gbc = GradientBoostingClassifier(random_state = SEED)

In [183]:
parameters = {'max_depth':list(range(1,100,1)), 'subsample':list(np.arange(0.05,1,0.05)),
             'learning_rate':list(np.arange(0.05,1,0.05)), 'n_estimators':list(range(50,500,25)), }

In [184]:
rscv_gbc = RandomizedSearchCV(gbc, parameters, scoring=metric_scorer, n_jobs = -1, verbose = 4, n_iter = 200)

In [185]:
scores_gbc = rscv_gbc.fit(X,y)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [186]:
scores_gbc.best_score_

0.7781216577540107

In [187]:
y_pred = scores_gbc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        MILD       0.71      0.71      0.71        42
      SEVERE       0.45      0.45      0.45        22

    accuracy                           0.62        64
   macro avg       0.58      0.58      0.58        64
weighted avg       0.62      0.62      0.62        64



In [188]:
scores_gbc_boruta = rscv_gbc.fit(X_transform, y)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [189]:
scores_gbc_boruta.best_score_

0.844659090909091

In [190]:
scores_gbc_boruta.best_params_

{'subsample': 0.55,
 'n_estimators': 250,
 'max_depth': 56,
 'learning_rate': 0.9000000000000001}

In [191]:
y_pred = scores_gbc_boruta.predict(X_transform_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        MILD       0.68      0.62      0.65        42
      SEVERE       0.38      0.45      0.42        22

    accuracy                           0.56        64
   macro avg       0.53      0.54      0.53        64
weighted avg       0.58      0.56      0.57        64



In [192]:
lgbm = lgb.LGBMClassifier(random_state=SEED)
parameters = {'num_leaves':list(range(20,100,5)), 'min_child_samples':list(range(5,30,5)),'max_depth':list(range(-1,20,1)),
             'learning_rate':list(np.arange(0.05,1,0.05)),'reg_alpha':list(np.arange(0,0.6, 0.1))}
rscv_lgbm = RandomizedSearchCV(lgbm, parameters, scoring=metric_scorer, n_jobs = -1, verbose = 4, n_iter = 200)

In [193]:
scores_lgbm = rscv_lgbm.fit(X, y)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [194]:
scores_lgbm.best_score_

0.7836363636363636

In [195]:
X_test.shape

(64, 546)

In [196]:
y_pred = scores_lgbm.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        MILD       0.70      0.74      0.72        42
      SEVERE       0.45      0.41      0.43        22

    accuracy                           0.62        64
   macro avg       0.58      0.57      0.57        64
weighted avg       0.62      0.62      0.62        64



In [197]:
scores_lgbm_boruta = rscv_lgbm.fit(X_transform, y)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [198]:
scores_lgbm_boruta.best_score_

0.8555949197860961

In [199]:
X_transform_test.shape

(64, 33)

In [200]:
y_pred = scores_lgbm_boruta.predict(X_transform_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        MILD       0.66      0.64      0.65        42
      SEVERE       0.35      0.36      0.36        22

    accuracy                           0.55        64
   macro avg       0.50      0.50      0.50        64
weighted avg       0.55      0.55      0.55        64



In [211]:
svc_poly = SVC(random_state=SEED)
#parameters = {'C': list(np.arange(1.0,100,0.25)), 'coef0': list(np.arange(1.0,100,0.25)), 'degree': list(range(1,20,1)),
              #'shrinking': ['True', 'False'], 'class_weight': ['balanced'], 'gamma': ['scale','auto'] }

params = {'kernel': ['rbf'],
          'degree': np.linspace(1, 8, 8),
          'C': np.logspace(-3, 5, 40),
          'gamma': np.logspace(-3, 5, 30),
         'class_weight': ['balanced']}
rsvc_svc_poly = RandomizedSearchCV(svc_poly, params, scoring=metric_scorer, n_jobs = -1, verbose = 4, n_iter = 500)

In [212]:
scores = rsvc_svc_poly.fit(X, y)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


In [213]:
scores.best_params_

{'kernel': 'rbf',
 'gamma': 1.0826367338740541,
 'degree': 4.0,
 'class_weight': 'balanced',
 'C': 5878.0160722749115}

In [214]:
y_pred = scores.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        MILD       0.68      0.62      0.65        42
      SEVERE       0.38      0.45      0.42        22

    accuracy                           0.56        64
   macro avg       0.53      0.54      0.53        64
weighted avg       0.58      0.56      0.57        64



In [215]:
scores = rsvc_svc_poly.fit(X_transform, y)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


In [216]:
y_pred = scores.predict(X_transform_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        MILD       0.63      0.52      0.57        42
      SEVERE       0.31      0.41      0.35        22

    accuracy                           0.48        64
   macro avg       0.47      0.47      0.46        64
weighted avg       0.52      0.48      0.50        64

