# Notebook to Implement Model Training - LGBM

---

### 1) Setup

In [283]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

from scipy import stats
from scipy.stats import wilcoxon

from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

import optuna
import lightgbm as lgb

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

In [2]:
SEED = 42

In [3]:
TRAIN_CLINICAL_FILENAME = "train_set.csv"
TEST_CLINICAL_FILENAME = "test_set.csv"

---

### 2) Read and Preprocess Data

In [4]:
train = pd.read_csv(TRAIN_CLINICAL_FILENAME, sep=",", index_col="ID")

In [5]:
train.shape

(132, 636)

In [6]:
train.head()

Unnamed: 0_level_0,Freq.1198.05707939985,Freq.1204.07148226988,Freq.1211.25531677913,Freq.1217.79534957812,Freq.1223.24180676615,Freq.1234.42874922858,Freq.1239.04873179697,Freq.1243.98934968702,Freq.1249.10239635402,Freq.1254.7097579948,...,Freq.8943.76551923189,Freq.9058.85825530971,Freq.9098.58510797401,Freq.9437.74469644083,Freq.9593.90405666006,Freq.9799.842201746,Freq.10432.4853106264,Freq.11006.9514551194,Freq.11161.31855876,Group
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000319,0.000416,7.7e-05,4.03414e-05,8.9e-05,0.000282,4e-06,0.00013,0.000288,0.0,...,1.7e-05,5e-06,2e-05,2.9e-05,3.7e-05,2.4e-05,1.6e-05,3e-05,3e-05,MILD
10,0.0,0.0,2.1e-05,0.001064014,0.000113,0.001591,0.01631,0.00233,0.000476,0.0006074661,...,4e-05,1e-05,1e-06,5e-06,2e-06,5.2e-05,3e-06,0.000155,1e-06,MILD
100,0.000199,6e-06,1.3e-05,5.089552e-08,8.7e-05,0.000543,0.000231,6.8e-05,1.8e-05,0.0001286428,...,9e-06,4.4e-05,1.2e-05,6.7e-05,0.000207,2e-05,2.7e-05,0.0001,0.0001,MILD
101,0.000196,0.000244,6.9e-05,4.874671e-05,0.000125,0.000353,2.9e-05,0.000125,0.00012,4.988436e-07,...,7.2e-05,5.1e-05,1.3e-05,6.1e-05,4e-05,8e-06,3e-06,5.5e-05,2.9e-05,MILD
105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.7e-05,3.1e-05,1.4e-05,1.7e-05,2.8e-05,9e-06,1.2e-05,2.6e-05,1.7e-05,MILD


In [7]:
##### Preprocessing

train_mod = train.copy()
# Imputation using mean method
#train_mod = train_mod.replace(0,np.nan)
#train_mod_imp = train_mod.transform(lambda x: x.fillna(x.mean()))

In [8]:
train_mod.shape

(132, 636)

In [9]:
train_mod.head()

Unnamed: 0_level_0,Freq.1198.05707939985,Freq.1204.07148226988,Freq.1211.25531677913,Freq.1217.79534957812,Freq.1223.24180676615,Freq.1234.42874922858,Freq.1239.04873179697,Freq.1243.98934968702,Freq.1249.10239635402,Freq.1254.7097579948,...,Freq.8943.76551923189,Freq.9058.85825530971,Freq.9098.58510797401,Freq.9437.74469644083,Freq.9593.90405666006,Freq.9799.842201746,Freq.10432.4853106264,Freq.11006.9514551194,Freq.11161.31855876,Group
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000319,0.000416,7.7e-05,4.03414e-05,8.9e-05,0.000282,4e-06,0.00013,0.000288,0.0,...,1.7e-05,5e-06,2e-05,2.9e-05,3.7e-05,2.4e-05,1.6e-05,3e-05,3e-05,MILD
10,0.0,0.0,2.1e-05,0.001064014,0.000113,0.001591,0.01631,0.00233,0.000476,0.0006074661,...,4e-05,1e-05,1e-06,5e-06,2e-06,5.2e-05,3e-06,0.000155,1e-06,MILD
100,0.000199,6e-06,1.3e-05,5.089552e-08,8.7e-05,0.000543,0.000231,6.8e-05,1.8e-05,0.0001286428,...,9e-06,4.4e-05,1.2e-05,6.7e-05,0.000207,2e-05,2.7e-05,0.0001,0.0001,MILD
101,0.000196,0.000244,6.9e-05,4.874671e-05,0.000125,0.000353,2.9e-05,0.000125,0.00012,4.988436e-07,...,7.2e-05,5.1e-05,1.3e-05,6.1e-05,4e-05,8e-06,3e-06,5.5e-05,2.9e-05,MILD
105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.7e-05,3.1e-05,1.4e-05,1.7e-05,2.8e-05,9e-06,1.2e-05,2.6e-05,1.7e-05,MILD


In [84]:
test = pd.read_csv(TEST_CLINICAL_FILENAME, sep=",", index_col="ID")
test_mod = test.copy()

In [85]:
X_test, y_test = test_mod.drop("Group", axis=1), test_mod["Group"]

---

### 3) Baseline Model Training and CV

In [10]:
# Define Classifier (or pipeline)
clf = lgb.LGBMClassifier(random_state=SEED)

In [12]:
# Get Features and Target
X, y = train_mod.drop("Group", axis=1), train_mod["Group"]

In [13]:
# Defining RepeatedKFold Cross Validator
rkf = RepeatedKFold(n_splits=5, n_repeats=20, random_state=SEED)

In [39]:
# Define metric scorer
metric_scorer = make_scorer(balanced_accuracy_score)
metric_scorer

make_scorer(balanced_accuracy_score)

In [15]:
# Cross validate model
scores = cross_val_score(clf, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1)

In [16]:
# Score from each CV Iteration
scores

array([0.82236842, 0.75      , 0.86842105, 0.73076923, 0.85119048,
       0.875     , 0.875     , 0.76190476, 0.7375    , 0.88888889,
       0.82954545, 0.79117647, 0.72556391, 0.72916667, 0.85625   ,
       0.83333333, 0.85      , 0.85714286, 0.80065359, 0.69281046,
       0.63333333, 0.88486842, 0.70606061, 0.8       , 0.81875   ,
       0.80833333, 0.88736264, 0.59722222, 0.69281046, 0.85947712,
       0.69444444, 0.83235294, 0.77819549, 0.78125   , 0.69281046,
       0.69602273, 0.78618421, 0.72619048, 0.56666667, 0.76785714,
       0.75      , 0.75      , 0.83030303, 0.83030303, 0.88194444,
       0.78021978, 0.90909091, 0.80392157, 0.7875    , 0.82330827,
       0.8       , 0.64166667, 0.7875    , 0.74509804, 0.81875   ,
       0.83238636, 0.75      , 0.70138889, 0.80357143, 0.75      ,
       0.72058824, 0.69444444, 0.80952381, 0.84722222, 0.64285714,
       0.68333333, 0.8       , 0.88194444, 0.72916667, 0.83030303,
       0.78693182, 0.79117647, 0.6875    , 0.91176471, 0.76363

In [17]:
# Mean Metric Value
np.mean(scores)

0.7818376105748396

# Feature reduction

In [71]:
train_mod_stats = train_mod.drop('Group', axis = 1)


In [72]:
df = train_mod.copy()
peaks_list = []
pval_list = []
df2 = pd.melt(df, id_vars = ["Group"])
mild = df.where(df.Group == "MILD").dropna()
sev = df.where(df.Group == "SEVERE").dropna()
for i in range(1,(len(df.columns)-1)):
    s, p1 = stats.mannwhitneyu(mild.iloc[:,i], sev.iloc[:,i], alternative = 'two-sided')
    if p1 <= 0.05:
        pval_list.append(p1)
        peaks_list.append(df.columns[i])
peaks_list.append("Group")

In [87]:
df_filtered = df.drop(columns=[col for col in df if col not in peaks_list])
df_test_filtered = test.drop(columns=[col for col in test if col not in peaks_list])

In [89]:
df_test_filtered.shape

(64, 150)

---

In [97]:
# Get Features and Target
X, y = df_filtered.drop("Group", axis=1), df_filtered["Group"]
X_test, y_test = df_test_filtered.drop("Group", axis=1), df_test_filtered["Group"]
# Cross validate model
scores = cross_val_score(clf, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1)
# Score from each CV Iteration
scores

array([0.79605263, 0.88888889, 0.82330827, 0.76923077, 0.96428571,
       0.88333333, 0.75986842, 0.80357143, 0.7375    , 0.88888889,
       0.79829545, 0.82058824, 0.7518797 , 0.75694444, 0.86875   ,
       0.88888889, 0.9       , 0.81547619, 0.82679739, 0.71568627,
       0.70833333, 0.88486842, 0.71818182, 0.8       , 0.81875   ,
       0.73333333, 0.88736264, 0.60416667, 0.71895425, 0.83006536,
       0.80555556, 0.83235294, 0.80451128, 0.875     , 0.72222222,
       0.78693182, 0.8125    , 0.79761905, 0.625     , 0.85119048,
       0.75      , 0.8       , 0.73939394, 0.86363636, 0.88194444,
       0.8543956 , 0.87784091, 0.83006536, 0.86875   , 0.82330827,
       0.9       , 0.725     , 0.80625   , 0.71568627, 0.8875    ,
       0.95454545, 0.8       , 0.70138889, 0.88095238, 0.7       ,
       0.72058824, 0.75      , 0.80952381, 0.78472222, 0.68452381,
       0.725     , 0.76666667, 0.81944444, 0.79166667, 0.84242424,
       0.86363636, 0.79117647, 0.75625   , 0.94117647, 0.76363

In [92]:
# Mean Metric Value
np.mean(scores)

0.8039530045547848

### 4) Experiments

##### 4.1) Hyper Parameter Optimization with Optuna

In [128]:
# Define objectiva funtion to maximize metric
def objective(trial):
    
    # Hyper Parameter Grid
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "n_estimators": trial.suggest_int("max_depth", 100, 1000)
    }
    # Create model with param trial
    clf = lgb.LGBMClassifier(random_state=SEED, **param)
    # Get CV Metric we want to maximize
    balanced_accuracy = np.mean(cross_val_score(clf, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1))
    
    return balanced_accuracy

In [129]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-06-21 15:45:39,282][0m A new study created in memory with name: no-name-e6416ef2-db4b-43fe-88d2-f515ea06571a[0m


In [130]:
study.optimize(objective, n_trials=500)

[32m[I 2022-06-21 15:46:12,124][0m Trial 0 finished with value: 0.5 and parameters: {'lambda_l1': 9.94178690130804e-07, 'lambda_l2': 6.396882700818826e-07, 'num_leaves': 233, 'feature_fraction': 0.6538636920379493, 'bagging_fraction': 0.4738351942510842, 'bagging_freq': 5, 'min_child_samples': 34, 'max_depth': 822}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-06-21 15:46:15,559][0m Trial 1 finished with value: 0.8521005587120137 and parameters: {'lambda_l1': 0.013322054229437065, 'lambda_l2': 0.5502889088762963, 'num_leaves': 235, 'feature_fraction': 0.9108649897363823, 'bagging_fraction': 0.8063519516448048, 'bagging_freq': 5, 'min_child_samples': 44, 'max_depth': 362}. Best is trial 1 with value: 0.8521005587120137.[0m
[32m[I 2022-06-21 15:46:17,689][0m Trial 2 finished with value: 0.5 and parameters: {'lambda_l1': 0.03688252904299475, 'lambda_l2': 2.8541859675529306e-06, 'num_leaves': 225, 'feature_fraction': 0.8014690743573696, 'bagging_fraction': 0.5733983875978387, 'b

[32m[I 2022-06-21 15:47:30,200][0m Trial 22 finished with value: 0.7176173893830626 and parameters: {'lambda_l1': 1.250110199888723e-07, 'lambda_l2': 0.005649106517137179, 'num_leaves': 142, 'feature_fraction': 0.5441127177991123, 'bagging_fraction': 0.5363743250007063, 'bagging_freq': 3, 'min_child_samples': 34, 'max_depth': 486}. Best is trial 16 with value: 0.8811761166679898.[0m
[32m[I 2022-06-21 15:47:34,201][0m Trial 23 finished with value: 0.8714963405402648 and parameters: {'lambda_l1': 2.4073326244501874e-06, 'lambda_l2': 0.10512963899232872, 'num_leaves': 99, 'feature_fraction': 0.4786620653582086, 'bagging_fraction': 0.4024229090219423, 'bagging_freq': 4, 'min_child_samples': 17, 'max_depth': 526}. Best is trial 16 with value: 0.8811761166679898.[0m
[32m[I 2022-06-21 15:47:43,234][0m Trial 24 finished with value: 0.8511374290484198 and parameters: {'lambda_l1': 7.94983213297775e-05, 'lambda_l2': 0.002349543795169676, 'num_leaves': 187, 'feature_fraction': 0.576239430

[32m[I 2022-06-21 15:49:14,911][0m Trial 44 finished with value: 0.8681508221096069 and parameters: {'lambda_l1': 6.080460751761331e-07, 'lambda_l2': 0.0001878636759265941, 'num_leaves': 186, 'feature_fraction': 0.4050047261452406, 'bagging_fraction': 0.647933176578071, 'bagging_freq': 5, 'min_child_samples': 32, 'max_depth': 509}. Best is trial 16 with value: 0.8811761166679898.[0m
[32m[I 2022-06-21 15:49:16,793][0m Trial 45 finished with value: 0.5 and parameters: {'lambda_l1': 2.0458701382487298e-07, 'lambda_l2': 2.3280524436107897e-05, 'num_leaves': 232, 'feature_fraction': 0.5948706876621798, 'bagging_fraction': 0.42741963824656126, 'bagging_freq': 3, 'min_child_samples': 45, 'max_depth': 566}. Best is trial 16 with value: 0.8811761166679898.[0m
[32m[I 2022-06-21 15:49:20,575][0m Trial 46 finished with value: 0.8665419422338927 and parameters: {'lambda_l1': 2.15048050488611e-06, 'lambda_l2': 0.00698113438937916, 'num_leaves': 126, 'feature_fraction': 0.7425199592895938, 'b

[32m[I 2022-06-21 15:51:04,979][0m Trial 66 finished with value: 0.5 and parameters: {'lambda_l1': 2.7582919511685486e-06, 'lambda_l2': 0.10198250997548158, 'num_leaves': 180, 'feature_fraction': 0.42428969610501416, 'bagging_fraction': 0.4012261999929807, 'bagging_freq': 4, 'min_child_samples': 34, 'max_depth': 532}. Best is trial 16 with value: 0.8811761166679898.[0m
[32m[I 2022-06-21 15:51:09,102][0m Trial 67 finished with value: 0.8687060445574377 and parameters: {'lambda_l1': 7.349417058410406e-05, 'lambda_l2': 0.0006164976182081101, 'num_leaves': 146, 'feature_fraction': 0.48707408304042954, 'bagging_fraction': 0.6224373002862733, 'bagging_freq': 4, 'min_child_samples': 20, 'max_depth': 405}. Best is trial 16 with value: 0.8811761166679898.[0m
[32m[I 2022-06-21 15:51:15,061][0m Trial 68 finished with value: 0.8680702605143393 and parameters: {'lambda_l1': 1.1971148788182363e-06, 'lambda_l2': 2.0151149702599686e-07, 'num_leaves': 138, 'feature_fraction': 0.5433981203547671

[32m[I 2022-06-21 15:52:59,937][0m Trial 88 finished with value: 0.8799582241588818 and parameters: {'lambda_l1': 2.0197136013670465e-06, 'lambda_l2': 0.00022569588894087297, 'num_leaves': 190, 'feature_fraction': 0.4517570060682192, 'bagging_fraction': 0.5611025796797239, 'bagging_freq': 1, 'min_child_samples': 19, 'max_depth': 455}. Best is trial 16 with value: 0.8811761166679898.[0m
[32m[I 2022-06-21 15:53:04,392][0m Trial 89 finished with value: 0.8749241924226445 and parameters: {'lambda_l1': 6.174449190177766e-06, 'lambda_l2': 0.007360102391987431, 'num_leaves': 148, 'feature_fraction': 0.5686208663918747, 'bagging_fraction': 0.5498489044108533, 'bagging_freq': 1, 'min_child_samples': 19, 'max_depth': 445}. Best is trial 16 with value: 0.8811761166679898.[0m
[32m[I 2022-06-21 15:53:08,853][0m Trial 90 finished with value: 0.8623355753723402 and parameters: {'lambda_l1': 2.5772732476567496e-07, 'lambda_l2': 0.0025676598939141527, 'num_leaves': 107, 'feature_fraction': 0.50

[32m[I 2022-06-21 15:54:40,243][0m Trial 110 finished with value: 0.8593227061895793 and parameters: {'lambda_l1': 1.4404689534784058e-05, 'lambda_l2': 0.006410808073023914, 'num_leaves': 180, 'feature_fraction': 0.536232715588167, 'bagging_fraction': 0.6850284841777867, 'bagging_freq': 3, 'min_child_samples': 13, 'max_depth': 507}. Best is trial 16 with value: 0.8811761166679898.[0m
[32m[I 2022-06-21 15:54:44,715][0m Trial 111 finished with value: 0.8741796291728567 and parameters: {'lambda_l1': 5.500370559086233e-06, 'lambda_l2': 0.0003453127613507241, 'num_leaves': 152, 'feature_fraction': 0.4291256073435325, 'bagging_fraction': 0.5559300405668334, 'bagging_freq': 2, 'min_child_samples': 16, 'max_depth': 396}. Best is trial 16 with value: 0.8811761166679898.[0m
[32m[I 2022-06-21 15:54:48,988][0m Trial 112 finished with value: 0.8784033149083459 and parameters: {'lambda_l1': 8.57510234893784e-06, 'lambda_l2': 0.00041153131306933443, 'num_leaves': 133, 'feature_fraction': 0.42

[32m[I 2022-06-21 15:56:19,031][0m Trial 132 finished with value: 0.8744959601157357 and parameters: {'lambda_l1': 3.619029386670999e-06, 'lambda_l2': 0.0011335030735696843, 'num_leaves': 256, 'feature_fraction': 0.4535336781779985, 'bagging_fraction': 0.4428341821950199, 'bagging_freq': 1, 'min_child_samples': 12, 'max_depth': 699}. Best is trial 16 with value: 0.8811761166679898.[0m
[32m[I 2022-06-21 15:56:21,883][0m Trial 133 finished with value: 0.8642569700466374 and parameters: {'lambda_l1': 7.342286128142549e-06, 'lambda_l2': 0.0005104367960474656, 'num_leaves': 161, 'feature_fraction': 0.44428427324944486, 'bagging_fraction': 0.4308470376606492, 'bagging_freq': 1, 'min_child_samples': 17, 'max_depth': 299}. Best is trial 16 with value: 0.8811761166679898.[0m
[32m[I 2022-06-21 15:56:29,449][0m Trial 134 finished with value: 0.8646181417481728 and parameters: {'lambda_l1': 1.6331758734676134e-05, 'lambda_l2': 0.002546125312368245, 'num_leaves': 190, 'feature_fraction': 0.

[32m[I 2022-06-21 15:58:23,712][0m Trial 153 finished with value: 0.8769732680245451 and parameters: {'lambda_l1': 0.0019163044788775671, 'lambda_l2': 0.00020039947070291862, 'num_leaves': 97, 'feature_fraction': 0.4397731203029992, 'bagging_fraction': 0.4682462043157422, 'bagging_freq': 1, 'min_child_samples': 15, 'max_depth': 874}. Best is trial 150 with value: 0.8847808377982527.[0m
[32m[I 2022-06-21 15:58:31,174][0m Trial 154 finished with value: 0.8696024968538515 and parameters: {'lambda_l1': 0.0015810233402183767, 'lambda_l2': 9.39698185574301e-05, 'num_leaves': 89, 'feature_fraction': 0.43739900260838277, 'bagging_fraction': 0.48143251666169334, 'bagging_freq': 1, 'min_child_samples': 10, 'max_depth': 875}. Best is trial 150 with value: 0.8847808377982527.[0m
[32m[I 2022-06-21 15:58:37,604][0m Trial 155 finished with value: 0.8807305727253483 and parameters: {'lambda_l1': 0.0009111544773447673, 'lambda_l2': 0.0002594359762512202, 'num_leaves': 91, 'feature_fraction': 0.

[32m[I 2022-06-21 16:00:41,042][0m Trial 174 finished with value: 0.8845538057573661 and parameters: {'lambda_l1': 0.00047615659041224, 'lambda_l2': 0.0002740102838153565, 'num_leaves': 63, 'feature_fraction': 0.40034493505127705, 'bagging_fraction': 0.46324087950727383, 'bagging_freq': 1, 'min_child_samples': 17, 'max_depth': 817}. Best is trial 160 with value: 0.8861352395454563.[0m
[32m[I 2022-06-21 16:00:47,424][0m Trial 175 finished with value: 0.8758166859585202 and parameters: {'lambda_l1': 0.0004113684589210403, 'lambda_l2': 0.0005140829358191595, 'num_leaves': 72, 'feature_fraction': 0.4104325088315345, 'bagging_fraction': 0.44378311046657837, 'bagging_freq': 1, 'min_child_samples': 13, 'max_depth': 816}. Best is trial 160 with value: 0.8861352395454563.[0m
[32m[I 2022-06-21 16:00:53,108][0m Trial 176 finished with value: 0.8813795883124442 and parameters: {'lambda_l1': 0.0004900043252873543, 'lambda_l2': 0.0003117074949884513, 'num_leaves': 65, 'feature_fraction': 0.4

[32m[I 2022-06-21 16:02:42,705][0m Trial 195 finished with value: 0.882536862840462 and parameters: {'lambda_l1': 0.00016204353022279924, 'lambda_l2': 0.0005045029348427908, 'num_leaves': 35, 'feature_fraction': 0.4357928531022917, 'bagging_fraction': 0.4727293072202429, 'bagging_freq': 1, 'min_child_samples': 15, 'max_depth': 866}. Best is trial 183 with value: 0.8861708219970604.[0m
[32m[I 2022-06-21 16:02:48,383][0m Trial 196 finished with value: 0.8818699888647644 and parameters: {'lambda_l1': 0.00010153020406353105, 'lambda_l2': 0.0006694953756919065, 'num_leaves': 50, 'feature_fraction': 0.42562598210807145, 'bagging_fraction': 0.4698745231788326, 'bagging_freq': 1, 'min_child_samples': 18, 'max_depth': 848}. Best is trial 183 with value: 0.8861708219970604.[0m
[32m[I 2022-06-21 16:02:54,197][0m Trial 197 finished with value: 0.8851972307663096 and parameters: {'lambda_l1': 0.0001302117359865024, 'lambda_l2': 0.0011527750367602333, 'num_leaves': 38, 'feature_fraction': 0.

[32m[I 2022-06-21 16:04:48,057][0m Trial 216 finished with value: 0.8810954087622046 and parameters: {'lambda_l1': 0.00010852746376655703, 'lambda_l2': 0.0008039067535666632, 'num_leaves': 36, 'feature_fraction': 0.42077688799964685, 'bagging_fraction': 0.4705110681461432, 'bagging_freq': 1, 'min_child_samples': 16, 'max_depth': 884}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:04:54,031][0m Trial 217 finished with value: 0.8834331363081364 and parameters: {'lambda_l1': 0.00026043769670510087, 'lambda_l2': 0.00010376834214559606, 'num_leaves': 50, 'feature_fraction': 0.4579607596688159, 'bagging_fraction': 0.4964846737815323, 'bagging_freq': 1, 'min_child_samples': 19, 'max_depth': 922}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:04:59,735][0m Trial 218 finished with value: 0.88487977330547 and parameters: {'lambda_l1': 0.00021799735712551503, 'lambda_l2': 6.693146006058434e-05, 'num_leaves': 23, 'feature_fraction': 0

[32m[I 2022-06-21 16:06:50,613][0m Trial 237 finished with value: 0.8828730076897645 and parameters: {'lambda_l1': 0.0008681008797857587, 'lambda_l2': 0.0005537910906179976, 'num_leaves': 41, 'feature_fraction': 0.4206691547900913, 'bagging_fraction': 0.49643143316410443, 'bagging_freq': 1, 'min_child_samples': 15, 'max_depth': 822}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:06:57,852][0m Trial 238 finished with value: 0.8678476771121896 and parameters: {'lambda_l1': 0.001759253500926012, 'lambda_l2': 0.000354012985704083, 'num_leaves': 46, 'feature_fraction': 0.45357382951025016, 'bagging_fraction': 0.4856162258514311, 'bagging_freq': 1, 'min_child_samples': 11, 'max_depth': 824}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:07:04,250][0m Trial 239 finished with value: 0.8793157620243534 and parameters: {'lambda_l1': 0.0008826572729877014, 'lambda_l2': 0.000188014826664708, 'num_leaves': 39, 'feature_fraction': 0.433

[32m[I 2022-06-21 16:09:02,393][0m Trial 258 finished with value: 0.8615978402617334 and parameters: {'lambda_l1': 0.0008751063254425684, 'lambda_l2': 0.00025868216848322784, 'num_leaves': 61, 'feature_fraction': 0.4090513363252196, 'bagging_fraction': 0.505931858880416, 'bagging_freq': 1, 'min_child_samples': 9, 'max_depth': 826}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:09:10,201][0m Trial 259 finished with value: 0.8773333041442563 and parameters: {'lambda_l1': 0.0004664465582149631, 'lambda_l2': 0.00016385130419221546, 'num_leaves': 48, 'feature_fraction': 0.8096493137971674, 'bagging_fraction': 0.4573436614168412, 'bagging_freq': 1, 'min_child_samples': 15, 'max_depth': 976}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:09:11,589][0m Trial 260 finished with value: 0.5 and parameters: {'lambda_l1': 0.0016750277525315062, 'lambda_l2': 0.0005997609273536505, 'num_leaves': 41, 'feature_fraction': 0.4001740812872507,

[32m[I 2022-06-21 16:11:12,444][0m Trial 280 finished with value: 0.8723466030443585 and parameters: {'lambda_l1': 0.00014872880233022493, 'lambda_l2': 0.00029845033275225985, 'num_leaves': 56, 'feature_fraction': 0.42671296835263695, 'bagging_fraction': 0.49800205629419325, 'bagging_freq': 1, 'min_child_samples': 11, 'max_depth': 999}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:11:19,359][0m Trial 281 finished with value: 0.886527777047514 and parameters: {'lambda_l1': 0.000532090734002133, 'lambda_l2': 0.00014111513852081302, 'num_leaves': 47, 'feature_fraction': 0.4413426603454792, 'bagging_fraction': 0.4780290528133305, 'bagging_freq': 1, 'min_child_samples': 16, 'max_depth': 832}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:11:25,354][0m Trial 282 finished with value: 0.8765322045085979 and parameters: {'lambda_l1': 0.0005088273083002167, 'lambda_l2': 0.0001357010010811024, 'num_leaves': 49, 'feature_fraction': 0

[32m[I 2022-06-21 16:13:14,043][0m Trial 301 finished with value: 0.7503147248133701 and parameters: {'lambda_l1': 5.603217840518412, 'lambda_l2': 5.95019568515795e-05, 'num_leaves': 58, 'feature_fraction': 0.41222325846603525, 'bagging_fraction': 0.45694835382837035, 'bagging_freq': 1, 'min_child_samples': 21, 'max_depth': 854}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:13:21,277][0m Trial 302 finished with value: 0.8793519769703981 and parameters: {'lambda_l1': 0.0006734798875949753, 'lambda_l2': 0.00016359735937422642, 'num_leaves': 64, 'feature_fraction': 0.4470088739044536, 'bagging_fraction': 0.4741880028358359, 'bagging_freq': 1, 'min_child_samples': 17, 'max_depth': 842}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:13:25,599][0m Trial 303 finished with value: 0.87073052233714 and parameters: {'lambda_l1': 0.0009466154013695218, 'lambda_l2': 7.288361966482836e-05, 'num_leaves': 70, 'feature_fraction': 0.418285

[32m[I 2022-06-21 16:15:30,380][0m Trial 322 finished with value: 0.877102992187551 and parameters: {'lambda_l1': 0.00021161098629754184, 'lambda_l2': 5.459875504170399e-05, 'num_leaves': 32, 'feature_fraction': 0.43001712110296036, 'bagging_fraction': 0.41699260712404074, 'bagging_freq': 1, 'min_child_samples': 15, 'max_depth': 800}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:15:38,801][0m Trial 323 finished with value: 0.8750193734928642 and parameters: {'lambda_l1': 0.010161088159903468, 'lambda_l2': 0.0001236393311967647, 'num_leaves': 128, 'feature_fraction': 0.41408207827958077, 'bagging_fraction': 0.6555982299850718, 'bagging_freq': 1, 'min_child_samples': 23, 'max_depth': 939}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:15:45,114][0m Trial 324 finished with value: 0.8804045240749961 and parameters: {'lambda_l1': 0.000690698350505699, 'lambda_l2': 1.1885449455683662e-05, 'num_leaves': 62, 'feature_fraction': 0

[32m[I 2022-06-21 16:17:45,697][0m Trial 343 finished with value: 0.8792206098639922 and parameters: {'lambda_l1': 0.00025139619676091683, 'lambda_l2': 0.00012141495584990056, 'num_leaves': 48, 'feature_fraction': 0.44248270994299166, 'bagging_fraction': 0.4644963807042366, 'bagging_freq': 1, 'min_child_samples': 18, 'max_depth': 960}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:17:52,446][0m Trial 344 finished with value: 0.8845537459831191 and parameters: {'lambda_l1': 0.0004285779292180835, 'lambda_l2': 0.00044051611447943294, 'num_leaves': 55, 'feature_fraction': 0.4327126989943394, 'bagging_fraction': 0.48579625296872425, 'bagging_freq': 1, 'min_child_samples': 16, 'max_depth': 875}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:18:01,332][0m Trial 345 finished with value: 0.8810863409251568 and parameters: {'lambda_l1': 0.0004911088386517125, 'lambda_l2': 0.000789185838839328, 'num_leaves': 55, 'feature_fraction': 

[32m[I 2022-06-21 16:20:08,687][0m Trial 364 finished with value: 0.875883331516265 and parameters: {'lambda_l1': 0.00019884490660158427, 'lambda_l2': 0.0002470395159626294, 'num_leaves': 58, 'feature_fraction': 0.41100059906565123, 'bagging_fraction': 0.476498026460706, 'bagging_freq': 1, 'min_child_samples': 11, 'max_depth': 784}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:20:14,564][0m Trial 365 finished with value: 0.879918400313137 and parameters: {'lambda_l1': 0.00013454609053031004, 'lambda_l2': 5.6620354241866604e-05, 'num_leaves': 16, 'feature_fraction': 0.409270173797347, 'bagging_fraction': 0.45216782502250086, 'bagging_freq': 1, 'min_child_samples': 15, 'max_depth': 823}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:20:21,049][0m Trial 366 finished with value: 0.8764642722258822 and parameters: {'lambda_l1': 7.384321072874452e-05, 'lambda_l2': 0.0015007546871281786, 'num_leaves': 47, 'feature_fraction': 0.4

[32m[I 2022-06-21 16:22:12,544][0m Trial 386 finished with value: 0.883656509726943 and parameters: {'lambda_l1': 0.000181595776294639, 'lambda_l2': 0.0019092578836145601, 'num_leaves': 56, 'feature_fraction': 0.4211667608003263, 'bagging_fraction': 0.4279847436764175, 'bagging_freq': 1, 'min_child_samples': 16, 'max_depth': 874}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:22:18,389][0m Trial 387 finished with value: 0.881338209391228 and parameters: {'lambda_l1': 0.0007313084621190815, 'lambda_l2': 0.0004556784430267891, 'num_leaves': 40, 'feature_fraction': 0.444464470700347, 'bagging_fraction': 0.4568159741826754, 'bagging_freq': 1, 'min_child_samples': 19, 'max_depth': 832}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:22:24,878][0m Trial 388 finished with value: 0.8744881692954448 and parameters: {'lambda_l1': 0.0003194819220065898, 'lambda_l2': 8.429818829190951e-05, 'num_leaves': 110, 'feature_fraction': 0.40805

[32m[I 2022-06-21 16:24:26,009][0m Trial 407 finished with value: 0.8813093860903457 and parameters: {'lambda_l1': 0.0001559705119274011, 'lambda_l2': 2.912580296393889e-05, 'num_leaves': 49, 'feature_fraction': 0.43974907846349265, 'bagging_fraction': 0.47177724519081876, 'bagging_freq': 1, 'min_child_samples': 21, 'max_depth': 852}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:24:32,796][0m Trial 408 finished with value: 0.8792980045625169 and parameters: {'lambda_l1': 0.00044716872487764734, 'lambda_l2': 0.0005414503718799589, 'num_leaves': 69, 'feature_fraction': 0.42329468521276276, 'bagging_fraction': 0.4609659155718737, 'bagging_freq': 1, 'min_child_samples': 12, 'max_depth': 825}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:24:39,460][0m Trial 409 finished with value: 0.8780072399865355 and parameters: {'lambda_l1': 0.0002823710053549648, 'lambda_l2': 0.00018639433754600988, 'num_leaves': 33, 'feature_fraction':

[32m[I 2022-06-21 16:26:33,766][0m Trial 428 finished with value: 0.8760069154701506 and parameters: {'lambda_l1': 0.00018616897031614155, 'lambda_l2': 0.0006218108469178078, 'num_leaves': 31, 'feature_fraction': 0.4226192928663138, 'bagging_fraction': 0.4138852371770448, 'bagging_freq': 1, 'min_child_samples': 21, 'max_depth': 970}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:26:45,870][0m Trial 429 finished with value: 0.865929099495276 and parameters: {'lambda_l1': 0.0007777179651516396, 'lambda_l2': 6.019226485390526e-05, 'num_leaves': 57, 'feature_fraction': 0.40902717870025423, 'bagging_fraction': 0.5001134908464107, 'bagging_freq': 7, 'min_child_samples': 19, 'max_depth': 921}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:26:54,027][0m Trial 430 finished with value: 0.8804986571735798 and parameters: {'lambda_l1': 0.0003703804906812325, 'lambda_l2': 3.847506279793461e-05, 'num_leaves': 69, 'feature_fraction': 0.4

[32m[I 2022-06-21 16:29:30,885][0m Trial 450 finished with value: 0.8829844590394127 and parameters: {'lambda_l1': 5.553263541382837e-05, 'lambda_l2': 0.0010860916988087734, 'num_leaves': 22, 'feature_fraction': 0.4142207269556086, 'bagging_fraction': 0.45149112661633495, 'bagging_freq': 1, 'min_child_samples': 14, 'max_depth': 899}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:29:41,754][0m Trial 451 finished with value: 0.8835561930759297 and parameters: {'lambda_l1': 0.0005025289137102649, 'lambda_l2': 0.0034995328677811756, 'num_leaves': 42, 'feature_fraction': 0.44554320025511407, 'bagging_fraction': 0.46831574238564416, 'bagging_freq': 1, 'min_child_samples': 17, 'max_depth': 856}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:29:49,972][0m Trial 452 finished with value: 0.877560623007024 and parameters: {'lambda_l1': 0.0012305072875005484, 'lambda_l2': 0.0008489118221944081, 'num_leaves': 47, 'feature_fraction': 0.

[32m[I 2022-06-21 16:32:18,091][0m Trial 471 finished with value: 0.8610537495485251 and parameters: {'lambda_l1': 0.0003701786403513371, 'lambda_l2': 0.00018089292695456838, 'num_leaves': 43, 'feature_fraction': 0.47126875592502293, 'bagging_fraction': 0.4587047884161032, 'bagging_freq': 1, 'min_child_samples': 9, 'max_depth': 880}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:32:24,764][0m Trial 472 finished with value: 0.8768835244562798 and parameters: {'lambda_l1': 0.00357936857662071, 'lambda_l2': 0.00010601895065003074, 'num_leaves': 20, 'feature_fraction': 0.4001572685512002, 'bagging_fraction': 0.481550253391267, 'bagging_freq': 2, 'min_child_samples': 17, 'max_depth': 817}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:32:27,027][0m Trial 473 finished with value: 0.5 and parameters: {'lambda_l1': 0.0007725371673255903, 'lambda_l2': 0.0005764124912455184, 'num_leaves': 49, 'feature_fraction': 0.43190127768888215,

[32m[I 2022-06-21 16:34:35,991][0m Trial 492 finished with value: 0.8725766492494436 and parameters: {'lambda_l1': 0.0004668619824448935, 'lambda_l2': 0.0008422837619629946, 'num_leaves': 45, 'feature_fraction': 0.42346452451672967, 'bagging_fraction': 0.4788908833376862, 'bagging_freq': 2, 'min_child_samples': 10, 'max_depth': 788}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:34:42,420][0m Trial 493 finished with value: 0.8491881449089267 and parameters: {'lambda_l1': 0.0007867987936307244, 'lambda_l2': 0.00016323598668836035, 'num_leaves': 50, 'feature_fraction': 0.43315615910098987, 'bagging_fraction': 0.9353034383285065, 'bagging_freq': 1, 'min_child_samples': 39, 'max_depth': 929}. Best is trial 205 with value: 0.8875892335648526.[0m
[32m[I 2022-06-21 16:34:49,208][0m Trial 494 finished with value: 0.874788100057837 and parameters: {'lambda_l1': 0.00034288447318799785, 'lambda_l2': 0.00027239853192653307, 'num_leaves': 42, 'feature_fraction': 

In [131]:
# Get best trial based on metric score
trial = study.best_trial

In [132]:
# Print params for best trial
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

    lambda_l1: 9.301631683130897e-05
    lambda_l2: 0.00042116251159990197
    num_leaves: 59
    feature_fraction: 0.4100780643597869
    bagging_fraction: 0.4704677676270611
    bagging_freq: 1
    min_child_samples: 15
    max_depth: 996


In [180]:
# Best Score from HP Opt
trial.values[0]

0.886714470556963

- Aumento de performance considerável em relação ao valor baseline para um LGBM

##### 4.2) PCA Dimension Reduction + Hyper Parameter Optimization with Optuna

In [181]:
# Define objectiva funtion to maximize metric
def objective(trial):
    
    # PCA Parameter Grid
    pca_param = {
        "n_components": trial.suggest_int("n_components", 5, 100)
    }
    # Hyper Parameter Grid
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "n_estimators": trial.suggest_int("max_depth", 100, 1000)
    }
    # Create model pipeline with param trial
    clf = Pipeline([("pca", PCA(**pca_param)),
                    ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])
    # Get CV Metric we want to maximize
    balanced_accuracy = np.mean(cross_val_score(clf, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1))
    
    return balanced_accuracy

In [182]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-06-17 15:30:06,080][0m A new study created in memory with name: no-name-adfd880b-812e-41f4-828a-f1b968db75c6[0m


In [None]:
study.optimize(objective, n_trials=500)

In [26]:
# Get best trial based on metric score
trial = study.best_trial

In [27]:
# Print params for best trial
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

    n_components: 98
    lambda_l1: 4.997984938308335e-06
    lambda_l2: 9.51579276153768e-06
    num_leaves: 44
    feature_fraction: 0.5421454819677247
    bagging_fraction: 0.48235662397332973
    bagging_freq: 4
    min_child_samples: 8
    max_depth: 563


In [28]:
# Best Score from HP Opt
trial.values[0]

0.7134464057614213

- Utilizar redução de dimensionalidade via PCA piorou bastante a performance do modelo

##### 4.3) Features Scaler + Hyper Parameter Optimization with Optuna

In [29]:
# Define objectiva funtion to maximize metric
def objective(trial):
    
    # List scalers to chose from
    scalers = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    # Define scalers
    if scalers == "minmax":
        scaler = MinMaxScaler()
    elif scalers == "standard":
        scaler = StandardScaler()
    else:
        scaler = RobustScaler()
    # LGBM Hyper Parameter Grid
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "n_estimators": trial.suggest_int("max_depth", 100, 1000)
    }
    # Create model pipeline with param trial
    clf = Pipeline([("scaler", scaler),
                    ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])
    # Get CV Metric we want to maximize
    balanced_accuracy = np.mean(cross_val_score(clf, X, y, scoring=metric_scorer, cv=rkf, n_jobs=-1))
    
    return balanced_accuracy

In [30]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-06-07 22:15:09,043][0m A new study created in memory with name: no-name-9d027bd4-11fc-4b2d-8846-da6f6ffb30f6[0m


In [None]:
study.optimize(objective, n_trials=500)

In [32]:
# Get best trial based on metric score
trial = study.best_trial

In [33]:
# Print params for best trial
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

    scalers: robust
    lambda_l1: 4.3223282482835764e-07
    lambda_l2: 3.3796186878776776e-05
    num_leaves: 188
    feature_fraction: 0.650978565866076
    bagging_fraction: 0.9212536490874126
    bagging_freq: 7
    min_child_samples: 36
    max_depth: 956


In [34]:
# Best Score from HP Opt
trial.values[0]

0.8521054248039541

- A adição de um estágio de feature scaling antes do treinamento parece ter ajudado o modelo

##### 4.4) Boruta Feature Selection + Features Scaler + Hyper Parameter Optimization with Optuna

In [275]:
# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=SEED, )

In [276]:
feat_selector.fit(np.array(X), y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	635
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	635
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	635
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	635
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	635
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	635
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	635
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	50
Rejected: 	585
Iteration: 	9 / 100
Confirmed: 	10
Tentative: 	40
Rejected: 	585
Iteration: 	10 / 100
Confirmed: 	10
Tentative: 	40
Rejected: 	585
Iteration: 	11 / 100
Confirmed: 	10
Tentative: 	40
Rejected: 	585
Iteration: 	12 / 100
Confirmed: 	12
Tentative: 	32
Rejected: 	591
Iteration: 	13 / 100
Confirmed: 	12
Tentative: 	32
Rejected: 	591
Iteration: 	14 / 100
Confirmed: 	12
Tentative: 	32
Rejected: 	591
Iteration: 	15 / 100
Confirmed: 	12
Tentative: 	32
Rejected: 	591
Iteration: 	16 / 100
Confirmed: 	1

In [277]:
print("Number of selected features: ", feat_selector.n_features_)

Number of selected features:  27


In [278]:
# Best Features (accord to Boruta)
X.columns[feat_selector.support_]

Index(['Freq.1715.32907573994', 'Freq.1793.31292765446',
       'Freq.1984.28695988636', 'Freq.2032.95119529926',
       'Freq.2148.55916353654', 'Freq.2182.52259691583',
       'Freq.2186.33577081196', 'Freq.2242.07973381149',
       'Freq.2761.81291676166', 'Freq.3044.21704373186',
       'Freq.3077.77601350217', 'Freq.3414.71158220371',
       'Freq.3425.70211639867', 'Freq.3912.82100942603',
       'Freq.4266.3135397872', 'Freq.4282.69712175929',
       'Freq.4305.85988898402', 'Freq.4318.14922038936',
       'Freq.4395.12541812139', 'Freq.4773.1748593189',
       'Freq.4823.05474215093', 'Freq.5085.12753419191',
       'Freq.5224.39772946441', 'Freq.5433.51287445961',
       'Freq.6079.05181901815', 'Freq.7738.28945568542',
       'Freq.8943.76551923189'],
      dtype='object')

In [279]:
# Filter most importante features
X_transform = feat_selector.transform(np.array(X))

In [280]:
# Define objectiva funtion to maximize metric
def objective(trial):
    
    # List scalers to chose from
    scalers = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    # Define scalers
    if scalers == "minmax":
        scaler = MinMaxScaler()
    elif scalers == "standard":
        scaler = StandardScaler()
    else:
        scaler = RobustScaler()
    # LGBM Hyper Parameter Grid
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "n_estimators": trial.suggest_int("max_depth", 100, 1000)
    }
    # Create model pipeline with param trial
    clf = Pipeline([("scaler", scaler),
                    ("lgbm_clf", lgb.LGBMClassifier(random_state=SEED, **param))])
    # Get CV Metric we want to maximize
    balanced_accuracy = np.mean(cross_val_score(clf, X_transform, y, scoring=metric_scorer, cv=rkf, n_jobs=-1))
    
    return balanced_accuracy

In [281]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-06-22 12:23:24,685][0m A new study created in memory with name: no-name-9537cfd1-f771-4be9-84f2-c5085aacaac4[0m


In [282]:
study.optimize(objective, n_trials=500)

[32m[I 2022-06-22 12:23:33,031][0m Trial 0 finished with value: 0.5 and parameters: {'scalers': 'minmax', 'lambda_l1': 0.00010650389196195333, 'lambda_l2': 0.5284911398502122, 'num_leaves': 91, 'feature_fraction': 0.487892320827369, 'bagging_fraction': 0.928213850523036, 'bagging_freq': 4, 'min_child_samples': 68, 'max_depth': 116}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-06-22 12:23:33,706][0m Trial 1 finished with value: 0.5 and parameters: {'scalers': 'standard', 'lambda_l1': 0.00024207495676854067, 'lambda_l2': 1.9141138837209378e-06, 'num_leaves': 171, 'feature_fraction': 0.45548449794622425, 'bagging_fraction': 0.4576570690947096, 'bagging_freq': 5, 'min_child_samples': 84, 'max_depth': 244}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-06-22 12:23:34,257][0m Trial 2 finished with value: 0.5 and parameters: {'scalers': 'minmax', 'lambda_l1': 0.00028470085166451916, 'lambda_l2': 1.227952984016988e-07, 'num_leaves': 99, 'feature_fraction': 0.9874417190432293, 'ba

[32m[I 2022-06-22 12:24:27,689][0m Trial 21 finished with value: 0.8592067886104263 and parameters: {'scalers': 'standard', 'lambda_l1': 2.4065039926722546e-05, 'lambda_l2': 1.570508061367828e-05, 'num_leaves': 202, 'feature_fraction': 0.9917645229352783, 'bagging_fraction': 0.6831882741343556, 'bagging_freq': 4, 'min_child_samples': 28, 'max_depth': 725}. Best is trial 5 with value: 0.8710019716910196.[0m
[32m[I 2022-06-22 12:24:32,871][0m Trial 22 finished with value: 0.8713779969815186 and parameters: {'scalers': 'standard', 'lambda_l1': 7.304353833037338e-06, 'lambda_l2': 6.291546369282907e-08, 'num_leaves': 222, 'feature_fraction': 0.9392423667211711, 'bagging_fraction': 0.6760225543292433, 'bagging_freq': 3, 'min_child_samples': 19, 'max_depth': 833}. Best is trial 22 with value: 0.8713779969815186.[0m
[32m[I 2022-06-22 12:24:38,923][0m Trial 23 finished with value: 0.8747459322583161 and parameters: {'scalers': 'standard', 'lambda_l1': 0.0011259192845105394, 'lambda_l2':

[32m[I 2022-06-22 12:26:09,156][0m Trial 42 finished with value: 0.8681394836673474 and parameters: {'scalers': 'minmax', 'lambda_l1': 0.00042898405555965034, 'lambda_l2': 3.0771362823084766e-08, 'num_leaves': 183, 'feature_fraction': 0.8792173213292823, 'bagging_fraction': 0.539665647382743, 'bagging_freq': 1, 'min_child_samples': 9, 'max_depth': 880}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:26:14,973][0m Trial 43 finished with value: 0.8737881053629891 and parameters: {'scalers': 'minmax', 'lambda_l1': 0.0008016617685338747, 'lambda_l2': 5.705124892639596e-07, 'num_leaves': 156, 'feature_fraction': 0.9119443593512041, 'bagging_fraction': 0.5894490661510684, 'bagging_freq': 2, 'min_child_samples': 13, 'max_depth': 936}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:26:20,083][0m Trial 44 finished with value: 0.8752180070531463 and parameters: {'scalers': 'minmax', 'lambda_l1': 0.0001956553712468332, 'lambda_l2': 1.981

[32m[I 2022-06-22 12:27:43,290][0m Trial 62 finished with value: 0.8708441697191697 and parameters: {'scalers': 'robust', 'lambda_l1': 0.004916108164899716, 'lambda_l2': 1.142078775121887e-07, 'num_leaves': 156, 'feature_fraction': 0.8708465328314912, 'bagging_fraction': 0.5868893451991998, 'bagging_freq': 1, 'min_child_samples': 19, 'max_depth': 921}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:27:48,226][0m Trial 63 finished with value: 0.8688115109864334 and parameters: {'scalers': 'robust', 'lambda_l1': 0.05808091627402084, 'lambda_l2': 3.266790532452957e-07, 'num_leaves': 188, 'feature_fraction': 0.8451196666373337, 'bagging_fraction': 0.5452725959472474, 'bagging_freq': 1, 'min_child_samples': 10, 'max_depth': 838}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:27:54,689][0m Trial 64 finished with value: 0.8684830714521119 and parameters: {'scalers': 'robust', 'lambda_l1': 0.001475220523645136, 'lambda_l2': 6.9702027

[32m[I 2022-06-22 12:29:28,795][0m Trial 83 finished with value: 0.8788550160924388 and parameters: {'scalers': 'standard', 'lambda_l1': 0.0009984593970234243, 'lambda_l2': 7.591205265253566e-08, 'num_leaves': 162, 'feature_fraction': 0.8503663106899272, 'bagging_fraction': 0.5352040781193108, 'bagging_freq': 1, 'min_child_samples': 10, 'max_depth': 882}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:29:34,579][0m Trial 84 finished with value: 0.8733723577334258 and parameters: {'scalers': 'standard', 'lambda_l1': 0.0005874617198536758, 'lambda_l2': 3.557389110981414e-07, 'num_leaves': 183, 'feature_fraction': 0.83975580450065, 'bagging_fraction': 0.5370577976793572, 'bagging_freq': 1, 'min_child_samples': 10, 'max_depth': 952}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:29:40,240][0m Trial 85 finished with value: 0.8686329806674233 and parameters: {'scalers': 'standard', 'lambda_l1': 0.0010192521360826718, 'lambda_l2': 7

[32m[I 2022-06-22 12:31:00,267][0m Trial 103 finished with value: 0.8734872313144372 and parameters: {'scalers': 'standard', 'lambda_l1': 0.0025027640575862515, 'lambda_l2': 1.6488413168823194e-06, 'num_leaves': 146, 'feature_fraction': 0.4662615723370299, 'bagging_fraction': 0.5135807458990556, 'bagging_freq': 3, 'min_child_samples': 10, 'max_depth': 924}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:31:02,817][0m Trial 104 finished with value: 0.8793289634127484 and parameters: {'scalers': 'standard', 'lambda_l1': 0.0282533576854392, 'lambda_l2': 8.944620128751643e-08, 'num_leaves': 155, 'feature_fraction': 0.48996670085494004, 'bagging_fraction': 0.5357822422030867, 'bagging_freq': 3, 'min_child_samples': 12, 'max_depth': 327}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:31:05,458][0m Trial 105 finished with value: 0.8692502265751103 and parameters: {'scalers': 'standard', 'lambda_l1': 0.0945856077579523, 'lambda_l2': 

[32m[I 2022-06-22 12:32:06,833][0m Trial 123 finished with value: 0.8724082177067858 and parameters: {'scalers': 'standard', 'lambda_l1': 0.0023326744552785565, 'lambda_l2': 3.418189263738051e-06, 'num_leaves': 160, 'feature_fraction': 0.5908119850425395, 'bagging_fraction': 0.5272769549484085, 'bagging_freq': 3, 'min_child_samples': 18, 'max_depth': 562}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:32:09,556][0m Trial 124 finished with value: 0.8643864547260444 and parameters: {'scalers': 'standard', 'lambda_l1': 0.00312421174179364, 'lambda_l2': 1.495332466781988e-06, 'num_leaves': 169, 'feature_fraction': 0.6030274613653532, 'bagging_fraction': 0.579293067121224, 'bagging_freq': 3, 'min_child_samples': 28, 'max_depth': 509}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:32:13,496][0m Trial 125 finished with value: 0.8777167114790955 and parameters: {'scalers': 'standard', 'lambda_l1': 0.007092149776836769, 'lambda_l2': 

[32m[I 2022-06-22 12:33:31,375][0m Trial 143 finished with value: 0.8776563271180144 and parameters: {'scalers': 'standard', 'lambda_l1': 0.0032170216058973575, 'lambda_l2': 0.00033819754045059415, 'num_leaves': 169, 'feature_fraction': 0.5560626148630541, 'bagging_fraction': 0.5185643379967751, 'bagging_freq': 3, 'min_child_samples': 11, 'max_depth': 689}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:33:36,513][0m Trial 144 finished with value: 0.8775680476429314 and parameters: {'scalers': 'standard', 'lambda_l1': 0.001072727430778094, 'lambda_l2': 0.001402497528447426, 'num_leaves': 168, 'feature_fraction': 0.5383205530755695, 'bagging_fraction': 0.5051322195747276, 'bagging_freq': 3, 'min_child_samples': 6, 'max_depth': 695}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:33:41,411][0m Trial 145 finished with value: 0.8757740915989756 and parameters: {'scalers': 'standard', 'lambda_l1': 0.0008984569339613656, 'lambda_l2'

[32m[I 2022-06-22 12:35:06,408][0m Trial 163 finished with value: 0.8816434446831118 and parameters: {'scalers': 'standard', 'lambda_l1': 4.0798914178225835e-06, 'lambda_l2': 0.003548747600347742, 'num_leaves': 240, 'feature_fraction': 0.40348991112322957, 'bagging_fraction': 0.42896615808546773, 'bagging_freq': 3, 'min_child_samples': 9, 'max_depth': 610}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:35:10,511][0m Trial 164 finished with value: 0.8797804489112537 and parameters: {'scalers': 'standard', 'lambda_l1': 1.822414185711631e-06, 'lambda_l2': 0.0035279446065690613, 'num_leaves': 255, 'feature_fraction': 0.40639377427558904, 'bagging_fraction': 0.4325650149410472, 'bagging_freq': 3, 'min_child_samples': 12, 'max_depth': 615}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:35:14,105][0m Trial 165 finished with value: 0.8822216784333656 and parameters: {'scalers': 'standard', 'lambda_l1': 1.1905081794180516e-06, 'lambd

[32m[I 2022-06-22 12:36:21,430][0m Trial 183 finished with value: 0.87593984507065 and parameters: {'scalers': 'standard', 'lambda_l1': 0.00018168208572779486, 'lambda_l2': 0.0002179649272508385, 'num_leaves': 241, 'feature_fraction': 0.4455800256475023, 'bagging_fraction': 0.4382882384495033, 'bagging_freq': 3, 'min_child_samples': 10, 'max_depth': 633}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:36:22,274][0m Trial 184 finished with value: 0.5 and parameters: {'scalers': 'standard', 'lambda_l1': 0.00010005243994162052, 'lambda_l2': 6.84581282090692e-05, 'num_leaves': 246, 'feature_fraction': 0.43630569783178624, 'bagging_fraction': 0.4114971509773171, 'bagging_freq': 3, 'min_child_samples': 98, 'max_depth': 411}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:36:26,162][0m Trial 185 finished with value: 0.8790320101510117 and parameters: {'scalers': 'standard', 'lambda_l1': 0.00028777843862436446, 'lambda_l2': 0.00060026

[32m[I 2022-06-22 12:37:34,697][0m Trial 203 finished with value: 0.8777268078928294 and parameters: {'scalers': 'standard', 'lambda_l1': 0.0005892100737320585, 'lambda_l2': 0.0003942964526655976, 'num_leaves': 256, 'feature_fraction': 0.7755088526483217, 'bagging_fraction': 0.41663207987280365, 'bagging_freq': 3, 'min_child_samples': 12, 'max_depth': 556}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:37:38,745][0m Trial 204 finished with value: 0.8820022878308482 and parameters: {'scalers': 'standard', 'lambda_l1': 5.329896732365912e-05, 'lambda_l2': 0.00315560164316693, 'num_leaves': 241, 'feature_fraction': 0.4399706567820615, 'bagging_fraction': 0.45631993756335715, 'bagging_freq': 3, 'min_child_samples': 14, 'max_depth': 627}. Best is trial 31 with value: 0.8836487496206924.[0m
[32m[I 2022-06-22 12:37:42,583][0m Trial 205 finished with value: 0.880019134352919 and parameters: {'scalers': 'standard', 'lambda_l1': 5.4906573248575276e-05, 'lambda_l

[32m[I 2022-06-22 12:39:00,651][0m Trial 223 finished with value: 0.8763821645975283 and parameters: {'scalers': 'standard', 'lambda_l1': 1.1523541064789146e-05, 'lambda_l2': 0.003244028690506324, 'num_leaves': 225, 'feature_fraction': 0.43604278765547744, 'bagging_fraction': 0.42784328622881257, 'bagging_freq': 3, 'min_child_samples': 5, 'max_depth': 630}. Best is trial 212 with value: 0.8846667409173213.[0m
[32m[I 2022-06-22 12:39:04,824][0m Trial 224 finished with value: 0.8773171267801685 and parameters: {'scalers': 'standard', 'lambda_l1': 7.563657817951725e-06, 'lambda_l2': 0.001535307358969394, 'num_leaves': 235, 'feature_fraction': 0.44695306489549863, 'bagging_fraction': 0.43993372005694764, 'bagging_freq': 7, 'min_child_samples': 8, 'max_depth': 594}. Best is trial 212 with value: 0.8846667409173213.[0m
[32m[I 2022-06-22 12:39:09,328][0m Trial 225 finished with value: 0.8721915915826518 and parameters: {'scalers': 'standard', 'lambda_l1': 2.874574836562151e-05, 'lambd

[32m[I 2022-06-22 12:40:24,883][0m Trial 243 finished with value: 0.883723163661244 and parameters: {'scalers': 'minmax', 'lambda_l1': 9.155410641950171e-06, 'lambda_l2': 0.0068885141857731565, 'num_leaves': 217, 'feature_fraction': 0.41070276731679234, 'bagging_fraction': 0.4103717291614584, 'bagging_freq': 3, 'min_child_samples': 10, 'max_depth': 670}. Best is trial 212 with value: 0.8846667409173213.[0m
[32m[I 2022-06-22 12:40:29,030][0m Trial 244 finished with value: 0.8820298869731918 and parameters: {'scalers': 'minmax', 'lambda_l1': 8.692555617096305e-06, 'lambda_l2': 0.007123718436064789, 'num_leaves': 219, 'feature_fraction': 0.40953784430982, 'bagging_fraction': 0.40967114975431634, 'bagging_freq': 3, 'min_child_samples': 11, 'max_depth': 659}. Best is trial 212 with value: 0.8846667409173213.[0m
[32m[I 2022-06-22 12:40:33,393][0m Trial 245 finished with value: 0.8831657031479012 and parameters: {'scalers': 'minmax', 'lambda_l1': 3.7440072489947992e-06, 'lambda_l2': 0

[32m[I 2022-06-22 12:41:51,684][0m Trial 263 finished with value: 0.8834765138757011 and parameters: {'scalers': 'minmax', 'lambda_l1': 4.9353936958893845e-06, 'lambda_l2': 0.11549936676953122, 'num_leaves': 221, 'feature_fraction': 0.42181176868926623, 'bagging_fraction': 0.42149691462664624, 'bagging_freq': 3, 'min_child_samples': 11, 'max_depth': 733}. Best is trial 212 with value: 0.8846667409173213.[0m
[32m[I 2022-06-22 12:41:56,108][0m Trial 264 finished with value: 0.8764710672239697 and parameters: {'scalers': 'minmax', 'lambda_l1': 4.7724216305378966e-06, 'lambda_l2': 0.1262066396773385, 'num_leaves': 217, 'feature_fraction': 0.4163250419004575, 'bagging_fraction': 0.40043966494528344, 'bagging_freq': 3, 'min_child_samples': 11, 'max_depth': 743}. Best is trial 212 with value: 0.8846667409173213.[0m
[32m[I 2022-06-22 12:42:00,538][0m Trial 265 finished with value: 0.8819842664430513 and parameters: {'scalers': 'minmax', 'lambda_l1': 2.5802934769963756e-06, 'lambda_l2':

[32m[I 2022-06-22 12:43:16,325][0m Trial 283 finished with value: 0.8697712020306834 and parameters: {'scalers': 'minmax', 'lambda_l1': 3.644926651367471e-06, 'lambda_l2': 0.07293207198193219, 'num_leaves': 71, 'feature_fraction': 0.4005115299838523, 'bagging_fraction': 0.42660993116823537, 'bagging_freq': 3, 'min_child_samples': 17, 'max_depth': 656}. Best is trial 212 with value: 0.8846667409173213.[0m
[32m[I 2022-06-22 12:43:21,262][0m Trial 284 finished with value: 0.8762632418536055 and parameters: {'scalers': 'minmax', 'lambda_l1': 9.68530877192183e-06, 'lambda_l2': 0.03312532378133076, 'num_leaves': 223, 'feature_fraction': 0.4232919262132287, 'bagging_fraction': 0.413252910085033, 'bagging_freq': 3, 'min_child_samples': 9, 'max_depth': 762}. Best is trial 212 with value: 0.8846667409173213.[0m
[32m[I 2022-06-22 12:43:22,287][0m Trial 285 finished with value: 0.5 and parameters: {'scalers': 'minmax', 'lambda_l1': 1.3597027723578746e-06, 'lambda_l2': 0.2050717779013016, '

[32m[I 2022-06-22 12:44:58,708][0m Trial 303 finished with value: 0.8786520215680433 and parameters: {'scalers': 'minmax', 'lambda_l1': 5.340062507668073e-06, 'lambda_l2': 0.155132939377533, 'num_leaves': 196, 'feature_fraction': 0.6547990675239339, 'bagging_fraction': 0.4187980390636136, 'bagging_freq': 3, 'min_child_samples': 8, 'max_depth': 888}. Best is trial 212 with value: 0.8846667409173213.[0m
[32m[I 2022-06-22 12:45:02,751][0m Trial 304 finished with value: 0.8682608600549777 and parameters: {'scalers': 'minmax', 'lambda_l1': 1.699653866091738e-05, 'lambda_l2': 0.42468031151283214, 'num_leaves': 220, 'feature_fraction': 0.4350990068184032, 'bagging_fraction': 0.4005819889698662, 'bagging_freq': 4, 'min_child_samples': 17, 'max_depth': 641}. Best is trial 212 with value: 0.8846667409173213.[0m
[32m[I 2022-06-22 12:45:09,426][0m Trial 305 finished with value: 0.8811905131117591 and parameters: {'scalers': 'minmax', 'lambda_l1': 9.382751133596857e-06, 'lambda_l2': 0.05995

[32m[I 2022-06-22 12:46:36,775][0m Trial 323 finished with value: 0.874757685125332 and parameters: {'scalers': 'minmax', 'lambda_l1': 2.262808188013875e-06, 'lambda_l2': 0.005269175536920084, 'num_leaves': 218, 'feature_fraction': 0.4253664944291647, 'bagging_fraction': 0.42291916316097206, 'bagging_freq': 3, 'min_child_samples': 16, 'max_depth': 692}. Best is trial 212 with value: 0.8846667409173213.[0m
[32m[I 2022-06-22 12:46:40,863][0m Trial 324 finished with value: 0.8860457100948587 and parameters: {'scalers': 'minmax', 'lambda_l1': 4.75261507145719e-06, 'lambda_l2': 0.011742802261727603, 'num_leaves': 241, 'feature_fraction': 0.4109969027768354, 'bagging_fraction': 0.4003975536156305, 'bagging_freq': 3, 'min_child_samples': 11, 'max_depth': 655}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:46:45,536][0m Trial 325 finished with value: 0.8825576095516109 and parameters: {'scalers': 'minmax', 'lambda_l1': 4.464824147385123e-06, 'lambda_l2': 0.0

[32m[I 2022-06-22 12:48:06,163][0m Trial 343 finished with value: 0.885193542547064 and parameters: {'scalers': 'minmax', 'lambda_l1': 1.7987385986191625e-06, 'lambda_l2': 0.007837746162029742, 'num_leaves': 234, 'feature_fraction': 0.44906761388425287, 'bagging_fraction': 0.44605806175203583, 'bagging_freq': 3, 'min_child_samples': 11, 'max_depth': 641}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:48:09,216][0m Trial 344 finished with value: 0.8652011262731079 and parameters: {'scalers': 'minmax', 'lambda_l1': 1.3145780196423376, 'lambda_l2': 0.010596239576547444, 'num_leaves': 233, 'feature_fraction': 0.4529423104461867, 'bagging_fraction': 0.44257898171572796, 'bagging_freq': 3, 'min_child_samples': 11, 'max_depth': 645}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:48:12,845][0m Trial 345 finished with value: 0.8756159732725135 and parameters: {'scalers': 'minmax', 'lambda_l1': 1.6077445551017187e-06, 'lambda_l2': 0

[32m[I 2022-06-22 12:49:39,164][0m Trial 363 finished with value: 0.8808055827686246 and parameters: {'scalers': 'minmax', 'lambda_l1': 1.3602799292298448e-06, 'lambda_l2': 0.009860531057247105, 'num_leaves': 232, 'feature_fraction': 0.8090189526019089, 'bagging_fraction': 0.41256841861663474, 'bagging_freq': 3, 'min_child_samples': 10, 'max_depth': 635}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:49:44,884][0m Trial 364 finished with value: 0.8769212410181837 and parameters: {'scalers': 'minmax', 'lambda_l1': 0.0013388348025826712, 'lambda_l2': 0.00566809049110198, 'num_leaves': 242, 'feature_fraction': 0.431083930218379, 'bagging_fraction': 0.4284291147481357, 'bagging_freq': 4, 'min_child_samples': 7, 'max_depth': 854}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:49:49,043][0m Trial 365 finished with value: 0.8796149739493392 and parameters: {'scalers': 'minmax', 'lambda_l1': 0.0026834410933103446, 'lambda_l2': 0.0

[32m[I 2022-06-22 12:51:14,390][0m Trial 383 finished with value: 0.8813669558425751 and parameters: {'scalers': 'minmax', 'lambda_l1': 7.43246086513516e-06, 'lambda_l2': 0.03198250506124418, 'num_leaves': 215, 'feature_fraction': 0.4005821884843612, 'bagging_fraction': 0.41280558004456963, 'bagging_freq': 3, 'min_child_samples': 12, 'max_depth': 703}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:51:18,175][0m Trial 384 finished with value: 0.8781228434448249 and parameters: {'scalers': 'minmax', 'lambda_l1': 2.5689944697433075e-05, 'lambda_l2': 0.005675355374559127, 'num_leaves': 131, 'feature_fraction': 0.4101287534816503, 'bagging_fraction': 0.4204288570107275, 'bagging_freq': 3, 'min_child_samples': 10, 'max_depth': 583}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:51:23,575][0m Trial 385 finished with value: 0.8810548059826311 and parameters: {'scalers': 'minmax', 'lambda_l1': 0.0008092729071714408, 'lambda_l2': 0.

[32m[I 2022-06-22 12:52:40,763][0m Trial 403 finished with value: 0.880622160197044 and parameters: {'scalers': 'minmax', 'lambda_l1': 6.039748814217228e-07, 'lambda_l2': 0.07081047323122279, 'num_leaves': 215, 'feature_fraction': 0.41523517302887114, 'bagging_fraction': 0.4336495409442528, 'bagging_freq': 2, 'min_child_samples': 7, 'max_depth': 715}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:52:45,944][0m Trial 404 finished with value: 0.879525139162523 and parameters: {'scalers': 'minmax', 'lambda_l1': 0.001301324268739333, 'lambda_l2': 0.009998874000345616, 'num_leaves': 234, 'feature_fraction': 0.44195455407399337, 'bagging_fraction': 0.4003995434083204, 'bagging_freq': 3, 'min_child_samples': 13, 'max_depth': 934}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:52:50,193][0m Trial 405 finished with value: 0.8756150638577108 and parameters: {'scalers': 'minmax', 'lambda_l1': 3.7081444034990286e-06, 'lambda_l2': 0.04

[32m[I 2022-06-22 12:54:06,510][0m Trial 423 finished with value: 0.8737359527727177 and parameters: {'scalers': 'minmax', 'lambda_l1': 1.0151184010932386e-05, 'lambda_l2': 0.0020143240895745293, 'num_leaves': 242, 'feature_fraction': 0.4100711149217103, 'bagging_fraction': 0.4217348873994826, 'bagging_freq': 3, 'min_child_samples': 10, 'max_depth': 610}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:54:08,990][0m Trial 424 finished with value: 0.8667789924428856 and parameters: {'scalers': 'minmax', 'lambda_l1': 1.5954425977148137e-06, 'lambda_l2': 0.002871167731614152, 'num_leaves': 231, 'feature_fraction': 0.42195791797257687, 'bagging_fraction': 0.48990922269470116, 'bagging_freq': 3, 'min_child_samples': 26, 'max_depth': 518}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:54:15,095][0m Trial 425 finished with value: 0.8738322260814521 and parameters: {'scalers': 'minmax', 'lambda_l1': 0.0016372010670453654, 'lambda_l2

[32m[I 2022-06-22 12:55:25,378][0m Trial 444 finished with value: 0.8817689606789838 and parameters: {'scalers': 'minmax', 'lambda_l1': 2.807377985791381e-06, 'lambda_l2': 1.3767166736272074e-08, 'num_leaves': 228, 'feature_fraction': 0.4454500640407263, 'bagging_fraction': 0.4382327166990512, 'bagging_freq': 3, 'min_child_samples': 5, 'max_depth': 632}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:55:29,154][0m Trial 445 finished with value: 0.8767286242094678 and parameters: {'scalers': 'minmax', 'lambda_l1': 8.345354091838188e-06, 'lambda_l2': 2.71799957118158e-08, 'num_leaves': 234, 'feature_fraction': 0.4653156746873899, 'bagging_fraction': 0.6801092215665016, 'bagging_freq': 4, 'min_child_samples': 7, 'max_depth': 598}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:55:33,946][0m Trial 446 finished with value: 0.8796720529212788 and parameters: {'scalers': 'minmax', 'lambda_l1': 4.479831307382189e-06, 'lambda_l2': 0.

[32m[I 2022-06-22 12:56:53,651][0m Trial 464 finished with value: 0.8780635506134732 and parameters: {'scalers': 'minmax', 'lambda_l1': 3.338745988830862e-05, 'lambda_l2': 0.0031307383510765204, 'num_leaves': 236, 'feature_fraction': 0.45255452765296345, 'bagging_fraction': 0.6677060648585293, 'bagging_freq': 3, 'min_child_samples': 15, 'max_depth': 707}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:56:58,347][0m Trial 465 finished with value: 0.8807520680858528 and parameters: {'scalers': 'minmax', 'lambda_l1': 5.4079394366272204e-05, 'lambda_l2': 0.005045734507853339, 'num_leaves': 229, 'feature_fraction': 0.4204983486464603, 'bagging_fraction': 0.48148066837281733, 'bagging_freq': 3, 'min_child_samples': 12, 'max_depth': 737}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:57:02,882][0m Trial 466 finished with value: 0.8711710729930546 and parameters: {'scalers': 'minmax', 'lambda_l1': 1.7868531329506287e-05, 'lambda_l2

[32m[I 2022-06-22 12:58:17,781][0m Trial 484 finished with value: 0.8770583290118894 and parameters: {'scalers': 'minmax', 'lambda_l1': 1.433626564102245e-05, 'lambda_l2': 0.003910273830979929, 'num_leaves': 248, 'feature_fraction': 0.41947518984840815, 'bagging_fraction': 0.43121228729720423, 'bagging_freq': 1, 'min_child_samples': 15, 'max_depth': 553}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:58:22,021][0m Trial 485 finished with value: 0.8774274724243765 and parameters: {'scalers': 'minmax', 'lambda_l1': 1.885458127913055e-06, 'lambda_l2': 0.005236252891708598, 'num_leaves': 241, 'feature_fraction': 0.8173470733853108, 'bagging_fraction': 0.4490773992044536, 'bagging_freq': 3, 'min_child_samples': 12, 'max_depth': 609}. Best is trial 324 with value: 0.8860457100948587.[0m
[32m[I 2022-06-22 12:58:27,383][0m Trial 486 finished with value: 0.8722169933609559 and parameters: {'scalers': 'minmax', 'lambda_l1': 8.115259976526958e-06, 'lambda_l2': 

In [284]:
# Get best trial based on metric score
trial = study.best_trial

In [285]:
# Print params for best trial
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

    scalers: minmax
    lambda_l1: 4.75261507145719e-06
    lambda_l2: 0.011742802261727603
    num_leaves: 241
    feature_fraction: 0.4109969027768354
    bagging_fraction: 0.4003975536156305
    bagging_freq: 3
    min_child_samples: 11
    max_depth: 655


In [286]:
# Best Score from HP Opt
trial.values[0]

0.8860457100948587

In [287]:
gbm_opt = lgb.LGBMClassifier(random_state=SEED, **trial.params)

- Pré selecionar as features mais importantes auxiliou a obter uma performance ainda melhor a partir do pipeline com Robust Scaler e LightGBM

In [288]:
X_transform_test = feat_selector.transform(np.array(X_test))

In [289]:
X_transform_test.shape

(64, 27)

In [290]:
gbm_opt.fit(X_transform,y)



In [291]:
y_pred = gbm_opt.predict(X_transform_test)

In [292]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        MILD       0.67      0.74      0.70        42
      SEVERE       0.39      0.32      0.35        22

    accuracy                           0.59        64
   macro avg       0.53      0.53      0.53        64
weighted avg       0.58      0.59      0.58        64



### Testing other algorithms 

In [233]:
X_test, y_test = test_mod.drop("Group", axis=1), test_mod["Group"]
X, y = train_mod.drop("Group", axis=1), train_mod["Group"]

###### Get most important features from boruta

In [235]:
# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=SEED, )
feat_selector.fit(np.array(X), y)
print("Number of selected features: ", feat_selector.n_features_)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	635
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	635
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	635
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	635
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	635
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	635
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	635
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	50
Rejected: 	585
Iteration: 	9 / 100
Confirmed: 	10
Tentative: 	40
Rejected: 	585
Iteration: 	10 / 100
Confirmed: 	10
Tentative: 	40
Rejected: 	585
Iteration: 	11 / 100
Confirmed: 	10
Tentative: 	40
Rejected: 	585
Iteration: 	12 / 100
Confirmed: 	12
Tentative: 	32
Rejected: 	591
Iteration: 	13 / 100
Confirmed: 	12
Tentative: 	32
Rejected: 	591
Iteration: 	14 / 100
Confirmed: 	12
Tentative: 	32
Rejected: 	591
Iteration: 	15 / 100
Confirmed: 	12
Tentative: 	32
Rejected: 	591
Iteration: 	16 / 100
Confirmed: 	1

In [236]:
# Filter most importante features
X_transform = feat_selector.transform(np.array(X))
X_transform_test = feat_selector.transform(np.array(X_test))

In [237]:
gbc = GradientBoostingClassifier(random_state = SEED)

In [238]:
parameters = {'max_depth':list(range(1,100,1)), 'subsample':list(np.arange(0.05,1,0.05)),
             'learning_rate':list(np.arange(0.05,1,0.05)), 'n_estimators':list(range(50,500,25)), }

In [239]:
rscv_gbc = RandomizedSearchCV(gbc, parameters, scoring=metric_scorer, n_jobs = -1, verbose = 4, n_iter = 200)

In [240]:
scores_gbc = rscv_gbc.fit(X,y)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [241]:
scores_gbc.best_score_

0.8336363636363636

In [245]:
y_pred = scores_gbc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        MILD       0.65      0.57      0.61        42
      SEVERE       0.33      0.41      0.37        22

    accuracy                           0.52        64
   macro avg       0.49      0.49      0.49        64
weighted avg       0.54      0.52      0.53        64



In [246]:
scores_gbc_boruta = rscv_gbc.fit(X_transform, y)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [247]:
scores_gbc_boruta.best_score_

0.8705681818181817

In [248]:
scores_gbc_boruta.best_params_

{'subsample': 0.9000000000000001,
 'n_estimators': 200,
 'max_depth': 6,
 'learning_rate': 0.2}

In [250]:
y_pred = scores_gbc_boruta.predict(X_transform_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        MILD       0.63      0.74      0.68        42
      SEVERE       0.27      0.18      0.22        22

    accuracy                           0.55        64
   macro avg       0.45      0.46      0.45        64
weighted avg       0.51      0.55      0.52        64



In [251]:
lgbm = lgb.LGBMClassifier(random_state=SEED)
parameters = {'num_leaves':list(range(20,100,5)), 'min_child_samples':list(range(5,30,5)),'max_depth':list(range(-1,20,1)),
             'learning_rate':list(np.arange(0.05,1,0.05)),'reg_alpha':list(np.arange(0,0.6, 0.1))}
rscv_lgbm = RandomizedSearchCV(lgbm, parameters, scoring=metric_scorer, n_jobs = -1, verbose = 4, n_iter = 200)

In [259]:
scores_lgbm = rscv_lgbm.fit(X, y)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


Unnamed: 0_level_0,Freq.1198.05707939985,Freq.1204.07148226988,Freq.1211.25531677913,Freq.1217.79534957812,Freq.1223.24180676615,Freq.1234.42874922858,Freq.1239.04873179697,Freq.1243.98934968702,Freq.1249.10239635402,Freq.1254.7097579948,...,Freq.8812.07967315325,Freq.8943.76551923189,Freq.9058.85825530971,Freq.9098.58510797401,Freq.9437.74469644083,Freq.9593.90405666006,Freq.9799.842201746,Freq.10432.4853106264,Freq.11006.9514551194,Freq.11161.31855876
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000319,0.000416,0.000077,4.034140e-05,0.000089,0.000282,0.000004,0.000130,0.000288,0.000000e+00,...,0.000004,0.000017,0.000005,0.000020,0.000029,0.000037,0.000024,0.000016,0.000030,3.036588e-05
10,0.000000,0.000000,0.000021,1.064014e-03,0.000113,0.001591,0.016310,0.002330,0.000476,6.074661e-04,...,0.000000,0.000040,0.000010,0.000001,0.000005,0.000002,0.000052,0.000003,0.000155,1.095279e-06
100,0.000199,0.000006,0.000013,5.089552e-08,0.000087,0.000543,0.000231,0.000068,0.000018,1.286428e-04,...,0.000020,0.000009,0.000044,0.000012,0.000067,0.000207,0.000020,0.000027,0.000100,1.003297e-04
101,0.000196,0.000244,0.000069,4.874671e-05,0.000125,0.000353,0.000029,0.000125,0.000120,4.988436e-07,...,0.000027,0.000072,0.000051,0.000013,0.000061,0.000040,0.000008,0.000003,0.000055,2.944864e-05
105,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,...,0.000034,0.000027,0.000031,0.000014,0.000017,0.000028,0.000009,0.000012,0.000026,1.714090e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,0.000010,0.000394,0.000042,1.213255e-04,0.000052,0.000586,0.014572,0.001462,0.000112,2.465139e-04,...,0.000006,0.000003,0.000005,0.000007,0.000003,0.000011,0.000003,0.000016,0.001243,6.332741e-08
87,0.000048,0.000095,0.000039,7.052825e-05,0.000100,0.000121,0.001347,0.000230,0.000061,6.719760e-05,...,0.000000,0.000000,0.000023,0.000054,0.000042,0.000016,0.000023,0.000006,0.000498,0.000000e+00
93,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,...,0.000004,0.000012,0.000016,0.000014,0.000062,0.000028,0.000025,0.000010,0.000178,9.164915e-05
94,0.000000,0.000000,0.000206,1.762211e-04,0.000221,0.000161,0.000160,0.000013,0.000284,1.165049e-04,...,0.000001,0.000006,0.000013,0.000013,0.000016,0.000026,0.000008,0.000025,0.000049,2.309093e-05


In [253]:
scores_lgbm.best_score_

0.8252272727272727

In [261]:
X_test.shape

(64, 635)

In [263]:
y_pred = scores_lgbm.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        MILD       0.67      0.67      0.67        42
      SEVERE       0.36      0.36      0.36        22

    accuracy                           0.56        64
   macro avg       0.52      0.52      0.52        64
weighted avg       0.56      0.56      0.56        64



In [268]:
scores_lgbm_boruta = rscv_lgbm.fit(X_transform, y)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [274]:
scores_lgbm_boruta.best_score_

0.8743181818181818

In [270]:
X_transform_test.shape

(64, 27)

In [272]:
y_pred = scores_lgbm_boruta.predict(X_transform_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        MILD       0.62      0.60      0.61        42
      SEVERE       0.29      0.32      0.30        22

    accuracy                           0.50        64
   macro avg       0.46      0.46      0.46        64
weighted avg       0.51      0.50      0.50        64



In [None]:
svc_poly = svc.LGBMClassifier(random_state=SEED)
parameters = {'num_leaves':list(range(20,100,5)), 'min_child_samples':list(range(5,30,5)),'max_depth':list(range(-1,20,1)),
             'learning_rate':list(np.arange(0.05,1,0.05)),'reg_alpha':list(np.arange(0,0.6, 0.1))}
rscv_lgbm = RandomizedSearchCV(lgbm, parameters, scoring=metric_scorer, n_jobs = -1, verbose = 4, n_iter = 200)