In [1]:
import warnings
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

#supressing warnings for readability
warnings.filterwarnings("ignore")

# To plot pretty figures directly within Jupyter
%matplotlib inline

# pandas options
pd.set_option("display.max.columns", None)
pd.set_option("display.max.rows", None)
pd.set_option("display.precision", 2)

In [2]:
import pickle

with open('../data/var_store/X_resampled.pkl', 'rb') as f:
    X_resampled = pickle.load(f)

with open('../data/var_store/y_resampled.pkl', 'rb') as f:
    y_resampled = pickle.load(f)

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Create a decision tree classifier
tree = DecisionTreeClassifier()

# Train the decision tree classifier with class weighting
tree.fit(X_train, y_train)

# Make predictions on the test data
predictions = tree.predict(X_test)

# Evaluate the model performance
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

       False       0.78      0.76      0.77     70135
        True       0.77      0.79      0.78     70403

    accuracy                           0.77    140538
   macro avg       0.77      0.77      0.77    140538
weighted avg       0.77      0.77      0.77    140538



In [5]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))

0.7743599595838848


In [18]:
import optuna
import gc
import lightgbm as lgb
from optuna import Trial
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

def fitLGBM(trial,X, y):
    
    params={
      'n_estimators':trial.suggest_int('n_estimators', 0, 1000), 
      'num_leaves':trial.suggest_int('num_leaves', 2, 512),
      'max_depth':trial.suggest_int('max_depth', 2, 128),
      'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.15),
      'min_split_gain': trial.suggest_loguniform('min_split_gain', 0.001, 0.1),
      'feature_fraction':trial.suggest_uniform('feature_fraction',0.1, 1.0),
      'bagging_freq':trial.suggest_int('bagging_freq',0.1,10),
      'verbosity': -1,
      'random_state': 42
            }
    kfolds = KFold(n_splits=5, shuffle=True, random_state=42)
    model = LGBMClassifier(**params)
    
    res=[]
    for i, (tdx, vdx) in enumerate(kfolds.split(X, y)):
        X_train, X_valid, y_train, y_valid = X[tdx], X[vdx], y[tdx], y[vdx]
        model.fit(X_train, y_train,
                 eval_set=[(X_train, y_train), (X_valid, y_valid)],
                 early_stopping_rounds=30, verbose=False)
        preds = model.predict_proba(X_valid)
        res.append(roc_auc_score(y_valid, preds[:,1]))
    
    err = np.mean(res)
    
    return model, err

In [19]:
def objective(trial:Trial):
    
    gc.collect()
    models=[]
    validScore=0
   
    model,log = fitLGBM(trial,X_resampled,y_resampled)
    
    models.append(model)
    gc.collect()
    validScore+=log
    validScore/=len(models)
    
    return validScore

In [20]:
study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
study.optimize(objective, timeout=60*60*2)

[I 2023-07-03 13:00:42,167] A new study created in memory with name: no-name-ccc78b99-3769-4835-977b-6e5f43d59d42




[I 2023-07-03 13:05:05,274] Trial 0 finished with value: 0.870200334753946 and parameters: {'n_estimators': 567, 'num_leaves': 393, 'max_depth': 34, 'learning_rate': 0.024496052614050117, 'min_split_gain': 0.016672273427405817, 'feature_fraction': 0.42466694883715683, 'bagging_freq': 10}. Best is trial 0 with value: 0.870200334753946.




[I 2023-07-03 13:10:16,461] Trial 1 finished with value: 0.8882495131389302 and parameters: {'n_estimators': 799, 'num_leaves': 316, 'max_depth': 45, 'learning_rate': 0.09431663160560344, 'min_split_gain': 0.0027046346428031185, 'feature_fraction': 0.22963230135342616, 'bagging_freq': 7}. Best is trial 1 with value: 0.8882495131389302.




[I 2023-07-03 13:15:07,929] Trial 2 finished with value: 0.9092837947810187 and parameters: {'n_estimators': 915, 'num_leaves': 263, 'max_depth': 116, 'learning_rate': 0.02685462401865538, 'min_split_gain': 0.011070081216049299, 'feature_fraction': 0.9557156672738731, 'bagging_freq': 1}. Best is trial 2 with value: 0.9092837947810187.




[I 2023-07-03 13:15:56,722] Trial 3 finished with value: 0.8406688855305818 and parameters: {'n_estimators': 125, 'num_leaves': 207, 'max_depth': 127, 'learning_rate': 0.08462375728388427, 'min_split_gain': 0.036441804364678004, 'feature_fraction': 0.9233599678158428, 'bagging_freq': 7}. Best is trial 2 with value: 0.9092837947810187.




[I 2023-07-03 13:19:31,866] Trial 4 finished with value: 0.8554920707977223 and parameters: {'n_estimators': 977, 'num_leaves': 82, 'max_depth': 62, 'learning_rate': 0.043360403221815245, 'min_split_gain': 0.009054892857518258, 'feature_fraction': 0.5980421537947338, 'bagging_freq': 5}. Best is trial 2 with value: 0.9092837947810187.




[I 2023-07-03 13:24:56,558] Trial 5 finished with value: 0.783281028091008 and parameters: {'n_estimators': 668, 'num_leaves': 285, 'max_depth': 51, 'learning_rate': 0.010243638734552075, 'min_split_gain': 0.07262096433871847, 'feature_fraction': 0.20508924426978292, 'bagging_freq': 0}. Best is trial 2 with value: 0.9092837947810187.




[I 2023-07-03 13:26:00,111] Trial 6 finished with value: 0.7912921867715824 and parameters: {'n_estimators': 230, 'num_leaves': 68, 'max_depth': 99, 'learning_rate': 0.07105160489832119, 'min_split_gain': 0.02063249035565021, 'feature_fraction': 0.6579356244494993, 'bagging_freq': 2}. Best is trial 2 with value: 0.9092837947810187.




[I 2023-07-03 13:28:03,407] Trial 7 finished with value: 0.9010071859948445 and parameters: {'n_estimators': 229, 'num_leaves': 492, 'max_depth': 119, 'learning_rate': 0.04773433255129811, 'min_split_gain': 0.0065908197171353836, 'feature_fraction': 0.8918870732355446, 'bagging_freq': 5}. Best is trial 2 with value: 0.9092837947810187.




[I 2023-07-03 13:29:05,915] Trial 8 finished with value: 0.6621614480466249 and parameters: {'n_estimators': 279, 'num_leaves': 7, 'max_depth': 72, 'learning_rate': 0.0010507323505156844, 'min_split_gain': 0.002030996936924511, 'feature_fraction': 0.18479347531710932, 'bagging_freq': 3}. Best is trial 2 with value: 0.9092837947810187.




[I 2023-07-03 13:31:55,092] Trial 9 finished with value: 0.7118293524346665 and parameters: {'n_estimators': 508, 'num_leaves': 96, 'max_depth': 122, 'learning_rate': 0.0037017562299536478, 'min_split_gain': 0.0011513355901384326, 'feature_fraction': 0.1690926783807124, 'bagging_freq': 0}. Best is trial 2 with value: 0.9092837947810187.




[I 2023-07-03 13:37:00,227] Trial 10 finished with value: 0.8709658672378208 and parameters: {'n_estimators': 999, 'num_leaves': 192, 'max_depth': 87, 'learning_rate': 0.018071444932712642, 'min_split_gain': 0.005116348437865036, 'feature_fraction': 0.9808703308165659, 'bagging_freq': 2}. Best is trial 2 with value: 0.9092837947810187.




[I 2023-07-03 13:39:49,057] Trial 11 finished with value: 0.9044599700847782 and parameters: {'n_estimators': 327, 'num_leaves': 507, 'max_depth': 106, 'learning_rate': 0.03447808532299107, 'min_split_gain': 0.007026468536980811, 'feature_fraction': 0.8241603076305238, 'bagging_freq': 5}. Best is trial 2 with value: 0.9092837947810187.




[I 2023-07-03 13:42:22,951] Trial 12 finished with value: 0.9545355962948342 and parameters: {'n_estimators': 398, 'num_leaves': 467, 'max_depth': 97, 'learning_rate': 0.14211866232745138, 'min_split_gain': 0.012695577575646797, 'feature_fraction': 0.7905187707745103, 'bagging_freq': 8}. Best is trial 12 with value: 0.9545355962948342.




[I 2023-07-03 13:44:00,309] Trial 13 finished with value: 0.7083585763810722 and parameters: {'n_estimators': 765, 'num_leaves': 401, 'max_depth': 2, 'learning_rate': 0.1114004872569563, 'min_split_gain': 0.013667216323111894, 'feature_fraction': 0.7917703150399283, 'bagging_freq': 10}. Best is trial 12 with value: 0.9545355962948342.




[I 2023-07-03 13:47:35,311] Trial 14 finished with value: 0.832373797955043 and parameters: {'n_estimators': 386, 'num_leaves': 393, 'max_depth': 86, 'learning_rate': 0.010673220418015252, 'min_split_gain': 0.027113706883352354, 'feature_fraction': 0.9970335751183358, 'bagging_freq': 8}. Best is trial 12 with value: 0.9545355962948342.




[I 2023-07-03 13:47:50,308] Trial 15 finished with value: 0.7409254790191742 and parameters: {'n_estimators': 16, 'num_leaves': 214, 'max_depth': 103, 'learning_rate': 0.04745503520242492, 'min_split_gain': 0.011665894213231387, 'feature_fraction': 0.7394837902661563, 'bagging_freq': 8}. Best is trial 12 with value: 0.9545355962948342.




[I 2023-07-03 13:50:12,924] Trial 16 finished with value: 0.9460921083251396 and parameters: {'n_estimators': 421, 'num_leaves': 343, 'max_depth': 86, 'learning_rate': 0.13552756993414367, 'min_split_gain': 0.038335116092581216, 'feature_fraction': 0.8526178355406715, 'bagging_freq': 3}. Best is trial 12 with value: 0.9545355962948342.




[I 2023-07-03 13:52:51,998] Trial 17 finished with value: 0.9482618901888825 and parameters: {'n_estimators': 427, 'num_leaves': 444, 'max_depth': 80, 'learning_rate': 0.13385659522612686, 'min_split_gain': 0.04829318158369396, 'feature_fraction': 0.6977891561561549, 'bagging_freq': 3}. Best is trial 12 with value: 0.9545355962948342.




[I 2023-07-03 13:56:24,914] Trial 18 finished with value: 0.9409209868954809 and parameters: {'n_estimators': 588, 'num_leaves': 428, 'max_depth': 71, 'learning_rate': 0.1477169937590579, 'min_split_gain': 0.06259101392082667, 'feature_fraction': 0.4973362840731054, 'bagging_freq': 4}. Best is trial 12 with value: 0.9545355962948342.




[I 2023-07-03 13:59:22,563] Trial 19 finished with value: 0.9307204562201857 and parameters: {'n_estimators': 449, 'num_leaves': 456, 'max_depth': 19, 'learning_rate': 0.08045625433186519, 'min_split_gain': 0.09064360136930436, 'feature_fraction': 0.7106385577754225, 'bagging_freq': 6}. Best is trial 12 with value: 0.9545355962948342.




[I 2023-07-03 14:00:22,623] Trial 20 finished with value: 0.8926069323077723 and parameters: {'n_estimators': 139, 'num_leaves': 343, 'max_depth': 79, 'learning_rate': 0.1380534127962248, 'min_split_gain': 0.04503126292822689, 'feature_fraction': 0.6683067141387914, 'bagging_freq': 8}. Best is trial 12 with value: 0.9545355962948342.




[I 2023-07-03 14:02:42,234] Trial 21 finished with value: 0.9493807427347536 and parameters: {'n_estimators': 411, 'num_leaves': 349, 'max_depth': 91, 'learning_rate': 0.1491806046665702, 'min_split_gain': 0.028042897022967875, 'feature_fraction': 0.8337929782208551, 'bagging_freq': 3}. Best is trial 12 with value: 0.9545355962948342.




[I 2023-07-03 14:05:01,269] Trial 22 finished with value: 0.9521032674576293 and parameters: {'n_estimators': 363, 'num_leaves': 458, 'max_depth': 95, 'learning_rate': 0.14982739845078177, 'min_split_gain': 0.02309446375731033, 'feature_fraction': 0.7768453676892351, 'bagging_freq': 3}. Best is trial 12 with value: 0.9545355962948342.




[I 2023-07-03 14:07:29,018] Trial 23 finished with value: 0.9265211943680892 and parameters: {'n_estimators': 341, 'num_leaves': 464, 'max_depth': 96, 'learning_rate': 0.06370457056706422, 'min_split_gain': 0.0221470309288299, 'feature_fraction': 0.7927525523737704, 'bagging_freq': 4}. Best is trial 12 with value: 0.9545355962948342.




[I 2023-07-03 14:10:29,297] Trial 24 finished with value: 0.9356829747579433 and parameters: {'n_estimators': 524, 'num_leaves': 355, 'max_depth': 109, 'learning_rate': 0.06865002399701908, 'min_split_gain': 0.029751341143343196, 'feature_fraction': 0.8689348623423179, 'bagging_freq': 2}. Best is trial 12 with value: 0.9545355962948342.




[I 2023-07-03 14:14:59,278] Trial 25 finished with value: 0.9615111929027051 and parameters: {'n_estimators': 693, 'num_leaves': 511, 'max_depth': 93, 'learning_rate': 0.10280746422425602, 'min_split_gain': 0.017646719947123954, 'feature_fraction': 0.7669349761936317, 'bagging_freq': 4}. Best is trial 25 with value: 0.9615111929027051.




[I 2023-07-03 14:19:20,994] Trial 26 finished with value: 0.9577968311404902 and parameters: {'n_estimators': 672, 'num_leaves': 509, 'max_depth': 63, 'learning_rate': 0.09631600563606657, 'min_split_gain': 0.021017449466571376, 'feature_fraction': 0.7564959548316496, 'bagging_freq': 6}. Best is trial 25 with value: 0.9615111929027051.




[I 2023-07-03 14:23:39,093] Trial 27 finished with value: 0.9524125453055481 and parameters: {'n_estimators': 656, 'num_leaves': 512, 'max_depth': 67, 'learning_rate': 0.09402066870257342, 'min_split_gain': 0.016760132779716268, 'feature_fraction': 0.6072655812056732, 'bagging_freq': 6}. Best is trial 25 with value: 0.9615111929027051.




[I 2023-07-03 14:28:30,001] Trial 28 finished with value: 0.9526260226312102 and parameters: {'n_estimators': 781, 'num_leaves': 486, 'max_depth': 58, 'learning_rate': 0.05802903331666217, 'min_split_gain': 0.015769491777615177, 'feature_fraction': 0.7510905311611897, 'bagging_freq': 9}. Best is trial 25 with value: 0.9615111929027051.




[I 2023-07-03 14:33:00,740] Trial 29 finished with value: 0.9054686641731425 and parameters: {'n_estimators': 620, 'num_leaves': 419, 'max_depth': 30, 'learning_rate': 0.03299349604242962, 'min_split_gain': 0.019087741077648726, 'feature_fraction': 0.5285670236539418, 'bagging_freq': 6}. Best is trial 25 with value: 0.9615111929027051.




[I 2023-07-03 14:37:33,131] Trial 30 finished with value: 0.9639839649258637 and parameters: {'n_estimators': 732, 'num_leaves': 476, 'max_depth': 41, 'learning_rate': 0.09696302667271771, 'min_split_gain': 0.013955193706078131, 'feature_fraction': 0.9080993113718123, 'bagging_freq': 10}. Best is trial 30 with value: 0.9639839649258637.




[I 2023-07-03 14:42:04,196] Trial 31 finished with value: 0.9630670311411018 and parameters: {'n_estimators': 713, 'num_leaves': 476, 'max_depth': 41, 'learning_rate': 0.09486111356988691, 'min_split_gain': 0.014780278531057737, 'feature_fraction': 0.9098694831164946, 'bagging_freq': 10}. Best is trial 30 with value: 0.9639839649258637.




[I 2023-07-03 14:46:46,985] Trial 32 finished with value: 0.9638809690869754 and parameters: {'n_estimators': 699, 'num_leaves': 512, 'max_depth': 39, 'learning_rate': 0.09065116077290701, 'min_split_gain': 0.01643069646992642, 'feature_fraction': 0.9155064635757202, 'bagging_freq': 10}. Best is trial 30 with value: 0.9639839649258637.




[I 2023-07-03 14:51:43,473] Trial 33 finished with value: 0.9632914911872772 and parameters: {'n_estimators': 862, 'num_leaves': 420, 'max_depth': 37, 'learning_rate': 0.08714139722324872, 'min_split_gain': 0.009407188032293737, 'feature_fraction': 0.915144389050584, 'bagging_freq': 10}. Best is trial 30 with value: 0.9639839649258637.




[I 2023-07-03 14:56:34,559] Trial 34 finished with value: 0.958051038927984 and parameters: {'n_estimators': 873, 'num_leaves': 377, 'max_depth': 39, 'learning_rate': 0.0716140415106059, 'min_split_gain': 0.009161946736916561, 'feature_fraction': 0.9446078365438996, 'bagging_freq': 10}. Best is trial 30 with value: 0.9639839649258637.




[I 2023-07-03 15:01:13,274] Trial 35 finished with value: 0.9456563973261731 and parameters: {'n_estimators': 736, 'num_leaves': 425, 'max_depth': 27, 'learning_rate': 0.04979305564966731, 'min_split_gain': 0.013619776000698377, 'feature_fraction': 0.9222570324996335, 'bagging_freq': 9}. Best is trial 30 with value: 0.9639839649258637.


In [23]:
best_param = study.best_params
print(study.best_value)
best_param

0.9639839649258637


{'n_estimators': 732,
 'num_leaves': 476,
 'max_depth': 41,
 'learning_rate': 0.09696302667271771,
 'min_split_gain': 0.013955193706078131,
 'feature_fraction': 0.9080993113718123,
 'bagging_freq': 10}

In [27]:
model = LGBMClassifier(**best_param)

In [28]:
%%time
model.fit(X_train, y_train)

CPU times: total: 4min 53s
Wall time: 43.5 s


In [29]:
y_pred_val = model.predict(X_test)
accuracy_val = accuracy_score(y_test, y_pred_val)
print(f"Validation Accuracy: {accuracy_val}")

Validation Accuracy: 0.904445772673583


In [30]:
# model.booster_.save_model('../models/1-cls-07032023.txt')

<lightgbm.basic.Booster at 0x19f839248e0>

In [32]:
loaded_model = lgb.Booster(model_file='../models/1-cls-07032023.txt')

In [42]:
y_pred_val = loaded_model.predict(X_test)
accuracy_val = accuracy_score(y_test, y_pred_val > 0.5)
print(f"Validation Accuracy: {accuracy_val}")

Validation Accuracy: 0.904445772673583
