# Tuning LightGBM

In [1]:
import numpy as np
import math
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, RidgeClassifier
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import SCORERS, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix, f1_score

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train = pd.read_csv('/users/stanislav/data/consumption/train.csv')

In [3]:
X_train, X_test, y_train, y_test = train_test_split(train.drop('TARGET', axis=1), train.TARGET,
                                                    train_size=0.7, random_state=42)

##### When I had done many prints of metrics, I guessed that I could  make some functions, so then I wrote it here below.

In [4]:
def fit_and_result(model):
    model.fit(X_train, y_train)
    
    print('accuracy: ', accuracy_score(y_test, model.predict(X_test)))
    print('recall:   ', recall_score(y_test, model.predict(X_test)))
    print('f1:       ', f1_score(y_test, model.predict(X_test)))

In [5]:
def feature_importance(model):
    return(pd.DataFrame(model.feature_importances_, index=X_train.columns, 
           columns=['Importance']).sort_values(by='Importance', ascending=False)[:10])

### Defalut models - xgb and lightgbm

In [6]:
xgb = XGBClassifier()

fit_and_result(xgb)

accuracy:  0.9598351311058494
recall:    0.001092896174863388
f1:        0.0021786492374727667


In [7]:
lgb = LGBMClassifier(random_state=42)

fit_and_result(lgb)

accuracy:  0.9593966500043848
recall:    0.003278688524590164
f1:        0.006437768240343348


### Tuning hyperparameters - complexity 

In [8]:
param_grid = {'num_leaves' : [7, 15, 31, 63],
              'max_depth' : [3, 4, 5, 6, -1],
              'reg_alpha' : range(0, 2)}

In [9]:
grid_searcher = GridSearchCV(estimator=lgb, param_grid=param_grid, cv=5, verbose=1, scoring='f1')

In [10]:
fit_and_result(grid_searcher)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed: 23.9min finished


accuracy:  0.9594404981145312
recall:    0.00546448087431694
f1:        0.0106951871657754


In [11]:
# best 'num_leaves' is on high border of given param_grid
# that means that better parameter may be higher and
# out of given grid 
grid_searcher.best_params_, grid_searcher.best_score_

({'max_depth': -1, 'num_leaves': 63, 'reg_alpha': 0}, 0.022947909852012758)

### Tuning hyperparameteres - convergence

In [12]:
# increase number of estimators from 100 (default) to 200
lgb2 = LGBMClassifier(random_state=42, max_depth=-1, num_leaves=63, n_estimators=200)

In [13]:
# np.logspace(-3, 0, 10) = array([0.001, 0.00215443, 0.00464159, 0.01, 0.02154435,
#                                 0.04641589, 0.1, 0.21544347, 0.46415888, 1.0])
param_grid2 = {'learning_rate' : np.logspace(-3, 0, 10)}

In [14]:
grid_searcher2 = GridSearchCV(estimator=lgb2, param_grid=param_grid2, cv=5, verbose=1, scoring='f1')

In [15]:
fit_and_result(grid_searcher2)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 18.8min finished


accuracy:  0.9337455055687099
recall:    0.15737704918032788
f1:        0.16008893829905504


In [16]:
# default learning_rate=0.1
grid_searcher2.best_params_, grid_searcher2.best_score_

({'learning_rate': 1.0}, 0.12855780953810678)

### Final LightGBM

In [17]:
final_lgb = LGBMClassifier(random_state=42, n_estimators=300, num_leaves=63, max_depth=-1, learning_rate=1)

In [18]:
fit_and_result(final_lgb)

accuracy:  0.8755590634043673
recall:    0.38579234972677595
f1:        0.1992099322799097


### One more final LightGBM

In [19]:
# increase num_leaves from 63 to 127 (didn't improve)
# increase n_estimators from 300 to 500 
# it made recall and f1 worse so I return previous param-s 
# then I increase only n_estimators and came to clue 
# that it does not improve metrics
final_lgb_two = LGBMClassifier(random_state=42, n_estimators=500, num_leaves=63, max_depth=-1, learning_rate=1)

In [20]:
fit_and_result(final_lgb_two)

accuracy:  0.8755590634043673
recall:    0.38579234972677595
f1:        0.1992099322799097


### Feature importance between default lgb and improved lgb

In [21]:
feature_importance(lgb)

Unnamed: 0,Importance
ID,337
var38,296
var15,272
saldo_medio_var5_hace3,146
saldo_medio_var5_hace2,121
saldo_var30,104
saldo_medio_var5_ult3,98
saldo_medio_var5_ult1,92
num_var45_hace3,85
saldo_var5,79


In [22]:
feature_importance(final_lgb)

Unnamed: 0,Importance
ID,2358
var38,1846
var15,1699
saldo_medio_var5_hace3,643
saldo_medio_var5_ult3,457
saldo_medio_var5_hace2,368
saldo_var5,320
num_var45_ult3,297
num_var45_hace2,260
num_var22_ult3,248


### Tuning model with  'scale_pos_weight' (default=1)

#### By hand

In [23]:
# scale_pos_weight = sqrt(count(negative examples)/count(Positive examples)) 
# sqrt 96 / 4 = 4.899 - isnt better than final_model_two which has recall and f1 of 0.38 and 0.2
# but with scale = 5 makes model's recall 0.51 but f1 0.12 and accuracy is 0.70 
# with scale = 6 accuracy, recall and f1 are 0.65, 0.63 and 0.13 
final_lgb_four = LGBMClassifier(random_state=42, n_estimators=500, num_leaves=63, max_depth=-1, learning_rate=1,
                                n_jobs=1, scale_pos_weight=4.899)

In [24]:
fit_and_result(final_lgb_four)

accuracy:  0.8868718758221521
recall:    0.17923497267759564
f1:        0.11279229711141678


#### With grid

In [25]:
final_lgb_five = LGBMClassifier(random_state=42, n_estimators=500, num_leaves=63, max_depth=-1, learning_rate=1)

In [26]:
param_grid4 = {'scale_pos_weight' :  [1, 4, 4.899, 5, 6, 7]}

In [27]:
grid_searcher4 = GridSearchCV(estimator=final_lgb_five, param_grid=param_grid4, cv=5, scoring='f1')

In [28]:
fit_and_result(grid_searcher4)

accuracy:  0.6960010523546435
recall:    0.5158469945355191
f1:        0.11984257966230799


In [29]:
# but with this parameter model isn't better than with defalut parameter (=1)
grid_searcher4.best_params_, grid_searcher4.best_score_

({'scale_pos_weight': 5}, 0.15081468357248878)

#### 'Is_balance'=True

In [30]:
# instead of scale_pos_weitght I try 'is_unbalance = True'
final_lgb_six = LGBMClassifier(random_state=42, n_estimators=500, num_leaves=63, max_depth=-1, learning_rate=1,
                               is_unbalance=True)

In [31]:
# with parameter 'is_unbalance=True' results are as the same as with 'scale_pos_weight'=6
fit_and_result(final_lgb_six)

accuracy:  0.6426817504165571
recall:    0.6426229508196721
f1:        0.12611260053619305


### Tuning model with bagging_fraction (default=1)

In [32]:
final_lgb_seven = LGBMClassifier(random_state=42, n_estimators=500, num_leaves=63, max_depth=-1, learning_rate=1)

In [33]:
param_grid5 = {'bagging_fraction' : np.logspace(-3, 0, 10)}

In [34]:
grid_searcher5 = GridSearchCV(estimator=final_lgb_seven, param_grid=param_grid5, cv=5, scoring='f1')

In [35]:
fit_and_result(grid_searcher5)

accuracy:  0.8755590634043673
recall:    0.38579234972677595
f1:        0.1992099322799097


In [36]:
grid_searcher5.best_params_, grid_searcher5.best_score_

({'bagging_fraction': 0.001}, 0.13569482223375434)

### Tuning model with 'boosting' (default='gbdt')

In [37]:
final_lgb_eight = LGBMClassifier(random_state=42, n_estimators=500, num_leaves=63, max_depth=-1, learning_rate=1,
                                 bagging_fraction=0.001)

In [38]:
param_grid6 = {'boosting' : ['gbdt', 'rf', 'dart', 'goss']}

In [39]:
grid_searcher6 = GridSearchCV(estimator=final_lgb_eight, param_grid=param_grid6, cv=5, scoring='f1')

In [40]:
fit_and_result(grid_searcher6)

lightgbm.basic.LightGBMError: Check failed: config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f at /tmp/pip-req-build-ny46z16n/compile/src/boosting/rf.hpp, line 35 .




accuracy:  0.9341401385600281
recall:    0.14426229508196722
f1:        0.14949037372593432


In [41]:
grid_searcher6.best_params_, grid_searcher6.best_score_

({'boosting': 'goss'}, 0.13985023089072035)