In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_selection import VarianceThreshold,mutual_info_classif,SelectKBest,SelectFromModel

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

from sklearn.model_selection import  GridSearchCV,cross_val_score,TimeSeriesSplit,cross_validate
from sklearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline
from sklearn import tree
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, recall_score, precision_score, roc_auc_score,roc_curve, auc,matthews_corrcoef,f1_score,brier_score_loss,make_scorer
 

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
#reading csv files
X_train=pd.read_csv('X_train_F.csv')
Y_train=pd.read_csv('Y_train_F.csv')
X_test=pd.read_csv('X_test_F.csv')
Y_test=pd.read_csv('Y_test_F.csv')

In [4]:
X_Data=pd.read_csv('X_Data.csv')
Y_Data=pd.read_csv('Y_Data.csv')

In [5]:
X_train= X_train[:70000]
Y_train=Y_train[:70000]

In [5]:
#Random forest

In [6]:
# with default paramter

clf = RandomForestClassifier(class_weight="balanced")
feature_selector = SelectKBest(mutual_info_classif, k=15)
pipeline_RF = Pipeline([('feature_sele',feature_selector),('clf',clf)])

In [7]:
pipeline_RF.fit(X_train,Y_train.values.ravel())
y_trainpredprobem=pipeline_RF.predict_proba(X_train)
y_testpred = pipeline_RF.predict_proba(X_test)

In [8]:
# auc with default parameters
print('auc of train with default parameter:',roc_auc_score(Y_train,y_trainpredprobem[:,1]))
print('test auc with default parameter:',roc_auc_score(Y_test,y_testpred[:,1]))

auc of train with default parameter: 0.7890967875862566
test auc with default parameter: 0.49532348391462055


In [9]:
print('brier_score_loss:',brier_score_loss(Y_test,y_testpred[:,1]))

brier_score_loss: 0.06631016597596268


In [11]:
y_testpred_ =pipeline_RF.predict(X_test)
print("Recall:\n",metrics.recall_score(Y_test,y_testpred_))
print("matthews_corrcoef:\n",matthews_corrcoef(Y_test, y_testpred_))

Recall:
 0.0394990366088632
matthews_corrcoef:
 -0.02117381932887601


In [12]:
#  parameters currently used
print('Parameters currently in use:\n')
print(clf.get_params())

Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [16]:
#GridSearchCV and feature selection in nested cv
from sklearn.pipeline import Pipeline
time_splitter_inner = TimeSeriesSplit(n_splits = 3)
time_splitter_outer= TimeSeriesSplit(n_splits = 10)

model=RandomForestClassifier(class_weight="balanced")
params = {'n_estimators':[500,700,900,1000],
           'max_depth':[3,5,7,9,11,13]}
feature_selector = SelectKBest(mutual_info_classif, k=25)

gs_rf = GridSearchCV(model, param_grid = params, cv = time_splitter_inner, scoring = 'roc_auc',verbose=10, n_jobs=-1)

pipeline  = Pipeline([('feature_sele',feature_selector),
                      ('clf_cv',gs_rf)])

In [17]:
pipeline.fit(X_Data,Y_Data.values.ravel())

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   54.4s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  4.7min remaining:   30.1s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  5.5min finished


Pipeline(steps=[('feature_sele',
                 SelectKBest(k=25,
                             score_func=<function mutual_info_classif at 0x7ffa1fc04ef0>)),
                ('clf_cv',
                 GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
                              estimator=RandomForestClassifier(class_weight='balanced'),
                              n_jobs=-1,
                              param_grid={'max_depth': [3, 5, 7, 9, 11, 13],
                                          'n_estimators': [500, 700, 900,
                                                           1000]},
                              scoring='roc_auc', verbose=10))])

In [19]:
custom_scorer = {
                 'recall': make_scorer(recall_score),
                 'Brier': make_scorer(brier_score_loss),
                 'Matthewscorrcoef': make_scorer(matthews_corrcoef),
                 'ROC_AUC': make_scorer(roc_auc_score),
                 'f1': make_scorer(f1_score)
                 }
forest_scores = cross_validate(pipeline, X_Data,Y_Data.values.ravel(), cv = time_splitter_outer, scoring = custom_scorer)
sorted(forest_scores.keys())

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   27.5s
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:   32.4s remaining:    3.5s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   36.5s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   34.0s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   45.4s
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:   53.1s remaining:    5.7s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  1.0min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   46.9s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  1.2min remaining:    8.0s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  1.5min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   42.3s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  1.6min remaining:   10.6s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  1.9min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   38.5s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   54.2s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  2.1min remaining:   13.7s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  2.5min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   46.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  2.6min remaining:   16.8s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  3.1min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   54.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  3.1min remaining:   20.1s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  3.7min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   36.1s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  3.4min remaining:   22.3s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  4.1min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  3.8min remaining:   24.4s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  4.5min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   43.0s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  4.2min remaining:   26.8s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  5.0min finished


['fit_time',
 'score_time',
 'test_Brier',
 'test_Matthewscorrcoef',
 'test_ROC_AUC',
 'test_f1',
 'test_recall']

In [20]:
RF_time=forest_scores['fit_time']
RF_recall = forest_scores['test_recall']
RF_Brier = forest_scores['test_Brier']
RF_Matthewscorrcoef = forest_scores['test_Matthewscorrcoef']
RF_ROC_AUC = forest_scores['test_ROC_AUC']
RF_f1 = forest_scores['test_f1']

In [21]:
print("===  F1 scores ===")
print(RF_f1)
print("=== All recall Scores ===")
print(RF_recall)
print("=== All Brier Scores ===")
print(RF_Brier)
print("=== All Matthewscorrcoef Scores ===")
print(RF_Matthewscorrcoef)
print("=== All ROC_AUC  Score ===")
print( RF_ROC_AUC )
print("===  fit_time ===")
print(RF_time)

===  F1 scores ===
[0.29457364 0.29038113 0.27475248 0.4502924  0.18563923 0.39039039
 0.18259936 0.2382134  0.35185185 0.20494053]
=== All recall Scores ===
[0.20430108 0.44692737 0.58115183 0.385      0.59887006 0.37356322
 0.76576577 0.69902913 0.48717949 0.61878453]
=== All Brier Scores ===
[0.02860286 0.061449   0.09209492 0.02954581 0.14615747 0.03190319
 0.11959767 0.14474305 0.0330033  0.1365708 ]
=== All Matthewscorrcoef Scores ===
[0.31661285 0.28169801 0.28784029 0.44231772 0.2109002  0.37444505
 0.25328117 0.2717663  0.35074731 0.23217286]
=== All ROC_AUC  Score ===
[0.59939839 0.69985437 0.74958434 0.68722659 0.73000406 0.67918749
 0.82410169 0.77975656 0.73158206 0.74468829]
===  fit_time ===
[ 40.68857193  71.32628298 102.61993909 133.39612484 178.33364201
 213.115453   257.58026791 289.47608709 326.40497303 346.66295385]


In [22]:
print("=== mean F1 score ===")
print(RF_f1.mean())
print("=== Mean recall Score ===")
print( RF_recall.mean())
print("=== Mean Brier Score ===")
print( RF_Brier.mean())
print("=== Mean Matthewscorrcoef  Score ===")
print( RF_Matthewscorrcoef .mean())
print("=== Mean ROC_AUC  Score ===")
print( RF_ROC_AUC .mean())
print("=== Mean RF_time  ===")
print( RF_time .mean())

=== mean F1 score ===
0.2863634398895363
=== Mean recall Score ===
0.5160572466464792
=== Mean Brier Score ===
0.08236680810938238
=== Mean Matthewscorrcoef  Score ===
0.3021781758253513
=== Mean ROC_AUC  Score ===
0.7225383848134994
=== Mean RF_time  ===
195.9604295730591


In [23]:
print("=== Minimum F1 score ===")
print(RF_f1.min())
print("=== Minimum recall Score ===")
print( RF_recall.min())
print("=== Minimum Brier Score ===")
print( RF_Brier.min())
print("=== Minimum Matthewscorrcoef  Score ===")
print( RF_Matthewscorrcoef.min())
print("=== Minimum ROC_AUC  Score ===")
print( RF_ROC_AUC.min())
print("=== Minimum RF_time  ===")
print( RF_time.min())

=== Minimum F1 score ===
0.18259935553168638
=== Minimum recall Score ===
0.20430107526881722
=== Minimum Brier Score ===
0.028602860286028604
=== Minimum Matthewscorrcoef  Score ===
0.2109001957345315
=== Minimum ROC_AUC  Score ===
0.5993983925801752
=== Minimum RF_time  ===
40.68857192993164


In [24]:
print("=== Maximum F1 score ===")
print(RF_f1.max())
print("=== Maximum recall Score ===")
print( RF_recall.max())
print("=== Minimum Brier Score ===")
print( RF_Brier.min())
print("=== Maximum Matthewscorrcoef  Score ===")
print( RF_Matthewscorrcoef .max())
print("=== Maximum ROC_AUC  Score ===")
print( RF_ROC_AUC .max())
print("=== Maximum RF_time ===")
print( RF_time .max())

=== Maximum F1 score ===
0.45029239766081874
=== Maximum recall Score ===
0.7657657657657657
=== Minimum Brier Score ===
0.028602860286028604
=== Maximum Matthewscorrcoef  Score ===
0.44231772345896514
=== Maximum ROC_AUC  Score ===
0.824101692863689
=== Maximum RF_time ===
346.6629538536072
