In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_selection import VarianceThreshold,mutual_info_classif,SelectKBest,SelectFromModel

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import  GridSearchCV,cross_val_score,TimeSeriesSplit,cross_validate
from sklearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline
from sklearn import tree
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, recall_score, precision_score, roc_auc_score,roc_curve, auc,matthews_corrcoef,f1_score,brier_score_loss,make_scorer
 
from sklearn.decomposition import PCA

In [3]:
#reading csv files
#reading csv files
X_train=pd.read_csv('X_train_F.csv')
Y_train=pd.read_csv('Y_train_F.csv')
X_test=pd.read_csv('X_test_F.csv')
Y_test=pd.read_csv('Y_test_F.csv')

In [4]:
X_Data=pd.read_csv('X_Data.csv')
Y_Data=pd.read_csv('Y_Data.csv')

In [5]:
X_train= X_train[:70000]
Y_train=Y_train[:70000]

In [6]:
#Random forest

In [7]:
# with default paramter

clf = RandomForestClassifier(class_weight="balanced")
pca = PCA(.96)
pipeline_RF = Pipeline([('feature_sele',pca),('clf',clf)])

In [8]:
pipeline_RF.fit(X_train,Y_train.values.ravel())
y_trainpredprobem=pipeline_RF.predict_proba(X_train)
y_testpred = pipeline_RF.predict_proba(X_test)

In [9]:
# auc with default parameters
print('auc of train with default parameter:',roc_auc_score(Y_train,y_trainpredprobem[:,1]))
print('test auc with default parameter:',roc_auc_score(Y_test,y_testpred[:,1]))

auc of train with default parameter: 1.0
test auc with default parameter: 0.5371375985639915


In [9]:
print('brier_score_loss:',brier_score_loss(Y_test,y_testpred[:,1]))

brier_score_loss: 0.06631016597596268


In [10]:
y_testpred_ =pipeline_RF.predict(X_test)
print("Recall:\n",metrics.recall_score(Y_test,y_testpred_))
print("matthews_corrcoef:\n",matthews_corrcoef(Y_test, y_testpred_))

Recall:
 0.0
matthews_corrcoef:
 0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [11]:
#  parameters currently used
print('Parameters currently in use:\n')
print(clf.get_params())

Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [14]:
#GridSearchCV with timeseries
from sklearn.pipeline import Pipeline
time_splitter_inner = TimeSeriesSplit(n_splits = 3)
time_splitter_outer= TimeSeriesSplit(n_splits = 10)

model=RandomForestClassifier(class_weight="balanced")
params = {'n_estimators':[500,700,900,1000],
           'max_depth':[3,5,7,9,11,13]}
pca = PCA(.90)

gs_rf = GridSearchCV(model, param_grid = params, cv = time_splitter_inner, scoring = 'roc_auc',verbose=10, n_jobs=-1)

pipeline  = Pipeline([('feature_sele',pca),
                      ('clf_cv',gs_rf)])

In [15]:
pipeline.fit(X_Data,Y_Data.values.ravel())

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   53.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed: 10.3min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed: 12.8min finished


Pipeline(steps=[('feature_sele', PCA(n_components=0.9)),
                ('clf_cv',
                 GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
                              estimator=RandomForestClassifier(class_weight='balanced'),
                              n_jobs=-1,
                              param_grid={'max_depth': [3, 5, 7, 9, 11, 13],
                                          'n_estimators': [500, 700, 900,
                                                           1000]},
                              scoring='roc_auc', verbose=10))])

In [16]:
custom_scorer = {
                 'recall': make_scorer(recall_score),
                 'Brier': make_scorer(brier_score_loss),
                 'Matthewscorrcoef': make_scorer(matthews_corrcoef),
                 'ROC_AUC': make_scorer(roc_auc_score),
                 
                 }
forest_scores = cross_validate(pipeline,X_Data,Y_Data.values.ravel(), cv = time_splitter_outer, scoring = custom_scorer)
sorted(forest_scores.keys())

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   32.1s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   41.7s
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:   49.4s remaining:    5.3s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   57.2s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   42.2s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  1.6min remaining:   10.2s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  1.9min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  2.4min remaining:   15.3s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  2.9min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   33.7s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  3.5min remaining:   22.8s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  4.2min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   52.5s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  2.9min remaining:   18.9s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  3.6min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   37.0s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  3.6min remaining:   23.4s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  4.4min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   59.4s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  6.1min remaining:   39.3s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  7.4min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   35.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  7.3min remaining:   47.1s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  8.9min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   40.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  8.4min remaining:   54.2s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed: 10.2min finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   45.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:  9.2min remaining:   59.2s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed: 11.3min finished


['fit_time',
 'score_time',
 'test_Brier',
 'test_Matthewscorrcoef',
 'test_ROC_AUC',
 'test_f1',
 'test_recall']

In [17]:
RF_time=forest_scores['fit_time']
RF_recall = forest_scores['test_recall']
RF_Brier = forest_scores['test_Brier']
RF_Matthewscorrcoef = forest_scores['test_Matthewscorrcoef']
RF_ROC_AUC = forest_scores['test_ROC_AUC']


In [18]:
print("=== All recall Scores ===")
print(RF_recall)
print("=== All Brier Scores ===")
print(RF_Brier)
print("=== All Matthewscorrcoef Scores ===")
print(RF_Matthewscorrcoef)
print("=== All ROC_AUC  Score ===")
print( RF_ROC_AUC )
print("===  fit_time ===")
print(RF_time)

=== All recall Scores ===
[0.11827957 0.55865922 0.51832461 0.63       0.66101695 0.65517241
 0.62162162 0.63592233 0.60683761 0.62983425]
=== All Brier Scores ===
[0.03001729 0.18261826 0.16580229 0.21467861 0.21515009 0.22190791
 0.19959139 0.20634921 0.21797894 0.22693698]
=== All Matthewscorrcoef Scores ===
[0.2195084  0.16300584 0.16578823 0.1762435  0.17723239 0.16914386
 0.13818669 0.18729753 0.12678605 0.15977272]
=== All ROC_AUC  Score ===
[0.55695426 0.69176493 0.68114869 0.71018092 0.72470505 0.71836016
 0.71260224 0.71742519 0.6960701  0.7035454 ]
===  fit_time ===
[ 66.29376507 119.50782228 182.52688098 269.40343189 236.81748009
 284.68181396 489.91052794 562.35562396 674.35634685 710.30564713]


In [19]:
print("=== Mean recall Score ===")
print( RF_recall.mean())
print("=== Mean Brier Score ===")
print( RF_Brier.mean())
print("=== Mean Matthewscorrcoef  Score ===")
print( RF_Matthewscorrcoef .mean())
print("=== Mean ROC_AUC  Score ===")
print( RF_ROC_AUC .mean())
print("=== Mean RF_time  ===")
print( RF_time .mean())

=== Mean recall Score ===
0.5635668570745019
=== Mean Brier Score ===
0.1881030960238881
=== Mean Matthewscorrcoef  Score ===
0.16829651964738065
=== Mean ROC_AUC  Score ===
0.6912756934492857
=== Mean RF_time  ===
359.6159340143204


In [20]:
print("=== Minimum recall Score ===")
print( RF_recall.min())
print("=== Minimum Brier Score ===")
print( RF_Brier.min())
print("=== Minimum Matthewscorrcoef  Score ===")
print( RF_Matthewscorrcoef.min())
print("=== Minimum ROC_AUC  Score ===")
print( RF_ROC_AUC.min())
print("=== Minimum RF_time  ===")
print( RF_time.min())

=== Minimum recall Score ===
0.11827956989247312
=== Minimum Brier Score ===
0.030017287443030016
=== Minimum Matthewscorrcoef  Score ===
0.12678605121878037
=== Minimum ROC_AUC  Score ===
0.5569542579914041
=== Minimum RF_time  ===
66.2937650680542


In [21]:
print("=== Maximum recall Score ===")
print( RF_recall.max())
print("=== Minimum Brier Score ===")
print( RF_Brier.min())
print("=== Maximum Matthewscorrcoef  Score ===")
print( RF_Matthewscorrcoef .max())
print("=== Maximum ROC_AUC  Score ===")
print( RF_ROC_AUC .max())
print("=== Maximum RF_time ===")
print( RF_time .max())

=== Maximum recall Score ===
0.6610169491525424
=== Minimum Brier Score ===
0.030017287443030016
=== Maximum Matthewscorrcoef  Score ===
0.21950839678674983
=== Maximum ROC_AUC  Score ===
0.7247050474828344
=== Maximum RF_time ===
710.3056471347809
