In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_selection import VarianceThreshold,mutual_info_classif,SelectKBest,SelectFromModel

from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import  GridSearchCV,cross_val_score,TimeSeriesSplit,cross_validate
from sklearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline
from sklearn import tree
from sklearn.metrics import confusion_matrix, recall_score, precision_score, roc_auc_score,roc_curve, auc,matthews_corrcoef,f1_score,brier_score_loss,make_scorer, metrics

In [2]:
#reading csv files
X_train=pd.read_csv('X_train_F.csv')
Y_train=pd.read_csv('Y_train_F.csv')
X_test=pd.read_csv('X_test_F.csv')
Y_test=pd.read_csv('Y_test_F.csv')

In [3]:
X_Data=pd.read_csv('X_Data.csv')
Y_Data=pd.read_csv('Y_Data.csv')

In [4]:
X_train= X_train[:70000]
Y_train=Y_train[:70000]

In [4]:
#SDG Classifier

In [5]:
from sklearn.preprocessing import StandardScaler

In [6]:
#lets try with first default value
clf = SGDClassifier(loss="log",class_weight="balanced")
feature_selector = SelectKBest(mutual_info_classif, k=15)
pipeline_SD = Pipeline([('feature_sele',feature_selector),('clf',clf)])


In [7]:
pipeline_SD.fit(X_train,Y_train.values.ravel())
y_trainpredprobem=pipeline_SD.predict_proba(X_train)
y_testpredprobem =pipeline_SD.predict_proba(X_test)

In [8]:
#auc score with default values
print('auc of train with default parameter:',roc_auc_score(Y_train,y_trainpredprobem[:,1]))
print('auc of test with  default parameter:',roc_auc_score(Y_test,y_testpredprobem[:,1]))

auc of train with default parameter: 0.6684462360295172
auc of test with  default parameter: 0.6973641785968249


In [9]:
print('brier_score_loss:',brier_score_loss(Y_test,y_testpredprobem[:,1]))

brier_score_loss: 0.19461369670764764


In [10]:
y_testpred_ =pipeline_SD.predict(X_test)
print("Recall:\n",metrics.recall_score(Y_test,y_testpred_))
print("matthews_corrcoef:\n",matthews_corrcoef(Y_test, y_testpred_))

Recall:
 0.5459216441875402
matthews_corrcoef:
 0.16712475030808316


In [13]:
##GridsearchCV and feature selection with nested cv 

time_splitter_outer = TimeSeriesSplit(n_splits = 10)
time_splitter_inner = TimeSeriesSplit(n_splits = 5)

model = SGDClassifier(loss="log",class_weight="balanced")
params = {
         'alpha':[10**-6,10**-5,10**-4,10**-3,10**-2,10**-1,10**0,10**1,10**2,10**3],#learning rate
         'penalty':['l1','l2']
         }
feature_selector = SelectKBest(mutual_info_classif, k=25)

gs_SD = GridSearchCV(model, param_grid = params, cv = time_splitter_inner, scoring = 'roc_auc',verbose=10, n_jobs=-1)

pipelineSD  = Pipeline([('feature_sele',feature_selector),
                      ('clf_cv',gs_SD)])

In [15]:
pipelineSD.fit(X_Data,Y_Data.values.ravel())

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:   34.7s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   42.1s finished


Pipeline(steps=[('feature_sele',
                 SelectKBest(k=25,
                             score_func=<function mutual_info_classif at 0x7fb62f408e60>)),
                ('clf_cv',
                 GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=5),
                              estimator=SGDClassifier(class_weight='balanced',
                                                      loss='log'),
                              n_jobs=-1,
                              param_grid={'alpha': [1e-06, 1e-05, 0.0001, 0.001,
                                                    0.01, 0.1, 1, 10, 100,
                                                    1000],
                                          'penalty': ['l1', 'l2']},
                              scoring='roc_auc', verbose=10))])

In [18]:
custom_scorer = {
                 'recall': make_scorer(recall_score),
                 'Brier': make_scorer(brier_score_loss),
                 'Matthewscorrcoef': make_scorer(matthews_corrcoef),
                 'ROC_AUC': make_scorer(roc_auc_score)
                 }
SD_scores = cross_validate(pipelineSD,X_Data,Y_Data.values.ravel(),cv = time_splitter_outer, scoring = custom_scorer)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0253s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1149s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  79 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    1.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:    1.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.9s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0568s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    3.3s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:    4.8s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.3s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0742s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:    6.3s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    7.3s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0941s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    6.1s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:    9.7s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   11.5s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1232s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0432s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:   10.4s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:   13.4s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   16.2s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1806s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1520s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:   15.0s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:   17.6s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   20.7s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1661s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0501s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done  67 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done  79 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:   13.3s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:   17.1s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   22.9s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1844s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1834s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done  67 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  79 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:   17.9s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   24.7s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:   19.2s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   27.0s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:   20.7s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   27.9s finished


In [19]:
sorted(SD_scores.keys())

['fit_time',
 'score_time',
 'test_Brier',
 'test_Matthewscorrcoef',
 'test_ROC_AUC',
 'test_recall']

In [20]:
SD_time=SD_scores['fit_time']
SD_recall = SD_scores['test_recall']
SD_Brier = SD_scores['test_Brier']
SD_Matthewscorrcoef = SD_scores['test_Matthewscorrcoef']
SD_ROC_AUC = SD_scores['test_ROC_AUC']

In [21]:
print("=== All recall Scores ===")
print(SD_recall)
print("=== All Brier Scores ===")
print(SD_Brier)
print("=== All Matthewscorrcoef Scores ===")
print(SD_Matthewscorrcoef)
print("=== All ROC_AUC  Scores ===")
print(SD_ROC_AUC )
print("=== All fit_time ===")
print(SD_time )

=== All recall Scores ===
[0.97727273 0.98165138 0.55793991 0.84577114 0.80952381 0.98540146
 0.59555556 0.41791045 0.60633484 0.45882353]
=== All Brier Scores ===
[0.95159516 0.93963146 0.16240374 0.74051155 0.7169967  0.90676568
 0.59254675 0.20860836 0.33594609 0.19526953]
=== All Matthewscorrcoef Scores ===
[-0.0040852   0.01322909  0.19153738  0.03398951  0.02822724  0.0317585
 -0.00105864  0.0719867   0.09846949  0.10400514]
=== All ROC_AUC  Scores ===
[0.49834992 0.50677409 0.70239658 0.54429697 0.53923882 0.53075259
 0.49850149 0.60815668 0.63609892 0.63591697]
=== All fit_time ===
[ 15.72785592  35.12854195  54.58489609  77.60266709 104.52630901
 127.64568996 152.58510804 176.49382901 199.62624383 221.15017796]


In [22]:
print("=== Mean recall Score ===")
print( SD_recall.mean())
print("=== Mean Brier Score ===")
print( SD_Brier.mean())
print("=== Mean Matthewscorrcoef  Score ===")
print( SD_Matthewscorrcoef .mean())
print("=== Mean ROC_AUC  Scores ===")
print(SD_ROC_AUC .mean())
print("=== Mean fit_time===")
print(SD_time .mean())

=== Mean recall Score ===
0.7236184805596511
=== Mean Brier Score ===
0.575027502750275
=== Mean Matthewscorrcoef  Score ===
0.056805921152688545
=== Mean ROC_AUC  Scores ===
0.5700483030769107
=== Mean fit_time===
116.50713188648224


In [23]:
print("=== Minimum recall Score ===")
print( SD_recall.min())
print("=== Minimum Brier Score ===")
print(  SD_Brier.min())
print("=== Minimum Matthewscorrcoef  Score ===")
print(  SD_Matthewscorrcoef.min())
print("=== Minimum ROC_AUC  Score ===")
print(  SD_ROC_AUC.min())
print("=== Minimum RF_time  ===")
print(  SD_time.min())

=== Minimum recall Score ===
0.417910447761194
=== Minimum Brier Score ===
0.1624037403740374
=== Minimum Matthewscorrcoef  Score ===
-0.004085201960300118
=== Minimum ROC_AUC  Score ===
0.4983499200742536
=== Minimum RF_time  ===
15.727855920791626


In [24]:
print("=== Maximum recall Score ===")
print( SD_recall.max())
print("=== Minimum Brier Score ===")
print( SD_Brier.min())
print("=== Maximum Matthewscorrcoef  Score ===")
print( SD_Matthewscorrcoef .max())
print("=== Maximum ROC_AUC  Scores ===")
print(SD_ROC_AUC .max())
print("=== Maximum fit_time ===")
print(SD_time .max())

=== Maximum recall Score ===
0.9854014598540146
=== Minimum Brier Score ===
0.1624037403740374
=== Maximum Matthewscorrcoef  Score ===
0.19153738129807613
=== Maximum ROC_AUC  Scores ===
0.7023965801814172
=== Maximum fit_time ===
221.15017795562744
