In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_selection import VarianceThreshold,mutual_info_classif,SelectKBest,SelectFromModel

from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import  GridSearchCV,cross_val_score,TimeSeriesSplit,cross_validate
from sklearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline
from sklearn import tree
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, recall_score, precision_score, roc_auc_score,roc_curve, auc,matthews_corrcoef,f1_score,brier_score_loss,make_scorer
 
from sklearn.decomposition import PCA

In [2]:
#reading csv files
X_train=pd.read_csv('X_train_F.csv')
Y_train=pd.read_csv('Y_train_F.csv')
X_test=pd.read_csv('X_test_F.csv')
Y_test=pd.read_csv('Y_test_F.csv')

In [3]:
X_Data=pd.read_csv('X_Data.csv')
Y_Data=pd.read_csv('Y_Data.csv')

In [4]:
X_train= X_train[:70000]
Y_train=Y_train[:70000]

In [5]:
#SDG Classifier

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
#lets try with first default value
clf = SGDClassifier(loss="log",class_weight="balanced")
pca = PCA(.96)
pipeline_SD = Pipeline([('feature_sele',pca),('clf',clf)])


In [8]:
pipeline_SD.fit(X_train,Y_train.values.ravel())
y_trainpredprobem=pipeline_SD.predict_proba(X_train)
y_testpredprobem =pipeline_SD.predict_proba(X_test)

In [9]:
#auc score with default values
print('auc of train with default parameter:',roc_auc_score(Y_train,y_trainpredprobem[:,1]))
print('auc of test with  default parameter:',roc_auc_score(Y_test,y_testpredprobem[:,1]))

auc of train with default parameter: 0.501071880367776
auc of test with  default parameter: 0.673548598113101


In [10]:
print('brier_score_loss:',brier_score_loss(Y_test,y_testpredprobem[:,1]))

brier_score_loss: 0.3160322721602227


In [11]:
y_testpred_ =pipeline_SD.predict(X_test)
print("Recall:\n",metrics.recall_score(Y_test,y_testpred_))
print("matthews_corrcoef:\n",matthews_corrcoef(Y_test, y_testpred_))

Recall:
 0.6746949261400128
matthews_corrcoef:
 0.13979987583087938


In [14]:
##GridsearchCV with Timeseries

time_splitter_outer = TimeSeriesSplit(n_splits = 10)
time_splitter_inner = TimeSeriesSplit(n_splits = 5)

model = SGDClassifier(loss="log",class_weight="balanced")
params = {
         'alpha':[10**-6,10**-5,10**-4,10**-3,10**-2,10**-1,10**0,10**1,10**2,10**3],#learning rate
         'penalty':['l1','l2']
         }

pca = PCA(.96)
gs_SD = GridSearchCV(model, param_grid = params, cv = time_splitter_inner, scoring = 'roc_auc',verbose=10, n_jobs=-1)

pipelineSD  = Pipeline([('feature_sele',pca),
                      ('clf_cv',gs_SD)])

In [15]:
pipelineSD.fit(X_Data,Y_Data.values.ravel())

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:   17.8s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   32.9s finished


Pipeline(steps=[('feature_sele', PCA(n_components=0.96)),
                ('clf_cv',
                 GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=5),
                              estimator=SGDClassifier(class_weight='balanced',
                                                      loss='log'),
                              n_jobs=-1,
                              param_grid={'alpha': [1e-06, 1e-05, 0.0001, 0.001,
                                                    0.01, 0.1, 1, 10, 100,
                                                    1000],
                                          'penalty': ['l1', 'l2']},
                              scoring='roc_auc', verbose=10))])

In [16]:
custom_scorer = {
                 'recall': make_scorer(recall_score),
                 'Brier': make_scorer(brier_score_loss),
                 'Matthewscorrcoef': make_scorer(matthews_corrcoef),
                 'ROC_AUC': make_scorer(roc_auc_score)
                 }
SD_scores = cross_validate(pipelineSD,X_Data,Y_Data.values.ravel(),cv = time_splitter_outer, scoring = custom_scorer)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0204s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0919s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  62 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    0.6s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.6s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0242s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1962s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  59 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:    1.9s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.0s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0371s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    2.1s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:    2.6s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.1s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0760s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:    3.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.6s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0619s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    3.2s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:    8.1s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.4s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0797s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  19 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    3.9s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:    8.9s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   12.0s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0965s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:    4.6s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:   11.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   12.8s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1323s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  35 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Batch computation too slow (3.9354s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:   14.2s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:   17.1s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   22.8s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1093s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  35 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0005s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done  85 out of 100 | elapsed:   12.4s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:   19.3s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   24.0s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1069s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.4240s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  36 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done  75 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:   16.0s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   24.5s finished


In [17]:
sorted(SD_scores.keys())

['fit_time',
 'score_time',
 'test_Brier',
 'test_Matthewscorrcoef',
 'test_ROC_AUC',
 'test_recall']

In [18]:
SD_time=SD_scores['fit_time']
SD_recall = SD_scores['test_recall']
SD_Brier = SD_scores['test_Brier']
SD_Matthewscorrcoef = SD_scores['test_Matthewscorrcoef']
SD_ROC_AUC = SD_scores['test_ROC_AUC']

In [19]:
print("=== All recall Scores ===")
print(SD_recall)
print("=== All Brier Scores ===")
print(SD_Brier)
print("=== All Matthewscorrcoef Scores ===")
print(SD_Matthewscorrcoef)
print("=== All ROC_AUC  Scores ===")
print(SD_ROC_AUC )
print("=== All fit_time ===")
print(SD_time )

=== All recall Scores ===
[0.65053763 0.57541899 0.60732984 0.645      0.62146893 0.59195402
 0.63963964 0.6407767  0.60683761 0.56353591]
=== All Brier Scores ===
[0.42008487 0.24139557 0.22976583 0.24453874 0.31573157 0.22662266
 0.29671539 0.25742574 0.24893918 0.24422442]
=== All Matthewscorrcoef Scores ===
[0.07771535 0.13034011 0.15370322 0.16202758 0.10819504 0.14319721
 0.09814617 0.15481906 0.11145821 0.12511367]
=== All ROC_AUC  Scores ===
[0.6141631  0.66966293 0.69130264 0.70202296 0.65376712 0.68521598
 0.67202711 0.69337844 0.68030001 0.66246999]
=== All fit_time ===
[ 1.98477006  4.41726708  3.64722514  9.24949074  9.14145994 13.03864694
 13.78394079 24.18591499 25.61284804 26.14775229]


In [20]:
print("=== Mean recall Score ===")
print( SD_recall.mean())
print("=== Mean Brier Score ===")
print( SD_Brier.mean())
print("=== Mean Matthewscorrcoef  Score ===")
print( SD_Matthewscorrcoef .mean())
print("=== Mean ROC_AUC  Scores ===")
print(SD_ROC_AUC .mean())
print("=== Mean fit_time===")
print(SD_time .mean())

=== Mean recall Score ===
0.6142499278404707
=== Mean Brier Score ===
0.2725443972968726
=== Mean Matthewscorrcoef  Score ===
0.1264715628086997
=== Mean ROC_AUC  Scores ===
0.6724310286123917
=== Mean fit_time===
13.120931601524353


In [21]:
print("=== Minimum recall Score ===")
print( SD_recall.min())
print("=== Minimum Brier Score ===")
print(  SD_Brier.min())
print("=== Minimum Matthewscorrcoef  Score ===")
print(  SD_Matthewscorrcoef.min())
print("=== Minimum ROC_AUC  Score ===")
print(  SD_ROC_AUC.min())
print("=== Minimum RF_time  ===")
print(  SD_time.min())

=== Minimum recall Score ===
0.56353591160221
=== Minimum Brier Score ===
0.22662266226622663
=== Minimum Matthewscorrcoef  Score ===
0.07771534899406542
=== Minimum ROC_AUC  Score ===
0.6141631024560413
=== Minimum RF_time  ===
1.9847700595855713


In [22]:
print("=== Maximum recall Score ===")
print( SD_recall.max())
print("=== Minimum Brier Score ===")
print( SD_Brier.min())
print("=== Maximum Matthewscorrcoef  Score ===")
print( SD_Matthewscorrcoef .max())
print("=== Maximum ROC_AUC  Scores ===")
print(SD_ROC_AUC .max())
print("=== Maximum fit_time ===")
print(SD_time .max())

=== Maximum recall Score ===
0.6505376344086021
=== Minimum Brier Score ===
0.22662266226622663
=== Maximum Matthewscorrcoef  Score ===
0.16202758409710472
=== Maximum ROC_AUC  Scores ===
0.7020229595975985
=== Maximum fit_time ===
26.147752285003662
