In [1]:
import math
import collections

import pandas as pd
import numpy as np

import params
from utils.sequence_data import data_to_sequences_and_labels
from utils.metrics import print_report_for_binary_classfier
from utils.preprocessing import probs_to_binary_classes
from utils.plot import plot_train_validation_metric

In [2]:
dataset = pd.read_csv("../datasets/data_for_models/dataset_1996-01-01_2019-08-22.csv",
                           header=0, parse_dates=[0], index_col=0)

In [3]:
#params for generating sequences 
val_max_idx = math.ceil(len(dataset)*(params.TRAIN_RATIO+params.VAL_RATIO))
label_index = len(dataset.columns) -1


# prepare data
test_X, test_Y = data_to_sequences_and_labels(dataset.to_numpy(), params.LOOKBACK, 
                                                params.STEP, 
                                                val_max_idx+1, None, 
                                                params.DELAY,
                                                label_index)
X, Y = data_to_sequences_and_labels(dataset.to_numpy(), params.LOOKBACK,
                                   params.STEP,
                                   0, val_max_idx,
                                   params.DELAY
                                   ,label_index)

In [4]:
#Prepare for gridsearchCV
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV 
cv = TimeSeriesSplit(n_splits=3)
score = make_scorer(matthews_corrcoef, greater_is_better=True)


In [6]:
def flatten_sequences(sequences):
    row_size = sequences.shape[0]
    feature_size = sequences.shape[1] * sequences.shape[2]
    flattened_sequences = np.zeros((row_size, feature_size))
    for i in range(row_size):
        flattened_sequences[i] = sequences[i].flatten()
        
    return flattened_sequences

assert((flatten_sequences(test_X)[0][:test_X.shape[-1]] == test_X[0][0]).all())

In [7]:
#Flattern sequences from (LOOKBACK, features) to (LOOKBACK*features)
test_X_flattened = flatten_sequences(test_X)
X = flatten_sequences(X)

## SVM with Radial basis kernel

In [8]:
#Train SVM
from sklearn.svm import SVC
svm_tuned_parameters = {'kernel': ['rbf'],
                     'C': [1, 10, 50, 100, 1000]}
 

svm_clf = GridSearchCV(SVC(gamma='scale'), svm_tuned_parameters, cv=cv,
                   scoring=score)
svm_clf.fit(X,Y)

GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
             error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10, 50, 100, 1000], 'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(matthews_corrcoef), verbose=0)

In [13]:
#print reports for SVM
print_report_for_binary_classfier(test_Y,svm_clf.predict(test_X_flattened))

F1 score: 0.277778
precision score: 0.238095
recall score: 0.333333
accuracy score: 0.938169
matthews_corrcoef: 0.250229

Confusion matrix:
[[779  32]
 [ 20  10]]


## Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier

# randForeset_params = {'n_estimators' : [11,12,13,14,15], "max_features": ['sqrt', 'auto']}
randForeset_params = { 
    'n_estimators': [13, 50, 100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
}
random_forest_clf = GridSearchCV(RandomForestClassifier(), randForeset_params, cv=cv,
                   scoring=score)
random_forest_clf.fit(X,Y)

GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                     

In [27]:
print_report_for_binary_classfier(test_Y,random_forest_clf.predict(test_X_flattened))

F1 score: 0.181818
precision score: 0.285714
recall score: 0.133333
accuracy score: 0.957194
matthews_corrcoef: 0.175408

Confusion matrix:
[[801  10]
 [ 26   4]]


## KNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier
knn_params = {"weights":['uniform', 'distance'],
             'n_neighbors':range(3,7)}
knn = GridSearchCV(KNeighborsClassifier(), knn_params, cv=cv,
                   scoring=score)
knn.fit(X,Y)

GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
             error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': range(3, 7),
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(matthews_corrcoef), verbose=0)

In [22]:
knn.best_params_

{'n_neighbors': 5, 'weights': 'uniform'}

In [19]:
print_report_for_binary_classfier(test_Y,knn.predict(test_X_flattened))

F1 score: 0.000000
precision score: 0.000000
recall score: 0.000000
accuracy score: 0.951249
matthews_corrcoef: -0.022142

Confusion matrix:
[[800  11]
 [ 30   0]]


## QDA 

In [23]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda_params = {"priors":[[0.80, 0.2],[0.95,0.05], None]}
qda_classifier = GridSearchCV(QuadraticDiscriminantAnalysis(), qda_params, cv=cv, scoring=score)
qda_classifier.fit(X,Y)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
             error_score='raise-deprecating',
             estimator=QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
                                                     store_covariance=False,
                                                     tol=0.0001),
             iid='warn', n_jobs=None,
             param_grid={'priors': [[0.8, 0.2], [0.95, 0.05], None]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(matthews_corrcoef), verbose=0)

In [29]:

print_report_for_binary_classfier(test_Y,qda_classifier.predict(test_X_flattened))

F1 score: 0.000000
precision score: 0.000000
recall score: 0.000000
accuracy score: 0.947681
matthews_corrcoef: -0.025024

Confusion matrix:
[[797  14]
 [ 30   0]]
