In [1]:
import math
import collections

import pandas as pd
import numpy as np

import params
from utils.sequence_data import data_to_sequences_and_labels
from utils.metrics import print_report_for_binary_classfier
from utils.preprocessing import probs_to_binary_classes
from utils.plot import plot_train_validation_metric

In [2]:
dataset = pd.read_csv("../datasets/data_for_models/dataset_1996-01-01_2019-08-22.csv",
                           header=0, parse_dates=[0], index_col=0)

input_shape = (params.LOOKBACK//params.STEP, dataset.shape[-1] )

In [3]:
#params for generating sequences 
train_max_idx = math.ceil(len(dataset)*params.TRAIN_RATIO)
val_max_idx = math.ceil(len(dataset)*(params.TRAIN_RATIO+params.VAL_RATIO))
label_index = len(dataset.columns) -1


# prepare data
train_X, train_Y = data_to_sequences_and_labels(dataset.to_numpy(), params.LOOKBACK, 
                                                params.STEP, 
                                                0, train_max_idx, 
                                                params.DELAY,
                                                label_index) 
val_X, val_Y = data_to_sequences_and_labels(dataset.to_numpy(), params.LOOKBACK, 
                                                params.STEP, 
                                                train_max_idx+1, val_max_idx, 
                                                params.DELAY,
                                                label_index)
test_X, test_Y = data_to_sequences_and_labels(dataset.to_numpy(), params.LOOKBACK, 
                                                params.STEP, 
                                                val_max_idx+1, None, 
                                                params.DELAY,
                                                label_index)

In [10]:
def flatten_sequences(sequences):
    row_size = sequences.shape[0]
    feature_size = sequences.shape[1] * sequences.shape[2]
    flattened_sequences = np.zeros((row_size, feature_size))
    for i in range(row_size):
        flattened_sequences[i] = sequences[i].flatten()
        
    return flattened_sequences

assert((flatten_sequences(train_X)[0][:train_X.shape[-1]] == train_X[0][0]).all())

In [25]:
#Flattern sequences from (LOOKBACK, features) to (LOOKBACK*features)

train_X_flattened = flatten_sequences(train_X)
val_X_flattened = flatten_sequences(val_X)
test_X_flattened = flatten_sequences(test_X)


In [30]:
#Train SVM

from sklearn.svm import SVC
svm_clf = SVC(gamma='auto')
svm_clf.fit(train_X_flattened, train_Y,verbose=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [35]:
#print reports for SVM
print_report_for_binary_classfier(train_Y, svm_clf.predict(train_X_flattened))
print_report_for_binary_classfier(val_Y, svm_clf.predict(val_X_flattened))
print_report_for_binary_classfier(test_Y,svm_clf.predict(test_X_flattened))

F1 score: 0.045680
precision score: 1.000000
recall score: 0.023374
accuracy score: 0.761716
matthews_corrcoef: 0.133313

Confusion matrix:
[[3049    0]
 [ 961   23]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


F1 score: 0.000000
precision score: 0.000000
recall score: 0.000000
accuracy score: 0.987104
matthews_corrcoef: 0.000000

Confusion matrix:
[[842   0]
 [ 11   0]]
F1 score: 0.000000
precision score: 0.000000
recall score: 0.000000
accuracy score: 0.964328
matthews_corrcoef: 0.000000

Confusion matrix:
[[811   0]
 [ 30   0]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [47]:
from sklearn.ensemble import RandomForestClassifier

random_forest_clf = RandomForestClassifier(n_estimators=300, verbose=1, max_features='auto')
random_forest_clf.fit(train_X_flattened, train_Y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:   21.2s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=1, warm_start=False)

In [48]:
print_report_for_binary_classfier(train_Y, random_forest_clf.predict(train_X_flattened))
print_report_for_binary_classfier(val_Y, random_forest_clf.predict(val_X_flattened))
print_report_for_binary_classfier(test_Y,random_forest_clf.predict(test_X_flattened))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


F1 score: 1.000000
precision score: 1.000000
recall score: 1.000000
accuracy score: 1.000000
matthews_corrcoef: 1.000000

Confusion matrix:
[[3049    0]
 [   0  984]]
F1 score: 0.054054
precision score: 0.031746
recall score: 0.181818
accuracy score: 0.917937
matthews_corrcoef: 0.047182

Confusion matrix:
[[781  61]
 [  9   2]]
F1 score: 0.000000
precision score: 0.000000
recall score: 0.000000
accuracy score: 0.964328
matthews_corrcoef: 0.000000

Confusion matrix:
[[811   0]
 [ 30   0]]


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.1s finished
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [50]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(train_X_flattened, train_Y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [51]:
print_report_for_binary_classfier(train_Y, knn.predict(train_X_flattened))
print_report_for_binary_classfier(val_Y, knn.predict(val_X_flattened))
print_report_for_binary_classfier(test_Y,knn.predict(test_X_flattened))

F1 score: 0.850829
precision score: 0.841112
recall score: 0.860772
accuracy score: 0.926358
matthews_corrcoef: 0.802043

Confusion matrix:
[[2889  160]
 [ 137  847]]
F1 score: 0.000000
precision score: 0.000000
recall score: 0.000000
accuracy score: 0.901524
matthews_corrcoef: -0.034967

Confusion matrix:
[[769  73]
 [ 11   0]]
F1 score: 0.090909
precision score: 0.142857
recall score: 0.066667
accuracy score: 0.952438
matthews_corrcoef: 0.075192

Confusion matrix:
[[799  12]
 [ 28   2]]
