In [1]:
import math

import pandas as pd
import numpy as np
from keras import Input,backend
from keras.models import Model, Sequential
from keras.layers import *
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, matthews_corrcoef

Using TensorFlow backend.


In [2]:
dataset = pd.read_csv("../datasets/train_test_data.csv",
                           header=0, parse_dates=[0], index_col=0)

#params for generator
label_index = len(dataset.columns) -1


In [3]:
#hyperparamters
batch_size=64

#params for generator
delay=0
step=1 # 1 timestep = 1 day
lookback=10

#ratio for train/val/test split
train_ratio=0.7
val_ratio=0.15

train_max_idx = math.ceil(len(dataset)*train_ratio)
val_max_idx = math.ceil(len(dataset)*(train_ratio+val_ratio))

# 1 step = 1 batche of samples 
train_steps = (train_max_idx+1) // batch_size
val_steps =  (val_max_idx - train_max_idx - lookback) // batch_size
test_steps = (len(dataset) - val_max_idx - lookback) // batch_size

In [4]:
def generator(data, label_index, lookback, delay, min_index, max_index,
              shuffle=False, batch_size=64, step=1):
    if max_index is None:
        max_index = len(data) - delay - 1
    i = min_index + lookback
    while 1:
        if shuffle:
            rows = np.random.randint(
                min_index + lookback, max_index, size=batch_size)
        else:
            if i + batch_size >= max_index: 
                i = min_index + lookback #reset 'i'
            rows = np.arange(i, min(i + batch_size, max_index))
            i += len(rows)

        samples = np.zeros((len(rows),
                           lookback // step,
                           (data.shape[-1] - 1)))
        targets = np.zeros((len(rows),))
        for j, row in enumerate(rows):
            indices = range(rows[j] - lookback, rows[j], step)
            samples[j] = np.delete(data[indices], label_index, axis=1) # an np array without the label col
            targets[j] = data[rows[j] + delay][label_index]
        yield samples, targets

In [5]:
#init generators
train_rand_gen = generator(dataset.to_numpy(),
                      label_index=label_index,
                      lookback=lookback,
                      delay=delay,
                      min_index=0,
                      max_index=train_max_idx,
                      shuffle=True,
                      step=step, 
                      batch_size=batch_size)

train_gen = generator(dataset.to_numpy(),
                      label_index=label_index,
                      lookback=lookback,
                      delay=delay,
                      min_index=0,
                      max_index=train_max_idx,
                      shuffle=False,
                      step=step, 
                      batch_size=batch_size)


val_gen = generator(dataset.to_numpy(),
                    label_index=label_index,
                      lookback=lookback,
                      delay=delay,
                      min_index=train_max_idx+1,
                      max_index=val_max_idx,
                      shuffle=False,
                      step=step, 
                      batch_size=batch_size)

test_gen = generator(dataset.to_numpy(),
                      label_index=label_index,
                      lookback=lookback,
                      delay=delay,
                      min_index=val_max_idx+1,
                      max_index=None,
                      shuffle=False,
                      step=step, 
                      batch_size=batch_size)

In [6]:
def print_metric(y_true, Y_pred):
    print('F1 score: %f' % f1_score(y_true, Y_pred))
    print('precision score: %f' % precision_score(y_true, Y_pred))
    print('recall score: %f' % recall_score(y_true, Y_pred))
    print('accuracy score: %f' % accuracy_score(y_true, Y_pred))
    print('matthews_corrcoef: %f' % matthews_corrcoef(y_true, Y_pred))
    print('\nConfusion matrix:')
    print(confusion_matrix(y_true, Y_pred, labels=[0,1]))

In [7]:
def probs_to_binary_classes(preds, threshold=0.5):
    """
    preds: np array
    threshold: scalar
    """
    return np.where(preds > threshold, 1, 0)

In [8]:
def print_unique_counts(x):
    unique, counts = np.unique(x, return_counts=True)
    print(np.asarray((unique, counts)).T)


In [9]:
#build s FC model from the book

input_shape = (lookback//step, dataset.shape[-1] - 1)

model = Sequential()
model.add(Flatten(input_shape=input_shape))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

# #fit the model
model.fit_generator(train_rand_gen,
                    steps_per_epoch=train_steps,
                    epochs=50, verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                8032      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 8,065
Trainable params: 8,065
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50


<keras.callbacks.History at 0x7f97d53a29b0>

In [10]:
def generator_to_samples_and_targets(generator, steps):
    count=0
    X, Y = [], []
    for samples, targets in generator:
        if count >= steps:
            break;
        else:
            count+=1

        X.append(samples)
        Y.append(targets)

    return np.concatenate(X, axis=0), np.concatenate(Y, axis=0)


In [11]:
neg = 2762+158
pos = 177+807
print("We have {0} neg cases and {1} pos cases from train data".format(neg,pos))
print("the common sense baseline (accuracy score) is {0}".format(neg/(pos+neg)))

We have 2920 neg cases and 984 pos cases from train data
the common sense baseline (accuracy score) is 0.7479508196721312


In [12]:
#Make predictions for train set
    
X, Y = generator_to_samples_and_targets(train_gen, train_steps)    
Y_pred = model.predict(X)
print_metric(Y, probs_to_binary_classes(Y_pred))

F1 score: 0.500673
precision score: 0.741036
recall score: 0.378049
accuracy score: 0.809939
matthews_corrcoef: 0.432616

Confusion matrix:
[[2790  130]
 [ 612  372]]


In [13]:
#Make predictions from dev set
X, Y = generator_to_samples_and_targets(val_gen, val_steps)    
Y_pred = model.predict(X)
print_metric(Y, probs_to_binary_classes(Y_pred))


F1 score: 0.000000
precision score: 0.000000
recall score: 0.000000
accuracy score: 0.985577
matthews_corrcoef: -0.004015

Confusion matrix:
[[820   1]
 [ 11   0]]


In [14]:
#Make predictions for test set
X, Y = generator_to_samples_and_targets(test_gen, test_steps)    
Y_pred = model.predict(X)
print_metric(Y, probs_to_binary_classes(Y_pred))


F1 score: 0.000000
precision score: 0.000000
recall score: 0.000000
accuracy score: 0.963942
matthews_corrcoef: 0.000000

Confusion matrix:
[[802   0]
 [ 30   0]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
