In [1]:
import collections
import math

import numpy as np
import pandas as pd

from keras import regularizers
from keras.models import Model, Sequential
from keras.layers import *

import params
from utils.sequence_data import generator_for_binary_classifier, generator_to_samples_and_targets
from utils.metrics import print_report_for_binary_classfier
from utils.preprocessing import probs_to_binary_classes
from utils.plot import plot_train_validation_metric

Using TensorFlow backend.


In [2]:
# Read the prepared dataset: row 0 is the header, and the first CSV column is
# parsed as dates and used as the index.
dataset = pd.read_csv(
    "../datasets/data_for_models/dataset_1996-01-01_2019-08-22.csv",
    index_col=0,
    parse_dates=[0],
    header=0,
)

# Parameters shared by the generators and the model.
# label_index is the position of the last column of the frame.
label_index = dataset.shape[-1] - 1
# (number of time steps kept per sample, number of columns in the frame)
input_shape = (params.LOOKBACK // params.STEP, len(dataset.columns))

In [3]:
# Upper row bounds of the train and validation partitions, rounded up.
train_max_idx, val_max_idx = (
    math.ceil(len(dataset) * ratio)
    for ratio in (params.TRAIN_RATIO, params.TRAIN_RATIO + params.VAL_RATIO)
)

# One step = one batch of samples.
# NOTE(review): only the train count adds 1 before subtracting LOOKBACK;
# confirm this matches how the generators treat min_index/max_index.
train_steps = (train_max_idx + 1 - params.LOOKBACK) // params.BATCH_SIZE
val_steps = (val_max_idx - train_max_idx - params.LOOKBACK) // params.BATCH_SIZE
test_steps = (len(dataset) - val_max_idx - params.LOOKBACK) // params.BATCH_SIZE

In [4]:
# Build one batch generator per partition (train / validation / test).
# Every generator gets the same settings; only the row window differs.
_gen_kwargs = dict(
    label_index=label_index,
    lookback=params.LOOKBACK,
    delay=params.DELAY,
    shuffle=False,
    step=params.STEP,
    batch_size=params.BATCH_SIZE,
    interval_label=True,
)

# Rows [0, train_max_idx]
train_gen = generator_for_binary_classifier(
    dataset.to_numpy(), min_index=0, max_index=train_max_idx, **_gen_kwargs
)

# Rows [train_max_idx + 1, val_max_idx]
val_gen = generator_for_binary_classifier(
    dataset.to_numpy(), min_index=train_max_idx + 1, max_index=val_max_idx, **_gen_kwargs
)

# Rows from val_max_idx + 1 onwards; max_index=None presumably means
# "until the end of the data" (confirm in utils.sequence_data).
test_gen = generator_for_binary_classifier(
    dataset.to_numpy(), min_index=val_max_idx + 1, max_index=None, **_gen_kwargs
)



In [5]:
def fc_model(hidden_unit, n_layer, l2_weight, input_shape):
    """Build and compile a fully connected binary classifier.

    The network flattens each input sample, applies ``n_layer`` ReLU dense
    layers of ``hidden_unit`` units (each with its own L2 kernel regularizer
    of strength ``l2_weight``), and ends with a single sigmoid unit.

    Parameters
    ----------
    hidden_unit : int
        Number of units in every hidden dense layer.
    n_layer : int
        Number of hidden dense layers.
    l2_weight : float
        L2 regularization factor applied to each hidden layer's kernel.
    input_shape : tuple
        Shape of one input sample, passed to the ``Flatten`` layer.

    Returns
    -------
    Sequential
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        the accuracy metric.
    """
    stack = [Flatten(input_shape=input_shape)]
    stack.extend(
        Dense(hidden_unit,
              activation='relu',
              kernel_regularizer=regularizers.l2(l2_weight))
        for _ in range(n_layer)
    )
    stack.append(Dense(1, activation='sigmoid'))

    network = Sequential(stack)
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

In [6]:
def min_avg_model_loss(history):
    """Return the epoch with the lowest mean of training and validation loss.

    Parameters
    ----------
    history : object
        Object exposing a ``history`` mapping with equally long ``'loss'`` and
        ``'val_loss'`` sequences, one value per epoch (as produced by
        ``model.fit_generator`` in this notebook).

    Returns
    -------
    tuple
        ``(epoch, avg_loss)`` where ``epoch`` is 1-based and ``avg_loss`` is
        ``(loss + val_loss) / 2`` at that epoch. On ties the earliest epoch
        wins. Epochs whose average is NaN are ignored; if every epoch is NaN
        the result is ``(1, nan)``.

    Raises
    ------
    ValueError
        If the history contains no epochs.
    """
    loss = np.asarray(history.history['loss'])
    val_loss = np.asarray(history.history['val_loss'])
    avg_loss = (loss + val_loss) / 2

    if avg_loss.size == 0:
        raise ValueError("history contains no epochs to evaluate")

    # Python's min() over a list containing NaN is order-dependent (a leading
    # NaN "wins"), so NaN epochs are skipped explicitly instead.
    if np.isnan(avg_loss).all():
        return 1, float('nan')

    min_idx = int(np.nanargmin(avg_loss))  # first occurrence of the minimum
    return min_idx + 1, float(avg_loss[min_idx])

In [None]:
def hyperparams_search(model_func, train_steps, val_steps, train_gen, val_gen, num_epoch, hidden_units, n_layers, l2_weights, model_input_shape=None):
    """Grid-search hyperparameters by minimal average of train and val loss.

    Every combination of ``l2_weights`` x ``n_layers`` x ``hidden_units`` is
    used to build a model with ``model_func``, which is then trained for
    ``num_epoch`` epochs. Each run is scored with ``min_avg_model_loss`` and
    the combination with the lowest score is kept.

    Parameters
    ----------
    model_func : callable
        Called as ``model_func(hidden_unit, n_layer, l2_weight, input_shape)``;
        must return a compiled model exposing ``fit_generator``.
    train_steps, val_steps : int
        Passed as ``steps_per_epoch`` / ``validation_steps``.
    train_gen, val_gen : generator
        Training and validation batch generators. The same objects are passed
        to every trial.
        NOTE(review): if these are stateful iterators, each trial resumes
        where the previous one stopped - confirm this is intended.
    num_epoch : int
        Number of epochs to train each candidate.
    hidden_units, n_layers, l2_weights : iterable
        Candidate values for each hyperparameter.
    model_input_shape : tuple, optional
        Input shape forwarded to ``model_func``. When omitted, the
        notebook-level ``input_shape`` variable is used, which is what this
        function read implicitly before the parameter existed.

    Returns
    -------
    dict
        Keys ``'epochs'`` (best 1-based epoch of the winning run),
        ``'n_layer'``, ``'hidden_unti'`` and ``'l2_weight'``. The dict is
        empty when no trial produced a loss below infinity (for example when
        every run ended in NaN).
    """
    # Prefer the explicit argument; fall back to the notebook global so that
    # existing calls keep working unchanged.
    shape = input_shape if model_input_shape is None else model_input_shape

    # NOTE(review): every trial builds a new model in the same backend
    # session; with large grids consider clearing the session between trials.
    min_avg_loss = float('inf')
    best_params = {}
    for l2_weight in l2_weights:
        for n_layer in n_layers:
            for hidden_unit in hidden_units:
                # build and compile a fresh model for this combination
                model = model_func(hidden_unit, n_layer, l2_weight, shape)

                # fit the model
                history = model.fit_generator(train_gen,
                                    steps_per_epoch=train_steps,
                                    validation_data=val_gen,
                                    validation_steps=val_steps,
                                    epochs=num_epoch, verbose=0)

                cur_num_epoch, cur_avg_loss = min_avg_model_loss(history)

                # a NaN score compares False here, so diverged runs are skipped
                if cur_avg_loss < min_avg_loss:
                    min_avg_loss = cur_avg_loss
                    best_params = {
                        'epochs': cur_num_epoch,
                        'n_layer': n_layer,
                        # misspelled key kept on purpose: the training cell
                        # further down reads best_params['hidden_unti']
                        'hidden_unti': hidden_unit,
                        'l2_weight': l2_weight,
                    }

    return best_params

In [None]:
# Candidate values explored by the grid search.
_search_grid = {
    'hidden_units': [16, 32, 64],
    'n_layers': [1, 2, 3, 4],
    'l2_weights': [0, 0.0005, 0.00075, 0.001, 0.0015, 0.01],
}

# Train every combination for 3000 epochs and keep the best-scoring one.
best_params = hyperparams_search(
    model_func=fc_model,
    train_gen=train_gen,
    val_gen=val_gen,
    train_steps=train_steps,
    val_steps=val_steps,
    num_epoch=3000,
    **_search_grid
)
print(best_params)

In [None]:
# Unpack the winning configuration returned by the search.
num_epoch, n_layer, hidden_unit, l2_weight = (
    best_params[key] for key in ('epochs', 'n_layer', 'hidden_unti', 'l2_weight')
)

# Rebuild the network with that configuration and train it from scratch.
model = fc_model(hidden_unit, n_layer, l2_weight, input_shape)
history = model.fit_generator(
    train_gen,
    steps_per_epoch=train_steps,
    epochs=num_epoch,
    validation_data=val_gen,
    validation_steps=val_steps,
    verbose=0,
)

# Training loss vs. validation loss per epoch.
plot_train_validation_metric(history, 'loss', 'val_loss')

### We have 2984 positive and 984 negative cases.

Make a function that takes lists of hyperparameters and returns the best model with a certain combination of hyperparameters.

def hyperparams_search(epochs, l2_weights, n_layers, hidden_units):
    ...
    
    return the best combination with the minimal average of train and val loss

In [None]:
# Training split: collect samples and targets from the generator, predict,
# then print the target class counts followed by the classification report.
X, Y = generator_to_samples_and_targets(train_gen, train_steps)
Y_pred = model.predict(X)

print(collections.Counter(Y))
Y_pred_classes = probs_to_binary_classes(Y_pred)
print_report_for_binary_classfier(Y, Y_pred_classes)

In [None]:
# Validation (dev) split: collect samples and targets from the generator,
# predict, then print the target class counts and the classification report.
X, Y = generator_to_samples_and_targets(val_gen, val_steps)
Y_pred = model.predict(X)

print(collections.Counter(Y))
Y_pred_classes = probs_to_binary_classes(Y_pred)
print_report_for_binary_classfier(Y, Y_pred_classes)

In [None]:
# Test split: collect samples and targets from the generator, predict,
# then print the target class counts followed by the classification report.
X, Y = generator_to_samples_and_targets(test_gen, test_steps)
Y_pred = model.predict(X)

print(collections.Counter(Y))
Y_pred_classes = probs_to_binary_classes(Y_pred)
print_report_for_binary_classfier(Y, Y_pred_classes)

# Save the best model 

In [26]:
from keras.models import load_model

# Persist the trained model to the path configured in params.BEST_MODEL_PATH.
# NOTE(review): presumably written as an HDF5 file - confirm against the
# configured path/extension.
model.save(params.BEST_MODEL_PATH)