In [63]:
import plaidml.keras
plaidml.keras.install_backend()

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

import statsmodels.api as sm
import statsmodels.tsa.api as smt
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.tsa.stattools as ts
from statsmodels.tsa.stattools import adfuller

from fbprophet import Prophet

import math

# import pyflux as pf

import warnings
warnings.filterwarnings('ignore')

import itertools

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

from sklearn.model_selection import TimeSeriesSplit

from pandas.plotting import autocorrelation_plot

import re

import sys
import os

from functools import reduce

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Input, LSTM
#from keras.layers import Concatenate
from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import model_to_dot, plot_model
from keras.datasets import imdb, reuters
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import pickle

from numpy.random import seed

from tensorflow import set_random_seed

In [2]:
seed(2019)
set_random_seed(2019)

## Importing Data

In [3]:
df_combined = pd.read_pickle('./processed_data/df_combined.pickle')

In [4]:
df_combined.head(2)

Unnamed: 0,ds,y,vix,gold
0,2017-02-08 00:00:00,1054.03,18.879,2732.0
1,2017-02-08 01:00:00,1060.48,18.915,2693.0


## Functions

In [6]:
def split_train_test_chronological(df, ratio = 0.9, use_ratio = True, index = 1000):
    
    '''
    Input is a dataframe, and a ratio. Splits dataframe into 2 dataframes chronologically.
    Returns first dataframe up to the index of the length of the input dataframe times the input ratio, 
    and returns second dataframe of remaining elements.
    use_ratio is a flag, wether ratio should be used or indicies instead.
    
    df = input dataframe
    ratio = ratio to be used for splitting
    use_ratio = if True, use ratio, 
    index = index to split input dataframe on
    
    '''
    if use_ratio:
        size = len(df) * ratio
        size_round = round(size)

        df_train = df[0:(size_round)]
        df_test = df[size_round:]
    else:
        df_train = df[0:(index)]
        df_test = df[index:]
    return df_train, df_test

In [45]:
def slice_data(train_perc, stocks_to_trade, return_df_2):
    train_len = int(return_df.shape[0] * (train_perc))
    volume_to_trade = ['v_'+ticker for ticker in stocks_to_trade]
    for i in volume_to_trade:
        stocks_to_trade.append(i)

    train = return_df_2[stocks_to_trade][1:train_len].copy()
    train['diff'] = train[stocks_to_trade[0]] - train[stocks_to_trade[1]]
    train.reset_index(inplace=True, drop=True)

    test = return_df_2[stocks_to_trade][train_len:-400].copy()
    test['diff'] = test[stocks_to_trade[0]] - test[stocks_to_trade[1]]
    test.reset_index(inplace=True, drop=True)

    true_test = return_df_2[stocks_to_trade][-400:].copy()
    true_test['diff'] = true_test[stocks_to_trade[0]] - true_test[stocks_to_trade[1]]
    true_test.reset_index(inplace=True, drop=True)

    feature_names = volume_to_trade
    feature_names.append('diff')
    
    return train, test, true_test, feature_names


In [10]:
def make_variables(df, lags, look_back, predict_window):
    
    '''
    In order to use keras LSTM, we need to convert the input into a keras-friendly input.
    
    df = input dataframe
    lags = number of lags
    look_back = number of preceding elements to be considered
    predict_window = size of window for predictions
    
    '''
    
    features = lags + 1
    start = look_back
    stop = len(df) - lags - predict_window

    lstm_in_X = np.zeros(shape=(stop-start, look_back+1, features))
    lstm_in_Y = np.zeros(shape=(stop-start, look_back+1))

    iter_list = [num for num in range(look_back+1)][::-1]
    for i in range(start, stop):
        for index, j in enumerate(iter_list):
            X = df[i - j : i - j + lags + 1, -1]
            lstm_in_X[i - start, index] = np.ravel(X)
            Y = df[i - j + lags + 1, -1]
            lstm_in_Y[i-start, index] = Y
            
    return lstm_in_X, lstm_in_Y, features

In [53]:
def train_lstm(train_X, train_Y, lags, look_back, predict_window, lstm_nodes,
               dense_layers, dropout = 0.1, loss_type = 'hinge', optimizer_type = 'adam',
               number_epochs = 300, batch_size = 64, ):
    

    model = Sequential()
    model.add(LSTM(lstm_nodes, input_shape=(look_back+1, features)))
    model.add(Dropout(dropout))
    for nodes in dense_layers:
        model.add(Dense(nodes))
        model.add(Dropout(dropout))
    model.add(Dense(look_back + 1))
    model.compile(loss=loss_type, optimizer= optimizer_type)
    model.fit(train_X, train_Y, epochs=number_epochs, batch_size = batch_size, verbose = 1)
    
    pred_Y_train = model.predict(train_X)
    predictions = pred_Y_train[:,-1]
    actuals = train_Y[:,-1]
    
    print(model.summary())
    
    return model, dataset, train_X, train_Y, predictions, actuals

In [143]:
    predictions_test = []
    actuals_test = []
    
    for i in range(0, len(test_y) - 1):
        model.fit(train_x, train_y, 
                    epochs= 2, 
                    batch_size = 128, 
#                     validation_data=(test_x[i], test_y[i]),
                    verbose=2,
#                     callbacks=[earlystopper],
                    shuffle=False)
        pred_Y_test = model.predict(test_x)
        test_x = np.concatenate((test_x, (test_x[i].reshape(1 , 1 , 23))))
        le = len(test_y) + 1
        test_y = np.concatenate((test_y, (test_y[i].reshape(1 ,))))
        predict_test = pred_Y_test[-1,-1]
        actual_test = train_x[-1,-1]
        predictions_test.append(predict_test)
        actuals_test.append(actual_test)
        
    
    return predictions_test, actuals_test

In [48]:

def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    '''
    
    
    '''
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

## Train, Val, Test split

In [42]:
df_train, df_test = split_train_test_chronological(df_combined, ratio = .99)

In [43]:
df_train, df_val = split_train_test_chronological(df_combined, ratio = .95)

## Modeling

In [85]:
# load dataset
values = df_combined.drop('ds', axis = 1).values
# integer encode direction
# ensure all data is float
values = values.astype('float32')
# normalize features
scaler = MinMaxScaler()
scaled = scaler.fit_transform(values)
# frame as supervised learning
reframed = series_to_supervised(scaled, 7, 1)

reframed_2 = series_to_supervised(values, 7, 1)
reframed.head()

Unnamed: 0,var1(t-7),var2(t-7),var3(t-7),var1(t-6),var2(t-6),var3(t-6),var1(t-5),var2(t-5),var3(t-5),var1(t-4),...,var3(t-3),var1(t-2),var2(t-2),var3(t-2),var1(t-1),var2(t-1),var3(t-1),var1(t),var2(t),var3(t)
7,0.005247,0.179461,0.003901,0.005594,0.180275,0.001623,0.005727,0.179032,0.00274,0.005882,...,0.004491,0.005908,0.179438,0.004092,0.005895,0.179642,0.003908,0.005905,0.179642,0.003899
8,0.005594,0.180275,0.001623,0.005727,0.179032,0.00274,0.005882,0.176997,0.004602,0.006007,...,0.004092,0.005895,0.179642,0.003908,0.005905,0.179642,0.003899,0.004164,0.179642,0.002629
9,0.005727,0.179032,0.00274,0.005882,0.176997,0.004602,0.006007,0.176997,0.004491,0.005908,...,0.003908,0.005905,0.179642,0.003899,0.004164,0.179642,0.002629,0.004071,0.179642,0.00316
10,0.005882,0.176997,0.004602,0.006007,0.176997,0.004491,0.005908,0.179438,0.004092,0.005895,...,0.003899,0.004164,0.179642,0.002629,0.004071,0.179642,0.00316,0.004373,0.179642,0.003959
11,0.006007,0.176997,0.004491,0.005908,0.179438,0.004092,0.005895,0.179642,0.003908,0.005905,...,0.002629,0.004071,0.179642,0.00316,0.004373,0.179642,0.003959,0.004777,0.179642,0.003199


In [71]:
# drop columns we don't want to predict
y = reframed.iloc[:,-11].values
X = reframed.drop('var1(t)', axis =1).values
# split into train and test sets

n_train_hours = 12000
n_test_hours = 320
train_X = X[:n_train_hours,:]
train_y = y[:n_train_hours]

val_X= X[n_train_hours:-n_test_hours,]
val_y= y[n_train_hours:-n_test_hours]

test_X = X[-n_test_hours:,:]
test_y = y[-n_test_hours:]



# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
val_X = val_X.reshape((val_X.shape[0], 1, val_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
print(train_X.shape, train_y.shape,val_X.shape, val_y.shape ,test_X.shape, test_y.shape)

(12000, 1, 23) (12000,) (341, 1, 23) (341,) (320, 1, 23) (320,)


In [86]:
# drop columns we don't want to predict
y = reframed_2.iloc[:,-11].values
X = reframed_2.drop('var1(t)', axis =1).values
# split into train and test sets

n_train_hours = 12000
n_test_hours = 320
train_X = X[:n_train_hours,:]
train_y = y[:n_train_hours]

val_X= X[n_train_hours:-n_test_hours,]
val_y= y[n_train_hours:-n_test_hours]

test_X = X[-n_test_hours:,:]
test_y = y[-n_test_hours:]



# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
val_X = val_X.reshape((val_X.shape[0], 1, val_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
print(train_X.shape, train_y.shape,val_X.shape, val_y.shape ,test_X.shape, test_y.shape)

(12000, 1, 23) (12000,) (341, 1, 23) (341,) (320, 1, 23) (320,)


In [72]:
# train_perc = 0.40
# lags = 50
# look_back = 6       # 0 is a look_back of 1, 1 is a look_back of 2, etc.
# predict_window = 1

# # Neural net parameters
# lstm_neurons = 16
# dense_layers = []

# model, predictions_test, actuals_test, predictions_train, actuals_train = main(df_train, df_val, train_perc,
#                                                                                lags, look_back, 
#                                                                                predict_window, 
#                                                                                lstm_neurons, dense_layers)

In [73]:
train_X

array([[[0.00524709, 0.17946108, 0.00390136, ..., 0.00390837,
         0.17964192, 0.00389902]],

       [[0.00559421, 0.18027489, 0.00162329, ..., 0.00389902,
         0.17964192, 0.00262855]],

       [[0.00572659, 0.1790316 , 0.00274013, ..., 0.00262855,
         0.17964192, 0.0031601 ]],

       ...,

       [[0.14719364, 0.6875989 , 0.06145546, ..., 0.06514594,
         0.71517694, 0.06405188]],

       [[0.14818656, 0.6857679 , 0.06328142, ..., 0.06405188,
         0.71517694, 0.06482349]],

       [[0.1501584 , 0.67446536, 0.06471661, ..., 0.06482349,
         0.71517694, 0.06527327]]], dtype=float32)

In [74]:
val_X[0]

array([[0.14945447, 0.7185677 , 0.06664537, 0.14903203, 0.71517694,
        0.06430598, 0.14952445, 0.71517694, 0.06589244, 0.14759782,
        0.71517694, 0.06514594, 0.14829904, 0.71517694, 0.06405188,
        0.1499469 , 0.71517694, 0.06482349, 0.15080851, 0.71517694,
        0.06527327, 0.71517694, 0.06773242]], dtype=float32)

In [89]:
LSTM_model_3 = Sequential()
LSTM_model_3.add(LSTM(64, activation='relu', input_shape=(train_X.shape[1], train_X.shape[2]), dropout=0.05,recurrent_dropout=0.05))
LSTM_model_3.add(Dense(64))
# LSTM_model_3.add(LSTM(16, activation='relu'))
LSTM_model_3.add(Dense(1))
LSTM_model_3.compile(optimizer='adam', loss='mse')





In [91]:
history_3 = LSTM_model_3.fit(train_X, train_y, 
                    epochs= 100, 
                    batch_size = 128, 
                    validation_data=(val_X, val_y),
                    verbose=2,
#                     callbacks=[earlystopper],
                    shuffle=False)

Train on 12000 samples, validate on 341 samples
Epoch 1/100
 - 3s - loss: 79.1496 - val_loss: 178.8948
Epoch 2/100
 - 2s - loss: 81.0312 - val_loss: 178.0877
Epoch 3/100
 - 2s - loss: 81.0999 - val_loss: 177.9849
Epoch 4/100
 - 2s - loss: 81.1355 - val_loss: 177.9613
Epoch 5/100
 - 2s - loss: 81.1654 - val_loss: 177.9484
Epoch 6/100
 - 2s - loss: 83.4045 - val_loss: 178.3477
Epoch 7/100
 - 2s - loss: 81.1642 - val_loss: 177.9843
Epoch 8/100
 - 2s - loss: 81.2057 - val_loss: 177.9368
Epoch 9/100
 - 2s - loss: 81.2314 - val_loss: 177.9236
Epoch 10/100
 - 2s - loss: 81.2536 - val_loss: 177.9150
Epoch 11/100
 - 2s - loss: 81.2735 - val_loss: 177.9084
Epoch 12/100
 - 2s - loss: 81.3353 - val_loss: 177.9114
Epoch 13/100
 - 2s - loss: 81.3083 - val_loss: 177.8968
Epoch 14/100
 - 2s - loss: 81.3240 - val_loss: 177.8909
Epoch 15/100
 - 2s - loss: 81.3378 - val_loss: 177.8867
Epoch 16/100
 - 2s - loss: 81.3503 - val_loss: 177.8829
Epoch 17/100
 - 2s - loss: 81.3616 - val_loss: 177.8795
Epoch 18/

In [170]:
def make_refitting_window(model, train_x, train_y, test_x, test_y):
    predictions_test = []
    actuals_test = []
    
    for i in range(0, len(test_y) - 1):
        model.fit(train_x, train_y, 
                    epochs= 2, 
                    batch_size = 128, 
                    validation_data=(test_x[i].reshape(1,1,23), (test_y[i].reshape(1 ,))),
                    verbose=2,
#                     callbacks=[earlystopper],
                    shuffle=False)
        pred_Y_test = model.predict(test_x)
        train_x = np.concatenate((train_x, (test_x[i].reshape(1 , 1 , 23))))
        le = len(test_y) + 1
        train_y = np.concatenate((train_y, (test_y[i].reshape(1 ,))))
        predict_test = pred_Y_test[-1,-1]
        actual_test = train_x[-1,-1]
        predictions_test.append(predict_test)
        actuals_test.append(actual_test)
        
    
    return predictions_test, actuals_test

In [172]:
res = make_refitting_window(LSTM_model_3, train_X, train_y, val_X, val_y)

Train on 12000 samples, validate on 1 samples
Epoch 1/2
 - 2s - loss: 80.6118 - val_loss: 72.0893
Epoch 2/2
 - 2s - loss: 80.6158 - val_loss: 72.1114
Train on 12001 samples, validate on 1 samples
Epoch 1/2
 - 2s - loss: 80.6120 - val_loss: 72.1152
Epoch 2/2
 - 2s - loss: 80.6098 - val_loss: 72.1161
Train on 12002 samples, validate on 1 samples
Epoch 1/2
 - 2s - loss: 80.6073 - val_loss: 72.1169
Epoch 2/2
 - 2s - loss: 80.6053 - val_loss: 72.1174
Train on 12003 samples, validate on 1 samples
Epoch 1/2
 - 2s - loss: 80.6028 - val_loss: 72.1181
Epoch 2/2
 - 2s - loss: 80.6008 - val_loss: 72.1187
Train on 12004 samples, validate on 1 samples
Epoch 1/2
 - 2s - loss: 80.5983 - val_loss: 72.1195
Epoch 2/2
 - 2s - loss: 80.5962 - val_loss: 72.1200
Train on 12005 samples, validate on 1 samples
Epoch 1/2
 - 2s - loss: 80.5937 - val_loss: 72.1208
Epoch 2/2
 - 2s - loss: 80.5916 - val_loss: 72.1214
Train on 12006 samples, validate on 1 samples
Epoch 1/2
 - 2s - loss: 80.5891 - val_loss: 72.1222
Ep

KeyboardInterrupt: 

In [134]:
test_y[0]

39.267

In [135]:
test_y.append(test_y[0])

AttributeError: 'numpy.ndarray' object has no attribute 'append'

In [149]:
np.concatenate((test_y, (test_y[0].reshape(1 ,))))

array([39.267, 39.267, 39.267, 39.267, 39.267, 39.267, 39.267, 39.267,
       39.267, 40.298, 40.198, 39.728, 39.557, 39.488, 39.098, 38.947,
       39.438, 39.438, 39.438, 39.438, 39.438, 39.438, 39.438, 39.438,
       39.438, 39.438, 39.438, 39.438, 39.438, 39.438, 39.438, 39.438,
       39.438, 39.198, 38.427, 37.898, 37.728, 37.688, 37.717, 37.908,
       37.658, 37.658, 37.658, 37.658, 37.658, 37.658, 37.658, 37.658,
       37.658, 37.658, 37.658, 37.658, 37.658, 37.658, 37.658, 37.658,
       37.658, 37.277, 37.368, 37.807, 37.957, 37.728, 37.548, 37.747,
       38.348, 38.348, 38.348, 38.348, 38.348, 38.348, 38.348, 38.348,
       38.348, 38.348, 38.348, 38.348, 38.348, 38.348, 38.348, 38.348,
       38.348, 38.648, 38.137, 38.097, 38.147, 38.067, 37.897, 37.917,
       37.997, 37.997, 37.997, 37.997, 37.997, 37.997, 37.997, 37.997,
       37.997, 37.997, 37.997, 37.997, 37.997, 37.997, 37.997, 37.997,
       37.997, 36.867, 37.247, 36.857, 36.388, 36.657, 36.647, 37.018,
      

In [146]:
test_y

array([39.267, 39.267, 39.267, 39.267, 39.267, 39.267, 39.267, 39.267,
       39.267, 40.298, 40.198, 39.728, 39.557, 39.488, 39.098, 38.947,
       39.438, 39.438, 39.438, 39.438, 39.438, 39.438, 39.438, 39.438,
       39.438, 39.438, 39.438, 39.438, 39.438, 39.438, 39.438, 39.438,
       39.438, 39.198, 38.427, 37.898, 37.728, 37.688, 37.717, 37.908,
       37.658, 37.658, 37.658, 37.658, 37.658, 37.658, 37.658, 37.658,
       37.658, 37.658, 37.658, 37.658, 37.658, 37.658, 37.658, 37.658,
       37.658, 37.277, 37.368, 37.807, 37.957, 37.728, 37.548, 37.747,
       38.348, 38.348, 38.348, 38.348, 38.348, 38.348, 38.348, 38.348,
       38.348, 38.348, 38.348, 38.348, 38.348, 38.348, 38.348, 38.348,
       38.348, 38.648, 38.137, 38.097, 38.147, 38.067, 37.897, 37.917,
       37.997, 37.997, 37.997, 37.997, 37.997, 37.997, 37.997, 37.997,
       37.997, 37.997, 37.997, 37.997, 37.997, 37.997, 37.997, 37.997,
       37.997, 36.867, 37.247, 36.857, 36.388, 36.657, 36.647, 37.018,
      

In [150]:
list(test_y)

[39.267,
 39.267,
 39.267,
 39.267,
 39.267,
 39.267,
 39.267,
 39.267,
 39.267,
 40.298,
 40.198,
 39.728,
 39.557,
 39.488,
 39.098,
 38.947,
 39.438,
 39.438,
 39.438,
 39.438,
 39.438,
 39.438,
 39.438,
 39.438,
 39.438,
 39.438,
 39.438,
 39.438,
 39.438,
 39.438,
 39.438,
 39.438,
 39.438,
 39.198,
 38.427,
 37.898,
 37.728,
 37.688,
 37.717,
 37.908,
 37.658,
 37.658,
 37.658,
 37.658,
 37.658,
 37.658,
 37.658,
 37.658,
 37.658,
 37.658,
 37.658,
 37.658,
 37.658,
 37.658,
 37.658,
 37.658,
 37.658,
 37.277,
 37.368,
 37.807,
 37.957,
 37.728,
 37.548,
 37.747,
 38.348,
 38.348,
 38.348,
 38.348,
 38.348,
 38.348,
 38.348,
 38.348,
 38.348,
 38.348,
 38.348,
 38.348,
 38.348,
 38.348,
 38.348,
 38.348,
 38.348,
 38.648,
 38.137,
 38.097,
 38.147,
 38.067,
 37.897,
 37.917,
 37.997,
 37.997,
 37.997,
 37.997,
 37.997,
 37.997,
 37.997,
 37.997,
 37.997,
 37.997,
 37.997,
 37.997,
 37.997,
 37.997,
 37.997,
 37.997,
 37.997,
 36.867,
 37.247,
 36.857,
 36.388,
 36.657,
 36.647,
 

In [155]:
test_y.shape

(320,)