In [1]:
import plaidml.keras
plaidml.keras.install_backend()

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

import statsmodels.api as sm
import statsmodels.tsa.api as smt
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.tsa.stattools as ts
from statsmodels.tsa.stattools import adfuller

from fbprophet import Prophet

import math

# import pyflux as pf

import warnings
warnings.filterwarnings(action='once')

import itertools

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import TimeSeriesSplit

from pandas.plotting import autocorrelation_plot

import re

import sys
import os

from functools import reduce

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Input, LSTM
#from keras.layers import Concatbenate
from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import model_to_dot, plot_model
from keras.datasets import imdb, reuters
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop
from keras.models import load_model

import datetime as dt

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import pickle

from numpy.random import seed

from tensorflow import set_random_seed


builtin type EagerTensor has no __module__ attribute


inspect.getargspec() is deprecated since Python 3.0, use inspect.signature() or inspect.getfullargspec()


unclosed file <_io.TextIOWrapper name='/Users/robinleoknauth/.keras/keras.json' mode='r' encoding='UTF-8'>



In [2]:
seed(2019)
set_random_seed(2019)

In [3]:
df_combined = pd.read_pickle('./processed_data/df_combined.pickle')

df_combined.tail(2)

Unnamed: 0,ds,y,vix,gold
12666,2019-06-01 22:00:00,8568.81,30.447,3561.58
12667,2019-06-01 23:00:00,8560.63,30.447,3569.34


In [4]:
df_daily = df_combined[df_combined.ds.dt.hour == 0]

In [6]:
df_daily.drop(['vix', 'gold'], axis = 1, inplace = True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [7]:
df_daily.reset_index(inplace=True, drop = True)

In [8]:
df_daily.head(3)

Unnamed: 0,ds,y
0,2017-02-08,1054.03
1,2017-02-09,1059.04
2,2017-02-10,986.42


In [9]:
def split_train_test_chronological(df, ratio = 0.9, use_ratio = True, index = 1000):
    
    '''
    Input is a dataframe, and a ratio. Splits dataframe into 2 dataframes chronologically.
    Returns first dataframe up to the index of the length of the input dataframe times the input ratio, 
    and returns second dataframe of remaining elements.
    use_ratio is a flag, wether ratio should be used or indicies instead.
    
    df = input dataframe
    ratio = ratio to be used for splitting
    use_ratio = if True, use ratio, 
    index = index to split input dataframe on
    
    '''
    if use_ratio:
        size = len(df) * ratio
        size_round = round(size)

        df_train = df[0:(size_round)]
        df_test = df[size_round:]
    else:
        df_train = df[0:(index)]
        df_test = df[index:]
    return df_train, df_test

In [10]:
def make_variables(df, lags, look_back, predict_window):
    
    '''
    In order to use keras LSTM, we need to convert the input into a keras-friendly input.
    
    df = input dataframe
    lags = number of lags
    look_back = number of preceding elements to be considered
    predict_window = size of window for predictions
    
    '''
    
    features = lags + 1
    start = look_back
    stop = len(df) - lags - predict_window

    lstm_in_X = np.zeros(shape=(stop-start, look_back+1, features))
    lstm_in_Y = np.zeros(shape=(stop-start, look_back+1))

    iter_list = [num for num in range(look_back+1)][::-1]
    for i in range(start, stop):
        for index, j in enumerate(iter_list):
            X = df[i - j : i - j + lags + 1, -1]
            lstm_in_X[i - start, index] = np.ravel(X)
            Y = df[i - j + lags + 1, -1]
            lstm_in_Y[i-start, index] = Y
            
    return lstm_in_X, lstm_in_Y, features

In [11]:
def train_lstm(train_X, train_Y, lags, look_back, predict_window, lstm_nodes,
               dense_layers, dropout = 0.1, loss_type = 'hinge', optimizer_type = 'adam',
               number_epochs = 300, batch_size = 24, ):
    
    
    model = Sequential()
    model.add(LSTM(lstm_nodes, input_shape=(look_back+1, features)))
    model.add(Dropout(dropout))
    for nodes in dense_layers:
        model.add(Dense(nodes))
        model.add(Dropout(dropout))
    model.add(Dense(look_back + 1))
    model.compile(loss=loss_type, optimizer= optimizer_type)
    model.fit(train_X, train_Y, epochs=number_epochs, batch_size = batch_size, verbose = 1)
    
    pred_Y_train = model.predict(train_X)
    predictions = pred_Y_train[:,-1]
    actuals = train_Y[:,-1]
    
    print(model.summary())
    
    return model, dataset, train_X, train_Y, predictions, actuals

In [12]:
def make_rolling_window(model, train_x, train_y, test_x, test_y, batch_size = 218, epochs = 2):
    predictions_test = []
    actuals_test = []
    yhats = []
    ys = []
    
    dimension = train_x.shape[2]
    
    for i in range(0, len(test_y) - 1):
        model.fit(train_x, train_y, 
                    epochs= epochs, 
                    batch_size = batch_size, 
                    validation_data=(test_x[i].reshape(1,1,dimension), (test_y[i].reshape(1 ,))),
                    verbose=2,
#                     callbacks=[earlystopper],
                    shuffle=False)
        pred_Y_test = model.predict(test_x[i].reshape(1,1,dimension))
        train_x = np.concatenate((train_x, (test_x[i].reshape(1 , 1 , dimension))))

        train_y = np.concatenate((train_y, (test_y[i].reshape(1 ,))))
        predict_test = pred_Y_test[-1,-1]
        actual_test = train_y[-1]
        predictions_test.append(predict_test)
        actuals_test.append(actual_test)
        yhats.append(pred_Y_test)
        ys.append(test_y[i])
        
    return predictions_test, actuals_test, yhats, ys

In [13]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    '''
    
    
    '''
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [65]:
# load dataset
values = df_daily.drop('ds', axis = 1).values
# values = df_combined.drop('ds', axis = 1).values
# integer encode direction
# ensure all data is float
values = values.astype('float32')
# normalize features
# scaler = MinMaxScaler()
scaler = StandardScaler()
scaled = scaler.fit_transform(values)
# frame as supervised learning
reframed = series_to_supervised(scaled, 7, 1)
reframed.head()
# reframed_2 = series_to_supervised(values, 7, 1)
# reframed_2.head()

# drop columns we don't want to predict
y = reframed.iloc[:,-1].values
X = reframed.drop('var1(t)', axis =1).values
# split into train and test sets

n_train_days = 500
n_test_days = 10
train_X = X[:n_train_days,:]
train_y = y[:n_train_days]

val_X= X[n_train_days:-n_test_days,]
val_y= y[n_train_days:-n_test_days]


test_X = X[-n_test_days:,:]
test_y = y[-n_test_days:]



# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
val_X = val_X.reshape((val_X.shape[0], 1, val_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

train_and_val_X = np.concatenate((train_X, val_X), axis = 0)

train_and_val_y = np.concatenate((train_y, val_y), axis = 0)

print(train_X.shape, train_y.shape,val_X.shape, val_y.shape ,test_X.shape, test_y.shape)

(500, 1, 7) (500,) (11, 1, 7) (11,) (10, 1, 7) (10,)


In [80]:
reframed.tail()

Unnamed: 0,var1(t-7),var1(t-6),var1(t-5),var1(t-4),var1(t-3),var1(t-2),var1(t-1),var1(t)
523,-1.044255,-1.070802,-1.063028,-1.055287,-1.061987,-1.039832,-1.082549,-1.121784
524,-1.070802,-1.063028,-1.055287,-1.061987,-1.039832,-1.082549,-1.121784,-1.004746
525,-1.063028,-1.055287,-1.061987,-1.039832,-1.082549,-1.121784,-1.004746,-0.914822
526,-1.055287,-1.061987,-1.039832,-1.082549,-1.121784,-1.004746,-0.914822,-0.547356
527,-1.061987,-1.039832,-1.082549,-1.121784,-1.004746,-0.914822,-0.547356,0.435893


In [67]:
scaler = StandardScaler()
scaled = scaler.fit_transform(np.array(df_daily.y[:n_train_days]).reshape(-1,1))

In [68]:
LSTM_model_daily_1 = Sequential()
LSTM_model_daily_1.add(LSTM(64, activation='relu', input_shape=(train_X.shape[1], train_X.shape[2]), dropout=0.20,recurrent_dropout=0.20))
# LSTM_model_daily_1.add(LSTM(128))
LSTM_model_daily_1.add(Dense(32))
LSTM_model_daily_1.add(Dense(32))
# LSTM_model_daily_1.add(LSTM(16, activation='relu'))
LSTM_model_daily_1.add(Dense(1))
LSTM_model_daily_1.compile(optimizer='adam', loss='mse')


In [69]:
history_daily_1 = LSTM_model_daily_1.fit(train_X, train_y, 
                    epochs= 100, 
                    batch_size = 32, 
                    validation_data=(val_X, val_y),
                    verbose=2,
#                     callbacks=[earlystopper],
                    shuffle=False)

Train on 500 samples, validate on 11 samples
Epoch 1/100
 - 0s - loss: 0.8445 - val_loss: 0.5689
Epoch 2/100
 - 0s - loss: 0.4827 - val_loss: 0.2287
Epoch 3/100
 - 0s - loss: 0.1896 - val_loss: 0.0098
Epoch 4/100
 - 0s - loss: 0.1160 - val_loss: 0.0091
Epoch 5/100
 - 0s - loss: 0.1304 - val_loss: 0.0152
Epoch 6/100
 - 0s - loss: 0.0932 - val_loss: 0.0228
Epoch 7/100
 - 0s - loss: 0.1130 - val_loss: 0.0078
Epoch 8/100
 - 0s - loss: 0.1126 - val_loss: 0.0162
Epoch 9/100
 - 0s - loss: 0.1039 - val_loss: 0.0061
Epoch 10/100
 - 0s - loss: 0.0817 - val_loss: 0.0114
Epoch 11/100
 - 0s - loss: 0.0880 - val_loss: 0.0055
Epoch 12/100
 - 0s - loss: 0.0892 - val_loss: 0.0169
Epoch 13/100
 - 0s - loss: 0.0793 - val_loss: 0.0117
Epoch 14/100
 - 0s - loss: 0.0831 - val_loss: 0.0102
Epoch 15/100
 - 0s - loss: 0.0765 - val_loss: 0.0123
Epoch 16/100
 - 0s - loss: 0.0833 - val_loss: 0.0098
Epoch 17/100
 - 0s - loss: 0.0831 - val_loss: 0.0032
Epoch 18/100
 - 0s - loss: 0.0837 - val_loss: 0.0220
Epoch 19/1

In [70]:
res_1 = make_rolling_window(LSTM_model_daily_1, train_X, train_y, val_X, val_y, batch_size = 32, epochs = 2)

Train on 500 samples, validate on 1 samples
Epoch 1/2
 - 0s - loss: 0.0619 - val_loss: 0.1174
Epoch 2/2
 - 0s - loss: 0.0578 - val_loss: 0.0116
Train on 501 samples, validate on 1 samples
Epoch 1/2
 - 0s - loss: 0.0538 - val_loss: 0.0808
Epoch 2/2
 - 0s - loss: 0.0732 - val_loss: 0.0149
Train on 502 samples, validate on 1 samples
Epoch 1/2
 - 0s - loss: 0.0702 - val_loss: 0.1235
Epoch 2/2
 - 0s - loss: 0.0565 - val_loss: 0.0255
Train on 503 samples, validate on 1 samples
Epoch 1/2
 - 0s - loss: 0.0555 - val_loss: 0.0586
Epoch 2/2
 - 0s - loss: 0.0644 - val_loss: 0.0036
Train on 504 samples, validate on 1 samples
Epoch 1/2
 - 0s - loss: 0.0601 - val_loss: 0.0946
Epoch 2/2
 - 0s - loss: 0.0775 - val_loss: 0.0219
Train on 505 samples, validate on 1 samples
Epoch 1/2
 - 0s - loss: 0.0623 - val_loss: 0.0670
Epoch 2/2
 - 0s - loss: 0.0642 - val_loss: 0.0045
Train on 506 samples, validate on 1 samples
Epoch 1/2
 - 0s - loss: 0.0522 - val_loss: 0.0961
Epoch 2/2
 - 0s - loss: 0.0527 - val_loss:

In [71]:
predicts_1 = scaler.inverse_transform(np.array(res_1[0]))
actuals_1 = scaler.inverse_transform(np.array(res_1[1]))

In [72]:
rmse = math.sqrt(mean_squared_error(predicts_1, actuals_1))
print('Validation RMSE: %.3f' % rmse)

Validation RMSE: 418.505


In [73]:
res_test = make_rolling_window(LSTM_model_daily_1, train_and_val_X, train_and_val_y, test_X, test_y, batch_size = 32, epochs = 2)


Train on 511 samples, validate on 1 samples
Epoch 1/2
 - 0s - loss: 0.0629 - val_loss: 0.1142
Epoch 2/2
 - 0s - loss: 0.0729 - val_loss: 0.0095
Train on 512 samples, validate on 1 samples
Epoch 1/2
 - 0s - loss: 0.0547 - val_loss: 0.0913
Epoch 2/2
 - 0s - loss: 0.0740 - val_loss: 0.0064
Train on 513 samples, validate on 1 samples
Epoch 1/2
 - 0s - loss: 0.0586 - val_loss: 0.0847
Epoch 2/2
 - 0s - loss: 0.0612 - val_loss: 0.0017
Train on 514 samples, validate on 1 samples
Epoch 1/2
 - 0s - loss: 0.0716 - val_loss: 0.1111
Epoch 2/2
 - 0s - loss: 0.0825 - val_loss: 3.3939e-04
Train on 515 samples, validate on 1 samples
Epoch 1/2
 - 0s - loss: 0.0814 - val_loss: 0.2064
Epoch 2/2
 - 0s - loss: 0.1046 - val_loss: 3.6296e-04
Train on 516 samples, validate on 1 samples
Epoch 1/2
 - 0s - loss: 0.0709 - val_loss: 0.1426
Epoch 2/2
 - 0s - loss: 0.0882 - val_loss: 0.0036
Train on 517 samples, validate on 1 samples
Epoch 1/2
 - 0s - loss: 0.0585 - val_loss: 0.0275
Epoch 2/2
 - 0s - loss: 0.0662 - v

In [74]:
predicts_1 = scaler.inverse_transform(np.array(res_test[0]))
actuals_1 = scaler.inverse_transform(np.array(res_test[1]))
rmse = math.sqrt(mean_squared_error(predicts_1, actuals_1))
print('TEST RMSE: %.3f' % rmse)

TEST RMSE: 458.633
