In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import math

from keras.optimizers       import Adam
from keras.models           import Sequential
from keras.layers           import Dense
from keras.layers           import LSTM

Using TensorFlow backend.


In [2]:
# Hyperparameters
look_back = 10
nodes = [32, 32, 32, 32]
epochs = 1

In [3]:
usecols = ['Dates', 
           'DJI_rv', 
           'FCHI_rv', 
           'FTSE_rv', 
           'IBEX_rv']

data = pd.read_excel('realizedlibrary01.xls', skiprows = 1, usecols = usecols)
data['Dates'] = pd.to_datetime(data['Dates'], format = '%Y%m%d')
data = data.set_index('Dates')

# Find a better solution for gaps in the data
data = data.interpolate(limit = 2) 
data = data.fillna(0)

In [4]:
#data.plot()

In [5]:
# Normalize the values
scaler = MinMaxScaler(feature_range=(0, 1))

for cols in usecols[1:]:
    data['Norm'+ cols] = scaler.fit_transform(data[cols].values.reshape(len(data[cols]),1))

data.head()

Unnamed: 0_level_0,DJI_rv,FCHI_rv,FTSE_rv,IBEX_rv,NormDJI_rv,NormFCHI_rv,NormFTSE_rv,NormIBEX_rv
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1996-01-03,1.9e-05,4.3e-05,0.0,2.1e-05,0.003361,0.009789,0.0,0.008607
1996-01-04,8e-05,8.1e-05,0.0,3.2e-05,0.014247,0.018371,0.0,0.012893
1996-01-05,4.2e-05,9.7e-05,0.0,2.8e-05,0.007496,0.021945,0.0,0.011494
1996-01-08,1.1e-05,5.8e-05,0.0,1.5e-05,0.001936,0.013153,0.0,0.00604
1996-01-09,3.9e-05,7.9e-05,0.0,2.5e-05,0.006922,0.017944,0.0,0.010036


In [6]:
def create_dataset(input_series):
    
    # Creates a training and test set that consist of time series and a 1 day ahead label
    # input series is an numpy array of shape (?,) or (?,1)
    
    input_series = input_series.reshape(len(input_series),1)

    def data_split(input_series, split = 0.80):

        # Split the data into a training and test set
        # Input series is a numpy array
        # We are using 80 percent (default) of the data as training set and 20% as the test set

        train_size  = int(len(input_series) * split)
        test_size   = len(input_series) - train_size

        train, test = input_series[0:train_size], input_series[train_size:len(input_series)]

        return train, test

    def create_timeseries(time_series, look_back):

        # dataX is the is the rolling window of past oberservations 
        # dataY becomes the the value that is one day ahead of the rolling window. 
        # This is the label/prediction for the past values

        dataX, dataY = [], []

        for i in range(1,len(time_series) - look_back - 1):

            x = time_series[i:i + look_back]
            dataX.append(x)

            y = time_series[i + look_back + 1]
            dataY.append(y)

        return np.array(dataX), np.array(dataY)

    # Create the dataset with rolling window for the training set and test set
    trainX, trainY  = create_timeseries(data_split(input_series)[0], look_back)
    testX, testY    = create_timeseries(data_split(input_series)[1], look_back)

    # Reshape input to be [samples, time steps, features]
    trainX  = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    testX   = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
    
    return trainX, trainY, testX, testY

In [7]:
def define_model(nodes):
    
    hidden_layers = len(nodes) - 1 
    
    # Define Input layer
    model = Sequential()
    model.add(LSTM(nodes[0], return_sequences = True, input_shape = (1, look_back)))
    
    # Add hidden Layers
    for i in range(hidden_layers - 1):
        model.add(LSTM(nodes[i+1], return_sequences = True))
    
    # Define last hidden layer
    model.add(LSTM(nodes[-1]))
    
    # Define output layer
    model.add(Dense(1))

    # Compile Model
    model.compile(loss = 'mean_squared_error', optimizer = Adam(lr = 0.0001))
              
    #print(model.summary())
    
    return model

In [22]:
def evaluate_model(input_series, nodes):
    
    trainX, trainY, testX, testY = create_dataset(input_series)
    
    def train_model(input_series, nodes): 
    
        model = define_model(nodes)

        model.fit(trainX, trainY, epochs = epochs, batch_size = 1, verbose = 0)

        #model.save('Output/Model_LB-'+ str(look_back) + '_EP-'+str(epochs)+'.h5')

        return model

    def calculate_error(model, trainX, trainY, testX, testY):
    
        # Make predictions 
        trainPredict    = model.predict(trainX)
        testPredict     = model.predict(testX)

        # Inverse the normalization procedure of the data
        trainY = np.reshape(trainY,(trainY.shape[0],))
        testY  = np.reshape(testY,(testY.shape[0],))

        trainPredict    = scaler.inverse_transform(trainPredict)
        trainY          = scaler.inverse_transform([trainY])
        testPredict     = scaler.inverse_transform(testPredict)
        testY           = scaler.inverse_transform([testY])

        # Calculate root mean squared error
        trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
        testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
        
        #print('Train Score: %.6f RMSE' % (trainScore))
        #print('Test Score: %.6f RMSE' % (testScore))
        
        return trainScore, testScore
    
    
    model = train_model(input_series, nodes)
        
    trainScore, testScore = calculate_error(model, trainX, trainY, testX, testY)
    
    return trainScore, testScore

In [9]:
evaluate_model(data["NormDJI_rv"].values, [8, 8, 8, 8])

[8, 8, 8, 8]
Train Score: 0.000041 RMSE
Test Score: 0.000171 RMSE


In [10]:
evaluate_model(data["NormDJI_rv"].values, [16, 16, 16])

[16, 16, 16]
Train Score: 0.000039 RMSE
Test Score: 0.000167 RMSE


In [11]:
evaluate_model(data["NormDJI_rv"].values, [32, 32])

[32, 32]
Train Score: 0.000035 RMSE
Test Score: 0.000127 RMSE


In [12]:
evaluate_model(data["NormDJI_rv"].values, [64])

[64]
Train Score: 0.000032 RMSE
Test Score: 0.000126 RMSE


In [23]:
def create_architectures(max_nodes, max_layers):

    architectures = []
    
    for layers in range(1,max_layers):
        
        for nodes in range(5, max_nodes, 5):
    
            node_structure = []

            for i in range(layers):

                    node_structure.append(int(np.ceil(nodes - nodes/max_layers*i)))

            architectures.append(node_structure)

    return architectures

In [24]:
def initialize_training_series(data, max_nodes, max_layers):
    
    architectures = create_architectures(100,5)
    
    results = {"Architecture": ["Train Score RSME", "Test Score RSME"]}
    
    for architecture in architectures:
        
        trainScore, testScore = evaluate_model(data, architecture)
        
        results[str(architecture)] = [trainScore, testScore]
        
    print(results)

In [25]:
initialize_training_series(data["NormDJI_rv"].values, 20, 3)

KeyboardInterrupt: 