### Aim: practice using RNNs

# LSTM approach

In [1]:
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, GRU, LSTM  # Bidirectional
from keras.optimizers import SGD
from sklearn.metrics import mean_squared_error
from keras.backend import tensorflow_backend as tb
from sklearn.model_selection import train_test_split
import datetime as dt
import IPython
import keras as k
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import tensorflow as tf
plt.style.use('fivethirtyeight')
print("Session setup successful")

Using TensorFlow backend.


Session setup successful


In [2]:
# Allow gradual memory alloc for GPU use
random_seed = 1 # Random seed included for reproducability/consistency reasons when learning.
np.random.seed(random_seed)
if tb._SESSION is None:
    if not os.environ.get('OMP_NUM_THREADS'):
        config = tf.ConfigProto(allow_soft_placement=True)
    else:
        num_thread = int(os.environ.get('OMP_NUM_THREADS'))
        config = tf.ConfigProto(intra_op_parallelism_threads=num_thread, allow_soft_placement=True)
    config.gpu_options.allow_growth=True
    _SESSION = tf.Session(config=config)
session = _SESSION
print('Gradual memory alloc setup successful!')

Gradual memory alloc setup successful!


In [3]:
# Some functions to help out with
def plot_predictions(test,predicted,stock='IBM') -> None:
    plt.plot(test, color='red',label='Real IBM Stock Price')
    plt.plot(predicted, color='blue',label='Predicted IBM Stock Price')
    plt.title('IBM Stock Price Prediction')
    plt.xlabel('Time')
    plt.ylabel(stock + ' Stock Price')
    plt.legend()
    plt.show()

def return_rmse(test,predicted) -> None:
    rmse = math.sqrt(mean_squared_error(test, predicted))
    print("The root mean squared error is {}.".format(rmse))

In [4]:
stock_data_path = '../data/stock-time-series-20050101-to-20171231/'
os.listdir(stock_data_path)[:5]

['AABA_2006-01-01_to_2018-01-01.csv',
 'AAPL_2006-01-01_to_2018-01-01.csv',
 'all_stocks_2006-01-01_to_2018-01-01.csv',
 'all_stocks_2017-01-01_to_2018-01-01.csv',
 'AMZN_2006-01-01_to_2018-01-01.csv']

In [5]:
# First, we get the data
dataset = pd.read_csv(stock_data_path + 'IBM_2006-01-01_to_2018-01-01.csv', index_col='Date', parse_dates=['Date'])
print("Number of entries in dataframe:", len(dataset))
dataset.head(3)

Number of entries in dataframe: 3020


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2006-01-03,82.45,82.55,80.81,82.06,11715200,IBM
2006-01-04,82.2,82.5,81.33,81.95,9840600,IBM
2006-01-05,81.4,82.9,81.0,82.5,7213500,IBM


In [6]:
print(dataset.columns[:])
print(dataset.columns[1:5])

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Name'], dtype='object')
Index(['High', 'Low', 'Close', 'Volume'], dtype='object')


In [7]:
# # Siddarth: Checking for missing values (K: ??? This doesn't do any null checking AFAIK!)
# training_set = dataset[:'2016'].iloc[:,3:4].values
# test_set = dataset['2017':].iloc[:,3:4].values

In [8]:
training_set[:3]

NameError: name 'training_set' is not defined

In [None]:
# # # PROJECT PARAMETERS # # #
features = ['High', 'Low', 'Close']
num_prev_elems = 60
min_max_scale = (0.05, 0.95)

In [None]:
training_set = dataset[:'2016'].loc[:, features].values
test_set = dataset['2017':].loc[:, features].values

In [None]:
dataset["Close"][:'2016'].plot(figsize=(16,4),legend=True)
dataset["Close"]['2017':].plot(figsize=(16,4),legend=True)
plt.legend(['Training set (Before 2017)','Test set (2017 and beyond)'])
plt.title('IBM stock price (close)')
plt.show()

In [None]:
# Scaling the training set
sc = MinMaxScaler(feature_range=min_max_scale)
training_set_scaled = sc.fit_transform(training_set)
training_set_scaled[:5]

In [None]:
# sc??

In [None]:
# set up x_train, y_train
x_train = []
y_train = []
for i in range(num_prev_elems, len(training_set)):  # training set is 2759 elems long
    x_train.append(
        training_set_scaled[i-num_prev_elems:i, :])
    y_train.append(
        training_set_scaled[i,:])
x_train, y_train = np.array(x_train), np.array(y_train)
print('xtrain shape:', x_train.shape)
print('ytrain shape:', y_train.shape)

In [None]:
x_train.shape

In [None]:
# Reshaping x_train for efficient modelling
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))  # <--TODO: what happens here?!
print('new xtrain shape:', x_train.shape)
print("As seen above, RESHAPING IS USELESS")

In [None]:
# Build LTSM model
regressor = Sequential()
regressor.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])))
regressor.add(Dropout(0.5))
regressor.add(LSTM(units=40, return_sequences=True))
regressor.add(Dropout(0.5))
# regressor.add(LSTM(units=30, return_sequences=True))
# regressor.add(Dropout(0.5))
# regressor.add(LSTM(units=20))
# regressor.add(Dropout(0.5))
# regressor.add(Flatten())
regressor.add(Dense(units=1))
# model variables
ltsm_batch_size = 32
ltsm_epochs = 5
# Some other optimizers include: RMSprop,Adagrad,Adadelta,Adam
ltsm_optimizer = k.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0) # Siddarth
# ltsm_optimizer = k.optimizers.Adam()
regressor.compile(optimizer=ltsm_optimizer, loss='mean_squared_error')  # TODO: figure out other loss functions. Hinge?
ltsm_runtime_name = 'LTSM_504030_ADAM' \
    + '_BS' + str(ltsm_batch_size) \
    + '_epochs' + str(ltsm_epochs) \
    + '_TensorboardStopEarly' \
    + '_' + str(dt.datetime.now()).replace(":","H",1).replace(":","M",1)
# Declare callbacks  # Some metrics to MONITER include 'loss' and 'val_loss'
stopearly = k.callbacks.EarlyStopping(monitor='loss', min_delta=0.0009, patience=1, verbose=1,
                                      mode='auto', baseline=None, restore_best_weights=False)
tensorboard_ltsm = k.callbacks.TensorBoard(log_dir='../tensorboard/' + ltsm_runtime_name,
                                           histogram_freq=0,batch_size=ltsm_batch_size, write_graph=True, write_images=True)
regressor.summary()

In [None]:
# # Visualize model (Comment out as needed)
k.utils.plot_model(regressor, to_file='../visualizations/' + ltsm_runtime_name +'.png', show_shapes=True)
IPython.display.Image('../visualizations/' + ltsm_runtime_name + '.png')

In [None]:
regressor.summary()

In [None]:
# Train model
history = regressor.fit(x_train, y_train, epochs=ltsm_epochs, batch_size=ltsm_batch_size,  # shuffle=True,
                        verbose=1, callbacks=[stopearly, tensorboard_ltsm])

In [None]:
# Prep test set similar to train set
# The following has been done so forst 60 entires of test set have 60 previous values which is impossible to get unless
# we take the whole 
# 'High' attribute data for processing
dataset_total = pd.concat(
    (dataset["Close"][:'2016'],
     dataset["Close"]['2017':]),axis=0)
print('len(datasettotal)', len(dataset_total))
print('len(test_set)', len(test_set))
print()
# Get inputs as _
inputs = dataset_total[ len(dataset_total) - len(test_set) - num_prev_elems: ].values
print("Sample of inputs:", inputs[:5])
# Reshape inputs
inputs = inputs.reshape(-1,1)
print("Sample of inputs reshaped:", inputs[:5])
# Squishify inputs between 0 and 1
inputs  = sc.transform(inputs)
print("Samples of inputs transformed:", inputs[:5])

In [None]:
# Preparing X_test and predicting the prices
X_test = []
for i in range(num_prev_elems, 311):  # How did this value of 311 get calculated?
    X_test.append(
        inputs[i - num_prev_elems:i, 0]
    )
X_test = np.array(X_test)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
predicted_stock_price = regressor.predict(X_test)
predicted_stock_price = sc.inverse_transform(predicted_stock_price)

In [None]:
# Visualizing the results for LSTM
plot_predictions(test_set, predicted_stock_price)

In [None]:
# Evaluating our model
return_rmse(test_set, predicted_stock_price)

---

## Gated Recurrent Units
In simple words, the GRU unit does not have to use a memory unit to control the flow of information like the LSTM unit. It can directly makes use of the all hidden states without any control. GRUs have fewer parameters and thus may train a bit faster or need less data to generalize. But, with large data, the LSTMs with higher expressiveness may lead to better results.

They are almost similar to LSTMs except that they have two gates: reset gate and update gate. Reset gate determines how to combine new input to previous memory and update gate determines how much of the previous state to keep. Update gate in GRU is what input gate and forget gate were in LSTM. We don't have the second non linearity in GRU before calculating the outpu, .neither they have the output gate.

In [None]:
# The GRU architecture (original architecture has 20% dropout b/w layers)
regressorGRU = Sequential()
regressorGRU.add(GRU(units=45, return_sequences=True,
                     input_shape=(x_train.shape[1],1),
                     activation='tanh'))  # TODO: he specifies tanh function, but whats the default? other options are..?
regressorGRU.add(GRU(units=30, return_sequences=True,  # TODO: why input shape thru all levels? necessary?
                     input_shape=(x_train.shape[1],1),
                     activation='tanh'))
regressorGRU.add(GRU(units=20, return_sequences=True,
                     input_shape=(x_train.shape[1],1),
                     activation='tanh'))
regressorGRU.add(GRU(units=10, activation='tanh'))
regressorGRU.add(Dense(units=1))
# Model variables
gru_batch_size = 32 # keras.losses.hinge(y_true, y_pred)

gru_epochs = 6
# model_optimizer = SGD(lr=0.005, decay=1e-7, momentum=0.95, nesterov=False)  # Default
# gru_optimizer = k.optimizers.Adadelta()  # First run of Adadelta was S L O W compared to SGD. Terrible error on a 3 epoch run. Not great.
# gru_optimizer = k.optimizers.Adam() # Great first run. Small error, small lag behind actual data
# gru_optimizer = k.optimizers.Adagrad()  # OK. Good adherence to small changes, but error larger than liked.
gru_optimizer = k.optimizers.RMSprop() # Great! great adherence, low error. a good contender. 
# gru_optimizer = SGD()  # Standard out. Error normal, not bad but not great. 

regressorGRU.compile(optimizer=gru_optimizer, loss=k.losses.mean_squared_error)
# Callbacks
stopearly = k.callbacks.EarlyStopping(monitor='loss', min_delta=0.0009, patience=1, verbose=1,
                                      mode='auto', baseline=None, restore_best_weights=False)
gru_runtime_name = 'GRU_45302010_RMSPROP' \
+ '_BS' + str(gru_batch_size) \
+ '_epochs' + str(gru_epochs) \
+ '_TensorboardStopEarly' \
+ '_' + str(dt.datetime.now()).replace(":","H",1).replace(":","M",1)
tensorboard_gru = k.callbacks.TensorBoard(log_dir='../tensorboard/' + gru_runtime_name, histogram_freq=0, batch_size=gru_batch_size, write_graph=True, write_images=True)

In [None]:
# # Visualize model
# k.utils.plot_model(regressorGRU, to_file='../visualizations/' + gru_runtime_name +'.png', show_shapes=True)
# IPython.display.Image('../visualizations/' + gru_runtime_name +'.png')

In [None]:
# Train model
history = regressorGRU.fit(x_train, y_train, epochs=gru_epochs, batch_size=gru_batch_size,  # shuffle=True,
                        verbose=1, callbacks=[stopearly, tensorboard_gru])

In [None]:
# Preparing X_test and predicting the prices
X_test = []
for i in range(num_prev_elems, 311):
    X_test.append(inputs[i - num_prev_elems:i, 0])
# print("X_test after appendsc:", X_test)
X_test = np.array(X_test)
# TODO: below: decode reshape
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
GRU_predicted_stock_price = regressorGRU.predict(X_test)
GRU_predicted_stock_price = sc.inverse_transform(GRU_predicted_stock_price)

In [None]:
# Visualizing the results for GRU
plot_predictions(test_set, GRU_predicted_stock_price)

In [None]:
# Evaluating GRU
return_rmse(test_set, GRU_predicted_stock_price)

---
## Sequence Generation (Siddarth)
Here, I will generate a sequence using just initial (60) values instead of using last (60) values for every new prediction. **Due to doubts in various comments about predictions making use of test set values, I have decided to include sequence generation.** The above models make use of test set so it is using last (60) true values for predicting the new value(I will call it a benchmark). This is why the error is so low. Strong models can bring similar results like above models for sequences too but they require more than just data which has previous values. In case of stocks, we need to know the sentiments of the market, the movement of other stocks and a lot more. So, don't expect a remotely accurate plot. The error will be great and the best I can do is generate the trend similar to the test set. A GRU model is used for predictions. 

In [None]:
len(training_set)

In [None]:
# Preparing sequence data
initial_sequence = x_train[2708,:]  # TODO: how did this value of 2708 get generated?
sequence = []
for i in range(251):  # TODO: how did this value of 251 get generated?
    new_prediction = regressorGRU.predict(
        initial_sequence.reshape(initial_sequence.shape[1],initial_sequence.shape[0],1)
    )
    initial_sequence = initial_sequence[1:]
    initial_sequence = np.append(initial_sequence,new_prediction,axis=0)
    sequence.append(new_prediction)
sequence = sc.inverse_transform(np.array(sequence).reshape(251,1))  # What does the array look like before reshape?

In [None]:
# Visualizing the sequence
plot_predictions(test_set,sequence)

In [None]:
# Evaluating the sequence
return_rmse(test_set,sequence)

So, GRU works better than LSTM in this case. Bidirectional LSTM is also a good way so make the model stronger. But this may vary for different data sets. **Applying both LSTM and GRU together gave even better results.** 