# Implementing a Naive LSTM

Here we present an example of a one-dimensional LSTM used to predict the stock market. Note that this script was modified from the original code presented in https://www.kaggle.com/taronzakaryan/stock-prediction-lstm-using-pytorch/data. 

First, we load all the required packages.

In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.autograd import Variable
from typing import Optional, List, Tuple
from enum import IntEnum
import os

# Run on the first CUDA device when one is available, otherwise on the CPU.
# The device string must not contain whitespace ("cuda:0", not "cuda: 0").
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Now, let's code an LSTM!

In [None]:
class Dim(IntEnum):
    """Axis positions of a ``(batch, seq, feature)`` tensor."""

    batch = 0    # position of the sample inside the mini-batch
    seq = 1      # time step
    feature = 2  # feature / hidden channel
 
class NaiveLSTM(nn.Module):
    """Single-layer LSTM written out gate by gate.

    Every gate owns an input-to-hidden matrix of shape
    ``(input_sz, hidden_sz)``, a hidden-to-hidden matrix of shape
    ``(hidden_sz, hidden_sz)`` and a bias of shape ``(hidden_sz,)``.
    """

    def __init__(self, input_sz: int, hidden_sz: int):
        super().__init__()
        self.input_size = input_sz
        self.hidden_size = hidden_sz
        # torch.empty allocates uninitialised storage; init_weights() below
        # fills every parameter before it is used.
        # input gate
        self.W_ii = nn.Parameter(torch.empty(input_sz, hidden_sz))
        self.W_hi = nn.Parameter(torch.empty(hidden_sz, hidden_sz))
        self.b_i = nn.Parameter(torch.empty(hidden_sz))
        # forget gate
        self.W_if = nn.Parameter(torch.empty(input_sz, hidden_sz))
        self.W_hf = nn.Parameter(torch.empty(hidden_sz, hidden_sz))
        self.b_f = nn.Parameter(torch.empty(hidden_sz))
        # input info (candidate cell state)
        self.W_ig = nn.Parameter(torch.empty(input_sz, hidden_sz))
        self.W_hg = nn.Parameter(torch.empty(hidden_sz, hidden_sz))
        self.b_g = nn.Parameter(torch.empty(hidden_sz))
        # output gate
        self.W_io = nn.Parameter(torch.empty(input_sz, hidden_sz))
        self.W_ho = nn.Parameter(torch.empty(hidden_sz, hidden_sz))
        self.b_o = nn.Parameter(torch.empty(hidden_sz))

        self.init_weights()

    def init_weights(self):
        """Xavier-uniform initialise the matrices and zero the biases."""
        for p in self.parameters():
            if p.dim() >= 2:
                nn.init.xavier_uniform_(p)
            else:
                nn.init.zeros_(p)

    def forward(self, x: torch.Tensor,
                init_states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
                ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Run the LSTM over a whole sequence.

        Args:
            x: input of shape (batch, sequence, feature).
            init_states: optional ``(h_0, c_0)`` pair; zeros are used when
                omitted.

        Returns:
            ``(hidden_seq, (h_T, c_T))`` where ``hidden_seq`` has shape
            (batch, sequence, hidden) and ``h_T`` / ``c_T`` are the states
            after the last time step.
        """
        bs, seq_sz, _ = x.size()

        if init_states is None:
            # One state row per sample, on the same device as the input and
            # with the same dtype as the weights.
            h_t = torch.zeros(bs, self.hidden_size,
                              device=x.device, dtype=self.W_hi.dtype)
            c_t = torch.zeros(bs, self.hidden_size,
                              device=x.device, dtype=self.W_hi.dtype)
        else:
            h_t, c_t = init_states

        hidden_seq = []
        for t in range(seq_sz):  # iterate over the time steps
            x_t = x[:, t, :]
            i_t = torch.sigmoid(x_t @ self.W_ii + h_t @ self.W_hi + self.b_i)
            f_t = torch.sigmoid(x_t @ self.W_if + h_t @ self.W_hf + self.b_f)
            g_t = torch.tanh(x_t @ self.W_ig + h_t @ self.W_hg + self.b_g)
            o_t = torch.sigmoid(x_t @ self.W_io + h_t @ self.W_ho + self.b_o)
            c_t = f_t * c_t + i_t * g_t
            h_t = o_t * torch.tanh(c_t)
            hidden_seq.append(h_t)

        if hidden_seq:
            # Stack along axis 1 to obtain (batch, sequence, hidden) directly.
            hidden_seq = torch.stack(hidden_seq, dim=1)
        else:
            # Zero-length sequence: return an empty but correctly shaped tensor.
            hidden_seq = torch.zeros(bs, 0, self.hidden_size,
                                     device=x.device, dtype=self.W_hi.dtype)
        return hidden_seq, (h_t, c_t)

Next we define the network and generate a random input (time series).

In [None]:
hidden_size = 125
input_size = 50
sequence_len = 150
high = 1000000  # number of rows in the random embedding table

# Generate a random input: one sequence of `sequence_len` random indices,
# each mapped to an `input_size`-dimensional embedding vector.
test_idx = torch.randint(high=high, size=(1, sequence_len)).to(device)
embeddings = nn.Embedding(high, input_size).to(device)
test_embeddings = embeddings(test_idx).to(device)
# Copy to host memory first: Tensor.numpy() is only defined for CPU tensors.
test_idx_np = test_idx.detach().cpu().numpy()

Next we build our LSTM model and define a step by hand.

In [None]:
# Keep the model parameters on the same device as the input embeddings.
lstm = NaiveLSTM(input_size, hidden_size).to(device)

def lstm_step(x_t, h_t, c_t, W_ii, W_hi, b_i, W_if, W_hf, b_f,
              W_ig, W_hg, b_g, W_io, W_ho, b_o, use_forget_gate=False):
    """Advance an LSTM cell by a single time step.

    ``x_t`` is the current input, ``h_t`` / ``c_t`` the previous hidden and
    cell state. Each gate is given as an input matrix, a hidden matrix and a
    bias. With ``use_forget_gate=False`` the previous cell state is carried
    over unscaled; otherwise it is multiplied by the forget gate.

    Returns the new ``(h_t, c_t)`` pair.
    """
    def gate(activation, w_in, w_hid, bias):
        # Shared affine form of every gate, followed by its non-linearity.
        return activation(x_t @ w_in + h_t @ w_hid + bias)

    write_gate = gate(torch.sigmoid, W_ii, W_hi, b_i)
    candidate = gate(torch.tanh, W_ig, W_hg, b_g)
    read_gate = gate(torch.sigmoid, W_io, W_ho, b_o)

    if use_forget_gate:
        carried = gate(torch.sigmoid, W_if, W_hf, b_f) * c_t
    else:
        carried = c_t

    next_cell = carried + write_gate * candidate
    next_hidden = read_gate * torch.tanh(next_cell)
    return next_hidden, next_cell

Case 1: we set `use_forget_gate=False`. In the plot below, one can see that instead of decaying, the gradient keeps on accumulating! The gradient behaves this way because of the update rule
$c_t = c_{t-1} + i_t \odot g_t$.

In [None]:
# Case 1: record, for every time step t, the norm of d|h_t|/dh_0 with the
# forget gate disabled.
# The initial states are created directly on `device` so that they match the
# inputs while remaining leaf tensors (only leaf tensors populate .grad).
h_0, c_0 = (torch.zeros(hidden_size, requires_grad=True, device=device),
            torch.zeros(hidden_size, requires_grad=True, device=device))
grads = []
h_t, c_t = h_0, c_0

for t in range(sequence_len):
    h_t, c_t = lstm_step(
        test_embeddings[:, t, :], h_t, c_t,
        lstm.W_ii, lstm.W_hi, lstm.b_i,
        lstm.W_if, lstm.W_hf, lstm.b_f,
        lstm.W_ig, lstm.W_hg, lstm.b_g,
        lstm.W_io, lstm.W_ho, lstm.b_o,
        use_forget_gate=False,
    )
    loss = h_t.abs().sum()
    # The graph of the earlier steps is needed again by the next backward call.
    loss.backward(retain_graph=True)
    grads.append(torch.norm(h_0.grad).item())
    # Reset the accumulated gradients so every step is measured on its own.
    h_0.grad.zero_()
    lstm.zero_grad()

grads_np = np.array(grads)
plt.plot(grads_np)
plt.xlabel("time step t")
plt.ylabel("norm of d|h_t| / d h_0")
plt.title("use_forget_gate=False")
plt.show()

Case 2: we set `use_forget_gate=True`. In this case, once we turn on the forget gate, the gradient vanishes through time.

In [None]:
# Case 2: same measurement as before, now with the forget gate enabled.
# The initial states are created directly on `device` so that they match the
# inputs while remaining leaf tensors (only leaf tensors populate .grad).
h_02, c_02 = (torch.zeros(hidden_size, requires_grad=True, device=device),
              torch.zeros(hidden_size, requires_grad=True, device=device))
grads2 = []
h_t2, c_t2 = h_02, c_02

for t in range(sequence_len):
    h_t2, c_t2 = lstm_step(
        test_embeddings[:, t, :], h_t2, c_t2,
        lstm.W_ii, lstm.W_hi, lstm.b_i,
        lstm.W_if, lstm.W_hf, lstm.b_f,
        lstm.W_ig, lstm.W_hg, lstm.b_g,
        lstm.W_io, lstm.W_ho, lstm.b_o,
        use_forget_gate=True,
    )
    loss2 = h_t2.abs().sum()
    # The graph of the earlier steps is needed again by the next backward call.
    loss2.backward(retain_graph=True)
    grads2.append(torch.norm(h_02.grad).item())
    # Reset the accumulated gradients so every step is measured on its own.
    h_02.grad.zero_()
    lstm.zero_grad()

grads_np2 = np.array(grads2)
plt.plot(grads_np2)
plt.xlabel("time step t")
plt.ylabel("norm of d|h_t| / d h_0")
plt.title("use_forget_gate=True")
plt.show()

# Application of LSTM

Here we present an example of a one-dimensional LSTM used to predict a time series (such as the stock market). Note that this script was modified from the original code presented in https://www.kaggle.com/taronzakaryan/stock-prediction-lstm-using-pytorch/data. 


Next, we will import the data and use plt.plot to plot the stock price of three companies (Google, IBM, and Apple) between 2-Jan-2015 and 31-Dec-2016.

In [None]:
file_download_link = "https://github.com/KCL-BMEIS/AdvancedMachineLearningCourse/blob/main/Week10_Sequences_and_Geometry/Data/Week10_data.zip?raw=true"
!wget -O Week10_data.zip --no-check-certificate "$file_download_link"
!unzip Week10_data.zip

for dirname, _, filenames in os.walk('/content/Stocks'):
    for i, filename in enumerate(filenames):
        if i<5:
            print(os.path.join(dirname,filename))
            
def stocks_data(symbols, dates, data_dir="/content/Stocks"):
    """Collect the closing prices of several stocks into one DataFrame.

    Args:
        symbols: iterable of ticker symbols; the prices of ``symbol`` are
            read from ``<data_dir>/<symbol>.us.txt``.
        dates: index of the returned frame; rows are left-joined on it, so
            dates missing from a file become NaN.
        data_dir: directory holding the ``*.us.txt`` files.

    Returns:
        DataFrame indexed by ``dates`` with one column per symbol holding
        the ``Close`` values of that symbol.
    """
    df = pd.DataFrame(index=dates)
    for symbol in symbols:
        csv_path = os.path.join(data_dir, "{}.us.txt".format(symbol))
        df_temp = pd.read_csv(csv_path, index_col='Date',
                              parse_dates=True, usecols=['Date', 'Close'],
                              na_values=['nan'])
        # Name the price column after the ticker so the columns stay distinct.
        df_temp = df_temp.rename(columns={'Close': symbol})
        df = df.join(df_temp)
    return df

dates = pd.date_range('2015-01-02','2016-12-31',freq='B')
symbols = ['goog','ibm','aapl']
df = stocks_data(symbols, dates)
# Forward-fill business days that have no quote. ffill() returns a new
# frame, so the result has to be assigned back to take effect.
df = df.ffill()
df.interpolate().plot()
plt.show()
# Preview the first rows of the combined table
df.head()

Next, we plot only the price of IBM between 2-Jan-2010 and 11-Oct-2017; this is the data that we will analyse.

In [None]:
dates = pd.date_range('2010-01-02','2017-10-11',freq='B')
df1 = pd.DataFrame(index=dates)
# Read from the same directory that stocks_data() and the os.walk listing use.
df_ibm = pd.read_csv("/content/Stocks/ibm.us.txt", parse_dates=True, index_col=0)
df_ibm = df1.join(df_ibm)
df_ibm[['Close']].plot()
plt.ylabel("stock_price")
plt.title("IBM Stock")
plt.show()

# Keep only the closing price; copy so the column assignment below writes to
# an independent frame rather than to a slice of the joined one.
df_ibm = df_ibm[['Close']].copy()
df_ibm.info()

# Forward-fill business days without a quote, then rescale to [-1, 1].
df_ibm = df_ibm.ffill()
scaler = MinMaxScaler(feature_range=(-1, 1))
df_ibm['Close'] = scaler.fit_transform(df_ibm['Close'].values.reshape(-1,1))

Next, we will divide the data (the IBM stock price shown in the figure above) into training and testing sets. In the code below, we take 1606 time points to train the model and the remaining 402 to test it.

In [None]:
# Divide the data into train and test
def load_data(stock, look_back):
    """Cut a series into sliding windows and split them into train and test.

    Every window holds ``look_back`` consecutive rows: the first
    ``look_back - 1`` rows are the model input and the last row is the
    target. The first 80% of the windows form the training set, the
    remaining 20% the test set (chronological split, no shuffling).

    Args:
        stock: DataFrame or array of shape (n_rows, n_features); a 1-D
            input is treated as a single feature.
        look_back: window length in rows.

    Returns:
        ``[x_train, y_train, x_test, y_test]`` as NumPy arrays with shapes
        (n, look_back - 1, n_features) for x and (n, n_features) for y.

    Raises:
        ValueError: if ``stock`` has too few rows to form a single window.
    """
    # DataFrame.as_matrix() was removed from pandas; np.asarray works for
    # DataFrames and plain arrays alike.
    data_raw = np.asarray(stock)
    if data_raw.ndim == 1:
        data_raw = data_raw.reshape(-1, 1)

    # Window start indices 0 .. len - look_back - 1. The number of windows is
    # kept exactly as before because the plotting code indexes by this count.
    n_windows = len(data_raw) - look_back
    if n_windows <= 0:
        raise ValueError(
            "look_back=%d needs more than %d rows, got %d"
            % (look_back, look_back, len(data_raw)))

    data = np.array([data_raw[start:start + look_back]
                     for start in range(n_windows)])
    test_set_size = int(np.round(0.2 * data.shape[0]))
    train_set_size = data.shape[0] - test_set_size

    x_train = data[:train_set_size, :-1, :]
    y_train = data[:train_set_size, -1, :]

    x_test = data[train_set_size:, :-1, :]
    y_test = data[train_set_size:, -1, :]

    return [x_train, y_train, x_test, y_test]

look_back = 20 # choose sequence length
x_train, y_train, x_test, y_test = load_data(df_ibm, look_back)
print('x_train.shape = ',x_train.shape)
print('y_train.shape = ',y_train.shape)
print('x_test.shape = ',x_test.shape)
print('y_test.shape = ',y_test.shape)

# make training and test sets in torch (float32 tensors)
x_train = torch.from_numpy(x_train).float()
x_test = torch.from_numpy(x_test).float()
y_train = torch.from_numpy(y_train).float()
y_test = torch.from_numpy(y_test).float()

n_steps = look_back - 1
# Full-batch setting: one batch holds the whole training set, whatever its
# size turns out to be for the chosen date range and look_back.
batch_size = x_train.shape[0]
num_epochs = 100

train = torch.utils.data.TensorDataset(x_train, y_train)
test = torch.utils.data.TensorDataset(x_test, y_test)

# shuffle=False keeps the samples in chronological order
train_loader = torch.utils.data.DataLoader(dataset=train,
                                           batch_size=batch_size,
                                           shuffle=False)

test_loader = torch.utils.data.DataLoader(dataset=test,
                                          batch_size=batch_size,
                                          shuffle=False)


The next step is to build the model by defining the LSTM parameters (input_dim, hidden_dim, num_layers and output_dim). Since we use the history of the stock price to predict its future values, both the input and the output dimension are one.

In [None]:
## Build model

# One value goes in per time step and one value is predicted.
input_dim = output_dim = 1
# Width of each recurrent layer and number of stacked layers.
hidden_dim = 32
num_layers = 2

# Here we define our model as a class
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear read-out of the last time step.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the hidden state of every LSTM layer.
        num_layers: number of stacked LSTM layers.
        output_dim: number of values predicted per sequence.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTM, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.num_layers = num_layers

        # batch_first=True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, feature_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_dim) to (batch, output_dim)."""
        batch = x.size(0)
        # Fresh zero states for every call, so nothing is carried over from
        # one batch to the next. new_zeros() matches the device and dtype of
        # the input, so the model also works on GPU or in float64.
        h0 = x.new_zeros(self.num_layers, batch, self.hidden_dim)
        c0 = x.new_zeros(self.num_layers, batch, self.hidden_dim)

        # out: (batch, seq, hidden_dim), the top-layer hidden state per step
        out, _ = self.lstm(x, (h0, c0))

        # Only the hidden state of the last time step feeds the read-out:
        # (batch, hidden_dim) -> (batch, output_dim)
        return self.fc(out[:, -1, :])

Next we instantiate the model, define the loss function, and set up the optimiser (torch.optim.Adam with a learning rate of 0.01). 

In [None]:
model = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers)

# Mean squared error averaged over all elements. The `size_average` argument
# is deprecated; reduction='mean' is its equivalent.
loss_fn = torch.nn.MSELoss(reduction='mean')

optimiser = torch.optim.Adam(model.parameters(), lr=0.01)
print(model)

# Print the number of parameter tensors, then the shape of each one.
params = list(model.parameters())
print(len(params))
for param in params:
    print(param.size())

Next we train our model.

In [None]:
# Train the model; hist keeps the training loss of every epoch
hist = np.zeros(num_epochs)

# Number of steps to unroll
seq_dim = look_back - 1

for epoch in range(num_epochs):
    # Start from clean gradients, else they accumulate between epochs
    optimiser.zero_grad()

    # Forward pass over the whole training set
    y_train_pred = model(x_train)
    loss = loss_fn(y_train_pred, y_train)

    mse = loss.item()
    hist[epoch] = mse
    # Report every 10th epoch (epoch 0 is skipped)
    if epoch != 0 and epoch % 10 == 0:
        print("Epoch ", epoch, "MSE: ", mse)

    # Backward pass followed by the parameter update
    loss.backward()
    optimiser.step()

We then plot the results we have. Please consider changing the above parameter(s) and see how it would affect the predicting accuracy.

In [None]:
## Step 7: Make prediction
# Predict with the final parameters (the y_train_pred left over from the
# training loop was computed before the last optimiser step) and without
# recording autograd graphs.
model.eval()
with torch.no_grad():
    y_train_pred = model(x_train)
    y_test_pred = model(x_test)

# invert predictions back to the original price scale
y_train_pred_scal = scaler.inverse_transform(y_train_pred.numpy())
y_train_scal = scaler.inverse_transform(y_train.numpy())
y_test_pred_scal = scaler.inverse_transform(y_test_pred.numpy())
y_test_scal = scaler.inverse_transform(y_test.numpy())

# calculate root mean squared error
trainScore = math.sqrt(mean_squared_error(y_train_scal[:,0], y_train_pred_scal[:,0]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = math.sqrt(mean_squared_error(y_test_scal[:,0], y_test_pred_scal[:,0]))
print('Test Score: %.2f RMSE' % (testScore))

# Window i built by load_data covers rows i .. i+look_back-1 and its target is
# the last row of the window, so prediction i belongs at row i + look_back - 1.
offset = look_back - 1
n_train = len(y_train_pred_scal)
n_test = len(y_test_pred_scal)

# shift train predictions for plotting
trainPredictPlot = np.full(df_ibm.shape, np.nan)
trainPredictPlot[offset:offset + n_train, :] = y_train_pred_scal

# shift test predictions for plotting (they follow the train predictions)
testPredictPlot = np.full(df_ibm.shape, np.nan)
testPredictPlot[offset + n_train:offset + n_train + n_test, :] = y_test_pred_scal

# plot baseline and predictions
plt.figure(figsize=(15,8))
plt.plot(scaler.inverse_transform(df_ibm), label="actual")
plt.plot(trainPredictPlot, label="train prediction")
plt.plot(testPredictPlot, label="test prediction")
plt.legend()
plt.show()