# Stocks RNN Predictor

By: Kesavar Kabilar, Ryan James Laporte, and Carmelo Restivo-Caponcello

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import torch
import torch.nn as nn

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
np.random.seed(7)

In [4]:
data_path = "/content/gdrive/My Drive/Project/Stocks/" # Change file directory

In [5]:
def get_all_files(data_path=data_path):
    """
    Given a path to the stocks data, this function will return the file 
    locations of companies consisting of at least 100 lines of previous stocks 
    data.
    """
    all_files = []

    for file in os.listdir(data_path)[:100]:
        f = open(data_path+file)
        if len(f.readlines()) > 100: 
            all_files.append(data_path+file)
        f.close()
    
    return all_files

all_files = get_all_files()

In [6]:
n = len(all_files)

all_files = np.array(all_files, dtype=str)
np.random.shuffle(all_files)

training_files = all_files[:int(n*0.6)] 
validation_files = all_files[int(n*0.6):int(n*0.8)]
test_files = all_files[int(n*0.8):]

In [7]:
def transform_data(file, K, augment=False):
    """
    Given a file, a K value it will return X, T. Each row of X contains K x 4 
    values where each row represents the open value, highest price, lowest 
    price, and the closing price. Each column represent the previous K days. 
    Each row of T represents the closing price of K+1 day. If augment is true, 
    the function will also add the reverse of the days.
    """
    X, T = [], []

    f = open(file, "r")

    all_data = []

    data = []
    for line in f.readlines()[-100:]:
        line = line.split(",")[1:-2]
        all_data.append(line)

        data.append(line)
        if len(data) == K+1:
            x = np.array(data[:-1], dtype=float)
            X.append(x)
            T.append(float(data[-1][-1]))

            data = data[1:]

    f.close()
    if augment:
        data = []
        for line in all_data[::-1]:

            data.append(line)
            if len(data) == K+1:
                x = np.array(data[:-1], dtype=float)
                X.append(x)
                T.append(float(data[-1][-1]))

                data = data[1:]
    
    return torch.tensor(np.array(X), dtype=torch.float), torch.tensor(np.array(T), dtype=torch.float)

In [8]:
class StocksRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(StocksRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        # normalize input
        s1, s2, s3 = x.shape
        x = x.reshape(s1, s2*s3)
        min, max = torch.amin(x, axis=1), torch.amax(x, axis=1)
        x = (x-min[:, np.newaxis]) / (max[:, np.newaxis] - min[:, np.newaxis])
        x = x.reshape(s1, s2, s3)
        # RNN Layer
        out, _ = self.rnn(x)
        # Fully connected layer
        out = self.fc(out[:, -1, :])
        # Denormalize output
        return ((out.reshape(-1) * (max-min)) + min).reshape(-1, 1)

class StocksLSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(StocksLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        # normalize input
        s1, s2, s3 = x.shape
        x = x.reshape(s1, s2*s3)
        min, max = torch.amin(x, axis=1), torch.amax(x, axis=1)
        x = (x-min[:, np.newaxis]) / (max[:, np.newaxis] - min[:, np.newaxis])
        x = x.reshape(s1, s2, s3)
        # LSTM Layer
        out, _ = self.lstm(x)
        # Fully connected layer
        out = self.fc(out[:, -1, :])
        # Denormalize output
        return ((out.reshape(-1) * (max-min)) + min).reshape(-1, 1)


In [9]:
def get_mean_squared_error(model, files, K, incrDecr=False):
    """
    Given the model, a set of files, and K value, The function will calculate 
    the mean squared error of the dataset.
    """
    error = 0
    total = 0

    for f in files:
        x, t = transform_data(f, K)
        
        if incrDecr:
            output = model(x).reshape(-1)

            t[np.where(x[:, -1, -1] >= t)] = 0
            t[np.where(x[:, -1, -1] < t)] = 1
            output[np.where(x[:, -1, -1] >= output)] = 0
            output[np.where(x[:, -1, -1] < output)] = 1

        else:
            s1, s2, s3 = x.shape
            x = x.reshape(s1, s2*s3)
            min, max = torch.amin(x, axis=1), torch.amax(x, axis=1)
            x = (x-min[:, np.newaxis]) / (max[:, np.newaxis] - min[:, np.newaxis])
            x = x.reshape(s1, s2, s3)
            t = (t-min) / (max - min)

            output = model(x).reshape(-1)

        error += int(sum((output - t)**2).detach())
        total += output.shape[0]

    return error / total

In [10]:
def train_rnn_network(model, train, valid, num_epochs, batch_size, learning_rate, K, name, augment=False, show_data=True):
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    losses, train_acc, valid_acc = [], [], []
    epochs = []

    iter_x = []
    iter_t = []
    for f in train:
        x, t = transform_data(f, K, augment)
        iter_x.append(x)
        iter_t.append(t)

    train_data_x = torch.concat(iter_x, 0)
    train_data_t = torch.concat(iter_t, 0)

    for epoch in range(num_epochs):

        p = np.random.permutation(train_data_t.shape[0])
        X, T = train_data_x[p], train_data_t[p]

        for i in range(0, int(T.shape[0]), batch_size):
            batch_x = X[i: (i+batch_size)]
            batch_t = T[i: (i+batch_size)]
            pred = model(batch_x)
            loss = criterion(pred, batch_t.reshape(-1, 1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        losses.append(float(loss))

        epochs.append(epoch)
        train_acc.append(get_mean_squared_error(model, train, K))
        valid_acc.append(get_mean_squared_error(model, valid, K))
        if (epoch+1) % 10 == 0 and show_data:
            print("Epoch %d; Loss %f; Train Acc %f; Val Acc %f" % (
                epoch+1, loss, train_acc[-1], valid_acc[-1]))

    if show_data:
        plt.title(name + " Training Curve")
        plt.plot(epochs, train_acc, label="Train")
        plt.plot(epochs, valid_acc, label="Validation")
        plt.xlabel("Epoch")
        plt.ylabel("Mean Squared Error")
        plt.legend(loc='best')
        plt.savefig(name.replace(" ", "")+".png")
        plt.show()

In [11]:
modelRNN = StocksRNN(4, 150)
train_rnn_network(modelRNN, 
                  training_files, 
                  validation_files, 
                  num_epochs=100, 
                  batch_size=500, 
                  learning_rate=1e-5,
                  K=5, 
                  name="RNN Model")
print("Test Data Error: ", get_mean_squared_error(modelRNN, test_files, 5))
print("Test Data Increase/Decrease Accuracy: ", get_mean_squared_error(modelRNN, test_files, 5, incrDecr=True))

In [12]:
modelLSTM = StocksLSTM(4, 150)
train_rnn_network(modelLSTM, 
                  training_files, 
                  validation_files, 
                  num_epochs=100, 
                  batch_size=500, 
                  learning_rate=1e-5,
                  K=5, 
                  name="LSTM Model")
print("Test Data Accuracy: ", get_mean_squared_error(modelLSTM, test_files, 5))
print("Test Data Increase/Decrease Accuracy: ", get_mean_squared_error(modelLSTM, test_files, 5, incrDecr=True))

In [13]:
modelRNNAug = StocksRNN(4, 150)
train_rnn_network(modelRNNAug, 
                  training_files, 
                  validation_files, 
                  num_epochs=100, 
                  batch_size=500, 
                  learning_rate=1e-5,
                  K=5, 
                  name="RNN Model Augmented", 
                  augment=True)
print("Test Data Accuracy: ", get_mean_squared_error(modelRNNAug, test_files, 5))
print("Test Data Increase/Decrease Accuracy: ", get_mean_squared_error(modelRNNAug, test_files, 5, incrDecr=True))

In [14]:
modelLSTMAug = StocksLSTM(4, 150)
train_rnn_network(modelLSTMAug, 
                  training_files, 
                  validation_files, 
                  num_epochs=100, 
                  batch_size=500, 
                  learning_rate=1e-5,
                  K=5, 
                  name="LSTM Model Augmented", 
                  augment=True)
print("Test Data Accuracy: ", get_mean_squared_error(modelLSTMAug, test_files, 5))
print("Test Data Increase/Decrease Accuracy: ", get_mean_squared_error(modelLSTMAug, test_files, 5, incrDecr=True))

In [15]:
def goodBadExample(model, K, test_files=test_files):
    """
    Given a model, K, and test_files dataset, this function will output the 
    company with the worst mean squared error and the company with the best 
    mean squared error. Display a graph for which it least correctly predicted 
    the values and a graph for which it most correctly predicted the values.
    """
    bad_error, bad_file = float("-inf"), ""
    good_error, good_file = float("inf"), ""

    for f in test_files:
        error = get_mean_squared_error(model, [f], K)
        if error > bad_error:
            bad_error = error
            bad_file = f
        
        if error < good_error:
            good_error = error
            good_file = f

    X, T = transform_data(good_file, K)
    Y = model(X)

    plt.title("Best Model Prediction")
    plt.ylabel("Stock Price ($)")
    plt.xlabel("Days")
    plt.plot(np.arange(len(Y)), Y.detach().numpy(), label="Model Prediction")
    plt.plot(np.arange(len(T)), T, label="Target Value")
    plt.legend()
    plt.savefig("BestModelPrediction.png")
    plt.show()

    print("Best Model Prediction Error: ", 
          get_mean_squared_error(model, [good_file], K))

    X, T = transform_data(bad_file, K)
    Y = model(X)

    plt.title("Worst Model Prediction")
    plt.ylabel("Stock Price ($)")
    plt.xlabel("Days")
    plt.plot(np.arange(len(Y)), Y.detach().numpy(), label="Model Prediction")
    plt.plot(np.arange(len(T)), T, label="Target Value")
    plt.legend()
    plt.savefig("WorstModelPrediction.png")
    plt.show()

    print("Worst Model Prediction Error: ", 
          get_mean_squared_error(model, [bad_file], K))

# goodBadExample(modelRNN, 5, test_files=test_files)

In [16]:
def testModels(modelType):
    print(modelType, "Model")
    print("Epochs\tBatch Size\tNum Days\tTraining Error\tValidation Error\tTest Error\tClassification Error")
    for epochs in [50, 100, 200]:
        for batch_size in [500, 1000]:
            for K in [5, 10, 15]:
                print(str(epochs) + "\t" + str(batch_size) + "\t\t" + str(K) + "\t\t", end="")
                if modelType == "RNN":
                    model = StocksRNN(4, 150)
                else:
                    model = StocksLSTM(4, 150)
                train_rnn_network(model, 
                                  training_files, 
                                  validation_files, 
                                  num_epochs=epochs, 
                                  batch_size=batch_size, 
                                  learning_rate=1e-5,
                                  K=K, 
                                  name="RNN Model", 
                                  show_data=False)
                train_acc = get_mean_squared_error(model, training_files, K)
                valid_acc = get_mean_squared_error(model, validation_files, K)
                test_acc = get_mean_squared_error(model, test_files, K)
                testc_acc = get_mean_squared_error(model, test_files, K, incrDecr=True)
                print(f'{train_acc:.6f}' + "\t", end="")
                print(f'{valid_acc:.6f}' + "\t\t", end="")
                print(f'{test_acc:.6f}' + "\t", end="")
                print(f'{testc_acc:.6f}' + "\t")

In [17]:
testModels("LSTM")

LSTM Model
Epochs	Batch Size	Num Days	Training Error	Validation Error	Test Error	Classification Error
50	500		5		0.312865	0.316374		0.283657	0.470360	
50	500		10		0.242181	0.254938		0.222222	0.482456	
50	500		15		0.152723	0.164052		0.147988	0.490402	
50	1000		5		0.402729	0.417544		0.374515	0.477008	
50	1000		10		0.333951	0.361728		0.316374	0.482456	
50	1000		15		0.311329	0.342484		0.313313	0.489164	
100	500		5		0.252242	0.239181		0.221053	0.475346	
100	500		10		0.150000	0.139506		0.129240	0.486550	
100	500		15		0.114161	0.111111		0.104644	0.500310	
100	1000		5		0.345809	0.354971		0.316343	0.458726	
100	1000		10		0.260288	0.275926		0.243860	0.485380	
100	1000		15		0.195643	0.216340		0.195046	0.476161	
200	500		5		0.244444	0.227485		0.213296	0.485319	
200	500		10		0.136008	0.120370		0.114035	0.480702	
200	500		15		0.097821	0.094118		0.087926	0.490402	
200	1000		5		0.247953	0.232164		0.216066	0.489751	
200	1000		10		0.154321	0.141975		0.132164	0.485965	
200	1000		15		0.114597	0.109804		0.

In [18]:
testModels("RNN")

RNN Model
Epochs	Batch Size	Num Days	Training Error	Validation Error	Test Error	Classification Error
50	500		5		0.239376	0.225731		0.208310	0.489197	
50	500		10		0.142181	0.130247		0.119298	0.486550	
50	500		15		0.130501	0.126797		0.119505	0.501548	
50	1000		5		0.264717	0.258480		0.232687	0.472022	
50	1000		10		0.255144	0.269753		0.238596	0.483626	
50	1000		15		0.102614	0.103922		0.095356	0.488545	
100	500		5		0.223197	0.209357		0.193352	0.478116	
100	500		10		0.140535	0.130247		0.119883	0.481871	
100	500		15		0.089760	0.085621		0.081734	0.484830	
100	1000		5		0.230214	0.212865		0.199446	0.483102	
100	1000		10		0.155350	0.144444		0.133333	0.484211	
100	1000		15		0.099564	0.094118		0.090402	0.492879	
200	500		5		0.210721	0.195322		0.182271	0.469806	
200	500		10		0.113169	0.103704		0.092398	0.485380	
200	500		15		0.077342	0.076471		0.070588	0.481734	
200	1000		5		0.232164	0.215789		0.201108	0.479778	
200	1000		10		0.130658	0.119753		0.110526	0.485380	
200	1000		15		0.087364	0.084967		0.0