# Installing libraries for colab

In [None]:
# Download the python packages, note that we use other packages that you might also need to install if you run
# this notebook on your machine. If you run it on colab, these are the only packages that need to be pip installed
!pip install yfinance --upgrade --no-cache-dir
!pip install ta
!pip install datasets

# Downloading and setting up the data

In [1]:
# yfinance is the API and ta is a technical analysis tool
import yfinance as yf
import ta
import matplotlib.pyplot as plt
import random

# For data manipulation
import json
import pandas as pd
import os
from data_utils import pull_stock_data, pull_stock_indicators, cleaning_data

In [2]:
# Obtaining the S&P 500 companies from the first table on the Wikipedia page
# NOTE(review): `stock_dfs` is not referenced again in the visible cells, and the
# `all_stocks` list built here is replaced by the hand-picked sector lists at the
# end of this cell -- confirm whether this network lookup is still needed.
stock_dfs = {}
all_tickers = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
all_stocks = all_tickers.Symbol.to_list()

# Hand-picked tickers of major companies, subdivided into sector lists.
# Every symbol is listed once so that no ticker is requested twice downstream.
tech = ['MSFT','AAPL','NVDA', 'GOOGL', 'GOOG', 'AMZN', 'META', 'AVGO', 'TSLA','ORCL','CRM','NFLX','AMD','QCOM','ADBE','ASML','ADI','DELL']
pharma = ['LLY',"JNJ",'MRK','ABBV','TMO','MRNA','PFE',"CVS"]
finance = ['BRK-B','JPM','V','MA','BAC','WFC', 'ACN', 'MS','BLK','GS',"PYPL"]
defense = ['BA','LMT','RTX','GE','NOC']
energy = ['XOM','CVX','LIN','SBGSY']
# NOTE(review): 'BAB' -- confirm this is the intended symbol.
retail = ['WMT', "UNH", "PG", 'COST', 'HD','KO','PEP','MCD','LVMUY', 'NSRGY','TM', 'LRLCY', 'GM', 'BAB','NKE',"T",'F']

# Concatenate the sector lists; dict.fromkeys drops any repeated symbol while
# preserving first-seen order, so a future duplicate cannot slip through either.
all_stocks = list(dict.fromkeys(tech+pharma+finance+defense+energy+retail))

In [3]:
# Calling the functions to import the stock data -> to see functions go to data_utils.py file
# The three helpers run in sequence, each consuming the previous result and rebinding
# `full_stock_data`; the later cells index it by ticker and iterate it with .items().
# Arguments to pull_stock_data: the ticker list, two date strings and '1h' -- presumably
# start date, end date and bar interval; confirm against data_utils.py.
# NOTE(review): Yahoo Finance is understood to serve hourly bars only for roughly the
# most recent 730 days, so this fixed date range may stop returning data -- confirm.
full_stock_data = pull_stock_data(all_stocks, '2022-06-01', '2024-05-01', '1h')
full_stock_data = pull_stock_indicators(full_stock_data)
full_stock_data = cleaning_data(full_stock_data)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

In [4]:
# Printing out example dataframe: the processed AAPL frame, as a quick visual check
# of the columns produced by the data pipeline
full_stock_data['AAPL']

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,SMA_20,SMA_50,%K,%D,RSI,BB_Middle,BB_Upper,BB_Lower,MACD,MACD_Signal,ATR
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2022-06-10 09:30:00-04:00,140.190002,140.759995,137.460007,137.490005,137.490005,23421485,146.779439,147.263031,0.246150,0.429504,15.611410,146.779439,152.291020,141.267858,-1.466908,-0.547692,1.526065
2022-06-10 10:30:00-04:00,137.489899,138.539001,137.270004,137.399994,137.399994,13497913,146.285439,147.027887,1.054255,0.754015,15.499934,146.285439,153.137159,139.433719,-1.930529,-0.824260,1.507703
2022-06-10 11:30:00-04:00,137.410995,138.070007,137.289993,137.975006,137.975006,7736991,145.775689,146.813787,6.109205,2.469870,19.456611,145.775689,153.457423,138.093955,-2.225894,-1.104586,1.455725
2022-06-10 12:30:00-04:00,137.979996,138.210007,137.460007,138.089996,138.089996,6411832,145.242439,146.614387,7.321360,4.828273,20.260719,145.242439,153.483210,137.001667,-2.422766,-1.368222,1.405316
2022-06-10 13:30:00-04:00,138.089996,138.740005,137.729996,137.960007,137.960007,7293856,144.708189,146.406943,6.160738,6.530434,20.017429,144.708189,153.371900,136.044477,-2.559770,-1.606532,1.377080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-04-30 11:30:00-04:00,173.869904,174.199905,173.604996,173.836594,173.836594,3446845,172.079618,168.986474,67.932656,70.962925,64.151716,172.079618,176.451501,167.707735,1.488587,1.438002,1.206533
2024-04-30 12:30:00-04:00,173.839996,173.929993,173.220001,173.275299,173.275299,4148810,172.286883,169.164732,59.726591,65.335760,59.757395,172.286883,176.468678,168.105088,1.380488,1.426499,1.171066
2024-04-30 13:30:00-04:00,173.304993,173.879898,173.100006,173.179993,173.179993,3964263,172.449133,169.329932,58.333222,61.997490,59.018155,172.449133,176.503188,168.395077,1.272460,1.395692,1.143125
2024-04-30 14:30:00-04:00,173.199997,173.345001,172.559998,172.729996,172.729996,4786101,172.590633,169.483732,51.754316,56.604709,55.525482,172.590633,176.472827,168.708438,1.137425,1.344038,1.117545


In [None]:
# Saving the means and std for normalization
# Each table gets one column per ticker; df.mean()/df.std() reduce every column of
# that ticker's frame, so rows of the tables correspond to feature names.
# NOTE(review): the statistics are computed over every row of full_stock_data -- no
# train/validation split has been applied at this point -- although the output files
# are named "train_*"; confirm this is intended.
means = pd.DataFrame()
std_devs = pd.DataFrame()

for ticker, df in full_stock_data.items():
    mean = df.mean()
    std_dev = df.std()

    means[ticker] = mean
    std_devs[ticker] = std_dev

# Written to the current working directory
means.to_csv('train_hourly_mean.csv')
std_devs.to_csv('train_hourly_std_dev.csv')

# Creating/Training the LSTM

In [24]:
# Importing relevant libraries for the training of the LSTM
import sklearn.preprocessing as sklp
import sklearn.model_selection as sklm
import torch
import copy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
from tqdm import tqdm

In [25]:
# Importing the datasets we use throughout our code -> see specs in the lstm_datasets.py file
from lstm_datasets import TimeSeriesDataset, ValTimeSeriesDataset

In [26]:
# Alias the processed per-ticker frames and collect the ticker symbols in a list
stock_data = full_stock_data
tickers = [symbol for symbol in stock_data.keys()]

In [27]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## LSTM Model

In [28]:
# Import model, see specs in lstm.py file
from lstm import LSTM

### Train & Validation loops

In [29]:
# The validation loop given a criterion and dataloader
def val_on_stock(model, val_dataloader, criterion):
  """Return the mean per-batch loss of `model` over `val_dataloader`.

  The model is switched to eval mode (and left in it) and gradients are disabled
  while the batches are processed.

  Args:
    model: a torch.nn.Module; batches are moved to the device of its parameters.
    val_dataloader: iterable yielding (inputs, labels) tensor pairs.
    criterion: callable (outputs, labels) -> scalar loss tensor.

  Returns:
    float: sum of the per-batch losses divided by the number of batches
    (batches are weighted equally, regardless of their size).

  Raises:
    ValueError: if `val_dataloader` yields no batches.
  """
  # Follow the model's own device instead of relying on a notebook-level global,
  # so the function keeps working when it is reused outside this notebook.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  val_running_loss = 0.0
  n_batches = 0

  model.eval()
  with torch.no_grad():
    for inputs, labels in val_dataloader:
      # Labels are reshaped to a column vector before being compared with the outputs
      labels = labels.view(-1, 1).to(target_device)
      inputs = inputs.to(target_device)

      outputs = model(inputs)

      val_running_loss += criterion(outputs, labels).item()
      n_batches += 1

  # Fail with an explicit message rather than an opaque ZeroDivisionError
  if n_batches == 0:
    raise ValueError("val_dataloader yielded no batches; cannot compute a validation loss")

  return val_running_loss/n_batches

In [30]:
# The training loop given a criterion and dataloader
def train_on_stock(model, val_dataloader, train_dataloader, criterion, epochs, print_epochs=False, lr=None):
  """Train `model` with Adam and keep the weights with the lowest validation loss.

  Args:
    model: a torch.nn.Module; batches are moved to the device of its parameters.
    val_dataloader: iterable of (inputs, labels) pairs, evaluated with
      `val_on_stock` after every epoch.
    train_dataloader: iterable of (inputs, labels) pairs used for the gradient steps.
    criterion: callable (preds, labels) -> scalar loss tensor.
    epochs: number of passes over `train_dataloader`.
    print_epochs: when True, print the train/validation loss after each epoch.
    lr: Adam learning rate; when None the notebook-level `learning_rate` is used.

  Returns:
    (train_loss_arr, val_loss_arr, best_model): the per-epoch mean batch losses and a
    deep copy of the state_dict from the epoch with the lowest validation loss
    (None when `epochs` is 0).

  Raises:
    ValueError: if `train_dataloader` yields no batches in an epoch.
  """
  if lr is None:
    lr = learning_rate  # hyperparameter defined in the notebook's configuration cell

  # Follow the model's own device instead of relying on a notebook-level global
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  # To store the losses
  train_loss_arr = []
  val_loss_arr = []
  best_val_loss = float('inf')

  # Store best_model
  best_model = None

  # Initialize optimizer
  optimizer = optim.Adam(model.parameters(), lr=lr)

  # Training loop
  for epoch in range(epochs):

      running_loss = 0.0
      n_batches = 0

      # val_on_stock leaves the model in eval mode, so re-enable train mode every epoch
      model.train()

      for inputs, labels in train_dataloader:

        labels = labels.view(-1, 1).to(target_device)
        inputs = inputs.to(target_device)

        optimizer.zero_grad()
        preds = model(inputs) # Forward pass
        loss = criterion(preds, labels)

        running_loss += loss.item()

        loss.backward()
        optimizer.step() # Gradient step

        n_batches += 1

      # Fail with an explicit message rather than an opaque ZeroDivisionError
      if n_batches == 0:
        raise ValueError("train_dataloader yielded no batches; cannot compute a training loss")

      val_loss = val_on_stock(model, val_dataloader, criterion) # Getting val loss
      train_loss = running_loss/n_batches

      train_loss_arr.append(train_loss)
      val_loss_arr.append(val_loss)

      # Always snapshot the first epoch, then only strict improvements
      if best_model is None or val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model.state_dict())

      if print_epochs:
        print(f"Epoch {epoch+1}/{epochs}: Training loss = {round(train_loss, 5)}, Validation loss = {round(val_loss, 5)}")

  return train_loss_arr, val_loss_arr, best_model

## Optimal hyperparameters used to train all models

In [31]:
# Hyper parameters
# Model architecture values, passed to the LSTM constructor when the models are created
num_layers = 4
hidden_size = 100
input_size = 17  # equals the number of feature columns in the example AAPL frame printed above

# Mini-batch size used for both the train and validation DataLoaders
batch_size = 20

# Adam learning rate (read as a global by train_on_stock) and number of epochs per ticker
learning_rate = 1e-5
lstm_epochs = 30

## Training to look forward for a day

In [32]:
# Parameters for the dataset windows
b = 1  # multiplier for the look-back length (months)
f = 1  # multiplier for the look-ahead length (days)

n_lags = 8 * 30 * b #look back n MONTHS: hours/market-day * days * quantity
forecast_horizon = 8 * f #look ahead f days:

# Creating the val/train split used for all stocks
# `tot` is the average over all tickers of len(frame) - n_lags - forecast_horizon + 1,
# truncated to an int. The first `train_split` positions of range(tot) become the
# training indices and the remaining ~20% the validation indices (no shuffling here).
# NOTE(review): the same index lists are passed to every ticker's dataset, so a ticker
# whose frame is shorter than average may receive indices beyond its data -- confirm
# how TimeSeriesDataset treats them. `val_split` is assigned but not used in this cell.
tot = int(sum([len(d)-n_lags-forecast_horizon+1 for d in full_stock_data.values()])/len(full_stock_data))
train_split, val_split = tot - int(tot*0.2), int(tot*0.2)
indices = list(range(tot))
train_indices = indices[:train_split]
val_indices = indices[train_split:]

# ticker -> dict holding that ticker's datasets and normalization statistics
stock_datastruct = {}

# Creating datasets
for key, value in full_stock_data.items():
  # Z-score every column with statistics taken over the ticker's whole frame.
  # NOTE(review): this includes rows that presumably end up behind the validation
  # indices, and a constant column (std 0) would produce NaN/inf -- confirm.
  mean = value.mean()
  std_dev = value.std()
  value = (value - mean) / std_dev

  # Both datasets wrap the same normalized frame and differ only in their index list
  val_dataset = TimeSeriesDataset(key, value, n_lags, forecast_horizon, indices=val_indices)
  train_dataset = TimeSeriesDataset(key, value, n_lags, forecast_horizon, indices=train_indices)

  # mean/std are kept alongside the datasets
  temp = {'val_dataset':val_dataset, 'train_dataset':train_dataset, 'mean':mean, 'std':std_dev}
  # Keyed by the dataset's `ticker` attribute (presumably equal to `key`)
  stock_datastruct[val_dataset.ticker] = temp

In [33]:
# Mean-squared-error loss shared by every per-ticker model
criterion = nn.MSELoss()

# Attach a freshly initialised LSTM, moved to `device`, to each ticker's entry
for ticker, entry in stock_datastruct.items():
  entry['LSTM'] = LSTM(num_layers, input_size, hidden_size, device).to(device)

In [34]:
# Wrap each ticker's datasets in DataLoaders: validation keeps its order,
# training batches are shuffled
for ticker in stock_datastruct:
    entry = stock_datastruct[ticker]
    val_dataloader = DataLoader(entry['val_dataset'], batch_size=batch_size, shuffle=False)
    train_dataloader = DataLoader(entry['train_dataset'], batch_size=batch_size, shuffle=True)
    entry.update({'val_dataloader': val_dataloader, 'train_dataloader': train_dataloader})

In [None]:
# Making the directory where we will be storing all the models for each ticker.
# Any existing LSTM_weights folder in the working directory is removed first so the
# folder only ever holds weights from the current run. Done in Python rather than
# with shell commands so the cell is re-runnable and does not depend on the Colab
# /content path.
import os
import shutil

shutil.rmtree("LSTM_weights", ignore_errors=True)
os.makedirs("LSTM_weights", exist_ok=True)

In [None]:
# Per-ticker training statistics: full loss curves plus the best validation loss
loss_dict = {'val_loss': {}, 'train_loss': {}, 'best_val': {}}

# Iterate through each ticker, train its model, save the weights and record all losses
for ticker, stock in stock_datastruct.items():
    model = stock['LSTM']
    train_dataloader = stock['train_dataloader']
    val_dataloader = stock['val_dataloader']

    # Train this ticker's LSTM; best_model is the state_dict returned by train_on_stock
    lstm_train_loss, lstm_val_loss, best_model = train_on_stock(
        model, val_dataloader, train_dataloader, criterion, lstm_epochs
    )

    # Restore the best weights and report the validation loss they achieve
    model.load_state_dict(best_model)
    best_val_loss = val_on_stock(model, val_dataloader, criterion)
    print(f"Test/Validation loss for {ticker} after training : {round(best_val_loss, 5)}")

    # Record results
    for stat_name, stat_value in (
        ('val_loss', lstm_val_loss),
        ('train_loss', lstm_train_loss),
        ('best_val', best_val_loss),
    ):
        loss_dict[stat_name][ticker] = stat_value

    # Save the best weights for this ticker
    weights_path = "LSTM_weights/" + ticker + ".pt"
    torch.save(best_model, weights_path)

In [None]:
# Zipping the weights folder to download
!zip -r LSTM_weights.zip LSTM_weights

In [None]:
# Persist loss_dict (all train/val statistics) as JSON in the working directory
file_path = "run_stats_day.json"
serialized_stats = json.dumps(loss_dict)
with open(file_path, "w") as json_file:
    json_file.write(serialized_stats)

## Predict future changes

In [1]:
# Go in the lstm_predict file to see specs
from lstm_predict import predict
import pandas as pd

In [2]:
# Inputs
# "d" selects the horizon used by predict -- presumably the daily one; see lstm_predict
timestep = "d"
# Timezone-aware timestamp (New York time); presumably the point the forecast starts from
start_date = pd.Timestamp('2023-12-29 15:30:00-05:00', tz='America/New_York')

# Function
# The printed result maps each ticker to a dict with 'var' and 'pred' entries
preds = predict(timestep, start_date)
print(preds)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

{'AAPL': {'var': 0.032300143908709285, 'pred': 0.9963619238309536}, 'ABBV': {'var': 0.09144707651995122, 'pred': 0.9951824545455544}, 'ACN': {'var': 0.1225214783269912, 'pred': 0.9520050074180425}, 'ADBE': {'var': 0.07247056803945452, 'pred': 0.9827917193202353}, 'ADI': {'var': 0.0661500588087365, 'pred': 0.9738939568535772}, 'AMD': {'var': 0.07163922864943743, 'pred': 0.9063770870685602}, 'AMZN': {'var': 0.04803225942607969, 'pred': 0.9704827540779853}, 'ASML': {'var': 0.04483962216973305, 'pred': 0.9922063793137952}, 'AVGO': {'var': 0.1398374718632549, 'pred': 0.9300869177863592}, 'BA': {'var': 0.03284130145050585, 'pred': 0.9640552134099531}, 'BAB': {'var': 0.052841137733310464, 'pred': 1.0057641123204335}, 'BAC': {'var': 0.06285906941071152, 'pred': 0.9901530334133524}, 'BLK': {'var': 0.0969752541910857, 'pred': 0.946921805539976}, 'BRK-B': {'var': 0.02655041145090945, 'pred': 1.0010710809865424}, 'COST': {'var': 0.2208194261966273, 'pred': 0.9453130074863343}, 'CRM': {'var': 0.084




In [3]:
# Inputs
# "w" selects the horizon used by predict -- presumably the weekly one; see lstm_predict
timestep = "w"
# NOTE(review): a plain date string is passed here, whereas the "d" cell passes a
# timezone-aware pd.Timestamp -- confirm predict accepts both forms.
start_date = '2023-12-29'

# Function
# The printed result maps each ticker to a dict with 'var' and 'pred' entries
preds = predict(timestep, start_date)
print(preds)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

{'AAPL': {'var': 0.07187532869939534, 'pred': 0.9796605355549907}, 'ACN': {'var': 0.012338845785822602, 'pred': 0.9767178345423326}, 'ADBE': {'var': 0.015374914440431271, 'pred': 0.9739318934424135}, 'ADI': {'var': 0.02266283431205781, 'pred': 0.9527028505004763}, 'AMD': {'var': 0.023931704522928577, 'pred': 0.9806404486496129}, 'AMZN': {'var': 0.015018152099540341, 'pred': 1.0159901400556646}, 'ASML': {'var': 0.021060422883025027, 'pred': 0.9926310023768864}, 'AVGO': {'var': 0.06434888849957837, 'pred': 0.7344561262703375}, 'BA': {'var': 0.05976397898635483, 'pred': 0.9592030228142739}, 'BAB': {'var': 0.0879679675683849, 'pred': 1.023611659275051}, 'BAC': {'var': 0.031193489654551824, 'pred': 0.9953165968501121}, 'BLK': {'var': 0.03180508332448668, 'pred': 0.9826450328390806}, 'BRK-B': {'var': 0.022362935129209078, 'pred': 1.0009270342903274}, 'COST': {'var': 0.04571870813851018, 'pred': 0.9286561564785465}, 'CRM': {'var': 0.019617577760662832, 'pred': 0.983260092621987}, 'CVS': {'var




In [4]:
# Inputs
# "m" selects the horizon used by predict -- presumably the monthly one; see lstm_predict
timestep = "m"
# Plain date string (the "d" cell passes a timezone-aware pd.Timestamp instead)
start_date = '2023-12-29'

# Function
# The printed result maps each ticker to a dict with 'var' and 'pred' entries; the saved
# output of this cell also shows a "Failed to predict" line for one ticker.
preds = predict(timestep, start_date)
print(preds)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

Failed to predict ABBV


Predicting: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 54/54 [01:08<00:00,  1.27s/it]

{'AAPL': {'var': 0.03195311777764635, 'pred': 0.9053451999511254}, 'ACN': {'var': 0.08188173686450517, 'pred': 0.9650279081258072}, 'ADBE': {'var': 0.07676217165479267, 'pred': 0.9902861952787125}, 'ADI': {'var': 0.04582030324011364, 'pred': 0.8667514489542039}, 'AMD': {'var': 0.06396295388271524, 'pred': 0.8731673031362652}, 'AMZN': {'var': 0.047144869600147514, 'pred': 1.0272536056748238}, 'ASML': {'var': 0.03024276360996406, 'pred': 0.9468456966870966}, 'AVGO': {'var': 0.10143451306079938, 'pred': 0.8226091316530231}, 'BA': {'var': 0.136223172493787, 'pred': 1.0263170431226143}, 'BAB': {'var': 0.147120979341509, 'pred': 1.0377711137601076}, 'BAC': {'var': 0.10849383588329974, 'pred': 1.0116661605523691}, 'BLK': {'var': 0.09842314818366008, 'pred': 0.9428125615729552}, 'BRK-B': {'var': 0.04537350273140791, 'pred': 0.9339295543900179}, 'COST': {'var': 0.14491826208858666, 'pred': 0.9017752409573567}, 'CRM': {'var': 0.0855291056393815, 'pred': 0.9635325265231627}, 'CVS': {'var': 0.1242




In [20]:
# Inputs
# "y" selects the horizon used by predict -- presumably the yearly one; see lstm_predict
timestep = "y"
# NOTE(review): 2023-12-25 is Christmas Day, when US markets are closed, and the saved
# output of this cell lists many "Failed to predict" tickers -- confirm whether the
# start date (or missing data for the longer horizon) is the cause.
start_date = '2023-12-25'

# Function
# The printed result maps each ticker to a dict with 'var' and 'pred' entries
preds = predict(timestep, start_date)
print(preds)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

Failed to predict ACN


Predicting:  15%|████████████████▎                                                                                          | 9/59 [00:02<00:08,  5.78it/s]

Failed to predict AVGO
Failed to predict BAB


Predicting:  24%|█████████████████████████▏                                                                                | 14/59 [00:03<00:08,  5.37it/s]

Failed to predict CRM


Predicting:  32%|██████████████████████████████████▏                                                                       | 19/59 [00:03<00:07,  5.24it/s]

Failed to predict GM
Failed to predict GOOG
Failed to predict GOOGL


Predicting:  51%|█████████████████████████████████████████████████████▉                                                    | 30/59 [00:05<00:05,  4.98it/s]

Failed to predict LRLCY
Failed to predict LVMUY
Failed to predict MA


Predicting:  58%|█████████████████████████████████████████████████████████████                                             | 34/59 [00:05<00:02,  8.88it/s]

Failed to predict META


Predicting:  64%|████████████████████████████████████████████████████████████████████▎                                     | 38/59 [00:06<00:03,  6.77it/s]

Failed to predict NFLX


Predicting:  83%|████████████████████████████████████████████████████████████████████████████████████████                  | 49/59 [00:08<00:02,  4.87it/s]

Failed to predict SBGSY


Predicting:  90%|███████████████████████████████████████████████████████████████████████████████████████████████▏          | 53/59 [00:09<00:01,  5.41it/s]

Failed to predict TSLA


Predicting:  93%|██████████████████████████████████████████████████████████████████████████████████████████████████▊       | 55/59 [00:09<00:00,  6.59it/s]

Failed to predict V


Predicting: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 59/59 [00:10<00:00,  5.78it/s]

{'AAPL': {'var': 0.7231786324593582, 'pred': 0.5238913179615309}, 'ADBE': {'var': 0.3096861918947022, 'pred': 0.8434118008866937}, 'ADI': {'var': 0.13406955563512288, 'pred': 0.839696577888173}, 'AMD': {'var': 1.1147789335564564, 'pred': 0.35134244215579485}, 'AMZN': {'var': 0.25043960405807747, 'pred': 0.9380493131932702}, 'ASML': {'var': 0.7335281308348242, 'pred': 0.5430935770159888}, 'BA': {'var': 1.4100297516898106, 'pred': 0.6372686875678951}, 'BAC': {'var': 0.22872340056653084, 'pred': 0.8796171548962737}, 'BLK': {'var': 0.3346226226146284, 'pred': 0.7957661938420028}, 'BRK-B': {'var': 0.16866494234847396, 'pred': 0.8784445451746732}, 'COST': {'var': 0.16209408887171825, 'pred': 0.7189997227141498}, 'CVS': {'var': 0.2262202304248747, 'pred': 0.8395205405049446}, 'CVX': {'var': 0.7213339597771042, 'pred': 0.8020848511285763}, 'F': {'var': 0.6148050279405556, 'pred': 0.9366230764246573}, 'GE': {'var': 0.23704569973051548, 'pred': 1.0368813824881213}, 'GS': {'var': 1.09853611906108


