In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
from torch import optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer, minmax_scale, power_transform, scale, minmax_scale
from tqdm.auto import tqdm

import lightning.pytorch as pl

from helpers.iterative import *

# Set one global seed (42) through Lightning so the run is repeatable across numpy, torch and
# scikit-learn. As the cell's last expression the call also displays the seed it applied.
pl.seed_everything(42)
# Manual per-library seeding, left disabled for reference (presumably redundant with the call above):
# torch.manual_seed(42)
# torch.mps.manual_seed(42)
# torch.backends.mps.deterministic = True
# torch.cuda.manual_seed(42)
# torch.backends.cudnn.deterministic = True
# np.random.seed(42)

Global seed set to 42


42

## Work with sample of 10 companies for architecture design

## Preparing data as tensors

In [3]:
# Load the processed monthly inputs; all paths are relative to the project root.
# `data` holds the company-level panel and `macro` the macro-indicator panel.
data, macro = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "./DATA/Monthly/Processed/month_data_fin_tec.parquet",
        "./DATA/Monthly/Processed/month_data_macro_USCA.parquet",
    )
)

# One ticker per line; surrounding whitespace is stripped before splitting.
with open("./DATA/Tickers/month_tickers_clean.txt", "r") as f:
    raw_tickers = f.read()
tickers = raw_tickers.strip().split("\n")

In [4]:
# Summarise the company panel: index range, number of columns, dtypes and memory footprint
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 281 entries, 2000-01-31 to 2023-05-31
Columns: 41309 entries, SLF_CR to DXT_others_cr
dtypes: float64(40900), int64(409)
memory usage: 88.6 MB


In [7]:
# Display the macro-indicator frame (rows are month-end dates, columns are the individual indicators)
macro

Unnamed: 0,IVEY Index,CAEICAIR Index,CACAPUTL Index,CACOUSCO Index,GCAN10YR Index,GCAN2YR Index,OEOTKLAF Index,RRCACONT Index,SPTSX Index,MXWO Index,...,CAIPYOY Index,COSYNFRM Index,IMP1YOY% Index,CAWCWGCY Index,CDGGBE10 Index,CL1 Comdty,CRB CMDT Index,EHSLMP%Y Index,OUSTUS Index,EUCBCI Index
2000-01-31,0.0,255390,86.1,108.2,6.538,6.273,101.2716,0.0,8481.11,1338.25,...,3.74,0.9,7.1,3.1,0.000,27.64,225.03,2.32,86.24,90.6
2000-02-29,0.0,150690,86.1,108.3,6.126,5.984,101.2993,0.0,9128.99,1340.58,...,5.30,0.9,9.3,3.1,0.000,30.43,220.68,4.49,86.45,90.9
2000-03-31,0.0,144590,85.7,108.4,5.922,5.936,101.2977,0.0,9462.39,1431.94,...,4.97,4.4,9.2,3.2,0.000,26.90,228.01,3.59,86.87,91.1
2000-04-30,0.0,130150,85.7,108.5,6.170,6.139,101.2694,0.0,9347.61,1370.11,...,4.97,4.4,6.6,3.2,0.000,25.74,227.37,4.10,87.21,91.4
2000-05-31,0.0,135980,85.7,108.5,6.031,6.195,101.2148,0.0,9251.99,1334.14,...,5.23,4.4,6.1,3.2,0.000,29.01,234.16,3.57,87.40,91.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-31,54.7,308620,81.8,141.2,2.916,3.752,98.6910,70040.0,20767.38,2785.00,...,4.78,4.9,0.9,3.4,1.802,78.87,557.02,1.30,136.77,107.7
2023-02-28,50.8,158610,81.8,141.2,3.329,4.205,98.6910,66990.0,20221.19,2714.57,...,1.36,4.9,-1.1,3.4,2.073,77.05,548.53,-0.51,137.01,108.0
2023-03-31,65.2,161720,81.9,141.2,2.897,3.737,98.6910,64956.0,20099.89,2791.44,...,-2.09,3.8,-4.7,2.8,1.807,75.67,550.63,-1.53,137.41,107.8
2023-04-30,55.6,150170,81.9,141.2,2.841,3.656,98.6910,65545.0,20636.54,2835.93,...,-3.75,3.8,-4.9,2.8,1.749,76.78,547.45,-2.86,137.88,107.9


In [6]:
# Compact summary of the macro frame; verbose=False skips the per-column listing
macro.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 281 entries, 2000-01-31 to 2023-05-31
Columns: 51 entries, IVEY Index to EUCBCI Index
dtypes: float64(41), int64(10)
memory usage: 114.2 KB


In [None]:
# Sanity check on the company panel: total count of missing entries and of infinite entries
n_missing = data.isna().sum().sum()
n_infinite = np.isinf(data).sum().sum()
n_missing, n_infinite

## Single company approach

In [None]:
# Build the train/validation/test arrays for a single company (the ticker at position 10).
# NOTE(review): format_tensors_it is not defined in this notebook — presumably it comes from the
# `helpers.iterative` star import; confirm the meaning of each option against that module.
split_options = dict(
    lookback=6,
    pred_horizon=1,
    multistep=False,
    multicolinearity_threshold=None,
    debug=False,
    start_train_at=None,
)
X_train, X_val, X_test, y_train, y_val, y_test = format_tensors_it(data, macro, tickers[10], **split_options)

In [None]:
# Shapes of every split, features first and then targets, each in train/val/test order
tuple(split.shape for split in (X_train, X_val, X_test, y_train, y_val, y_test))

In [None]:
# (min, max) of the features for train, val and test, flattened into a single tuple
tuple(bound for split in (X_train, X_val, X_test) for bound in (split.min(), split.max()))

In [None]:
# (min, max) of the targets for train, val and test, flattened into a single tuple
tuple(bound for split in (y_train, y_val, y_test) for bound in (split.min(), split.max()))

In [None]:
# Feature sanity flags: three NaN flags followed by three inf flags, each in train/val/test order
feature_splits = (X_train, X_val, X_test)
(*(np.isnan(split).any() for split in feature_splits), *(np.isinf(split).any() for split in feature_splits))

In [None]:
# Target sanity flags: three NaN flags followed by three inf flags, each in train/val/test order.
# The validation split is included in the inf check so that it matches the feature check.
np.isnan(y_train).any(), np.isnan(y_val).any(), np.isnan(y_test).any(), np.isinf(y_train).any(), np.isinf(y_val).any(), np.isinf(y_test).any()

## Simple LSTM

In [None]:
# Pick the best available backend: CUDA first, then Apple's MPS, and CPU as the final fallback.
# On machines without CUDA this resolves exactly as before (MPS if present, otherwise CPU).
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
device

In [None]:
# Convert every numpy split into a float32 torch tensor (features first, then targets)
(
    X_train_tensor,
    X_val_tensor,
    X_test_tensor,
    y_train_tensor,
    y_val_tensor,
    y_test_tensor,
) = (
    torch.from_numpy(split).float()
    for split in (X_train, X_val, X_test, y_train, y_val, y_test)
)

In [None]:
# Shapes after the tensor conversion, features first and then targets, each in train/val/test order
tuple(
    tensor.shape
    for tensor in (X_train_tensor, X_val_tensor, X_test_tensor, y_train_tensor, y_val_tensor, y_test_tensor)
)

In [None]:
# Pair each feature tensor with its target tensor so a DataLoader can yield (X, y) samples
train_dataset, val_dataset, test_dataset = (
    TensorDataset(features, targets)
    for features, targets in (
        (X_train_tensor, y_train_tensor),
        (X_val_tensor, y_val_tensor),
        (X_test_tensor, y_test_tensor),
    )
)

In [None]:
# Training hyper-parameters
LEARNING_RATE = 1e-4  # 1e-4 is the usual default
LEARING_RATE = LEARNING_RATE  # misspelled legacy name, kept because other cells still reference it
EPOCHS = 300  # upper bound on the number of training epochs
BATCH_SIZE = 32  # Small batch size since we are using a small dataset

In [None]:
# DataLoaders: only the training split is shuffled; validation and test keep their original order
shared_loader_options = {"batch_size": BATCH_SIZE}
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **shared_loader_options)

In [None]:
# Peek at a single mini-batch to confirm the shapes the model will receive.
# The batch index was never used, so the loader is iterated directly.
for X, y in train_loader:
    print(X.shape, y.shape)
    break

In [None]:
from typing import Any
from torch import nn
import lightning.pytorch as pl
from pytorch_ranger import Ranger
criterion = nn.MSELoss()


class LSTM(pl.LightningModule):
    """Stacked LSTM with a linear head, trained on MSE with the Ranger optimizer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the hidden state of every LSTM layer.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per input sequence.
        dropout: Dropout probability applied between stacked LSTM layers.
            It is forwarded to ``nn.LSTM`` only when ``num_layers > 1``,
            because ``nn.LSTM`` has no layer boundary to apply it to otherwise.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout):
        super().__init__()

        # The optimizer is stepped by hand in training_step, so Lightning's automatic loop is disabled.
        self.automatic_optimization = False
        # Stores the constructor arguments in the checkpoint so load_from_checkpoint can rebuild the model.
        self.save_hyperparameters()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Honour the requested dropout instead of silently forcing it to 0.
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
            bidirectional=False,
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, output_size),
        )

    def forward(self, x):
        """Map a batch-first sequence tensor to one prediction per sequence.

        Only the LSTM output at the last time step is passed to the linear head.
        """
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

    def _mse_loss(self, batch):
        """Return the MSE between the prediction for ``batch`` and its target."""
        X, y = batch
        # Give the prediction exactly the target's shape. Comparing (N, 1) against (N,) would
        # broadcast to (N, N) inside mse_loss and silently produce a wrong loss; a genuine size
        # mismatch now raises instead.
        y_hat = self(X).reshape(y.shape)
        return nn.functional.mse_loss(y_hat, y)

    def training_step(self, batch, batch_idx):
        """Run one manual optimisation step and log ``train_loss``."""
        loss = self._mse_loss(batch)
        self.log("train_loss", loss, prog_bar=True)

        # Ranger requires manual backward pass since it is designed/executed differently to base torch optimizers
        optimizer = self.optimizers()
        optimizer.zero_grad()
        self.manual_backward(loss)
        optimizer.step()

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log ``val_loss`` with the same shape handling as training."""
        loss = self._mse_loss(batch)
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Create the Ranger optimizer using the notebook-level ``LEARING_RATE`` constant."""
        optimizer = Ranger(self.parameters(), lr=LEARING_RATE)
        return optimizer


In [None]:
# Target shape, shown as a reference for choosing the model's output_size
y_train_tensor.shape

In [None]:
# Model dimensions: the input width is the size of the last axis of X_train_tensor and the
# hidden state is given the same width.
input_size = hidden_size = X_train_tensor.shape[2]
num_layers = 2
# 1 if multi_step set to false, 2 for true
output_size = 1
# Dropout is switched off here (the trailing ".5" in the original note presumably marks a value tried earlier)
dropout = 0
(input_size, hidden_size, num_layers, output_size)

In [None]:
# Instantiate the network from the dimensions chosen above
model_config = {
    "input_size": input_size,
    "hidden_size": hidden_size,
    "num_layers": num_layers,
    "output_size": output_size,
    "dropout": dropout,
}
model = LSTM(**model_config)

In [None]:
# Pull a single (features, targets) batch from the training loader for manual inspection
first_batch = next(iter(train_loader))
x_in, y_in = first_batch

In [None]:
# Shapes of the sampled feature batch and target batch
x_in.shape, y_in.shape

In [None]:
from torchinfo import summary

# Describe the model for one full batch: (batch, axis-1 size, axis-2 size) taken from the training tensor
sequence_dims = tuple(X_train_tensor.shape[1:3])
summary(
    model,
    input_size=(BATCH_SIZE, *sequence_dims),
    col_names=["input_size", "output_size", "num_params", "trainable"],
)

In [None]:
# Stop once val_loss has failed to improve for 20 validation checks, and keep only the best checkpoint.
early_stopping = pl.callbacks.EarlyStopping(monitor="val_loss", patience=20, mode="min")
checkpoint_callback = pl.callbacks.ModelCheckpoint(save_top_k=1, monitor="val_loss", mode="min")

# accelerator="auto" uses a GPU (CUDA or MPS) when one exists and falls back to CPU otherwise;
# a hard-coded "gpu" makes Trainer construction fail on CPU-only machines.
trainer = pl.Trainer(
    accelerator="auto",
    max_epochs=EPOCHS,
    log_every_n_steps=1,
    callbacks=[early_stopping, checkpoint_callback],
    enable_checkpointing=True,
    enable_progress_bar=True,
)
trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=val_loader)

In [None]:
%load_ext tensorboard
%tensorboard --logdir lightning_logs/

In [None]:
# Report which checkpoint achieved the lowest monitored val_loss and what that score was
print(f"Best model path: {checkpoint_callback.best_model_path}")
print(f"Best model score: {checkpoint_callback.best_model_score}")

# Rebuild the model from that checkpoint; the constructor arguments come from the
# hyper-parameters the class saves via save_hyperparameters()
best_model = LSTM.load_from_checkpoint(checkpoint_path=checkpoint_callback.best_model_path)

In [None]:
# Predict on the validation set with the restored model.
# The input is moved to the device the model actually lives on rather than to the notebook-level
# `device`, so the two cannot end up on different devices after load_from_checkpoint.
best_model.eval()
with torch.inference_mode():
    val_inputs = X_val_tensor.to(best_model.device)
    # Tensors produced under inference_mode carry no autograd graph, so no detach() is needed.
    y_pred = best_model(val_inputs).cpu().numpy()

In [None]:
# Show the validation predictions next to the validation targets for a visual comparison
y_pred, y_val

In [None]:
# R2 score and MAPE
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
print(f"MAPE: {mean_absolute_percentage_error(y_val_tensor, y_pred):.4%}")
print(f"MAE: {mean_absolute_error(y_val_tensor, y_pred):.4f}")