In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
from torch import optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer, minmax_scale, power_transform, scale, minmax_scale
from tqdm.auto import tqdm

import lightning.pytorch as pl

from helpers.iterative import *

# Seed the random number generators once, before any data or model code runs,
# so that repeated runs of the notebook are comparable.
# NOTE(review): the original note says this covers numpy, torch and scikit-learn;
# confirm against the Lightning `seed_everything` documentation.
pl.seed_everything(42)
# Manual per-backend seeding / determinism switches, currently disabled:
# torch.manual_seed(42)
# torch.mps.manual_seed(42)
# torch.backends.mps.deterministic = True
# torch.cuda.manual_seed(42)
# torch.backends.cudnn.deterministic = True
# np.random.seed(42)

Global seed set to 42


42

## Work with a sample of 10 companies for architecture design

## Preparing data as tensors

In [2]:
# Load the processed daily frames with pd.read_parquet.
# NOTE(review): the paths are relative, so the kernel's working directory is
# presumably expected to be the project root — confirm.
data = pd.read_parquet("./DATA/Daily/Processed/day_data_fin_tec.parquet")
macro = pd.read_parquet("./DATA/Daily/Processed/day_data_macro_USCA.parquet")

# One ticker per line. Blank lines and surrounding whitespace are dropped so a
# stray empty line cannot end up as an empty-string ticker, and the encoding is
# pinned so the result does not depend on the platform default.
with open("./DATA/Tickers/day_tickers_clean.txt", "r", encoding="utf-8") as f:
    tickers = [line.strip() for line in f if line.strip()]

In [3]:
# Print the index range, column count, dtype counts and memory footprint of `data`.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5858 entries, 2000-01-04 to 2023-05-31
Columns: 41309 entries, SLF_CR to DXT_others_cr
dtypes: float64(40900), int64(409)
memory usage: 1.8 GB


In [4]:
# Same summary for `macro`; verbose=False suppresses the per-column listing.
macro.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5858 entries, 2000-01-04 to 2023-05-31
Columns: 51 entries, IVEY Index to EUCBCI Index
dtypes: float64(41), int64(10)
memory usage: 2.3 MB


In [5]:
# Count missing and infinite entries across the whole `data` frame;
# both totals should be zero before tensors are built from it.
nan_total = data.isna().sum().sum()
inf_total = np.isinf(data).sum().sum()
nan_total, inf_total

(0, 0)

## Single company approach

In [6]:
# Build the train / validation / test arrays for a single ticker (index 10 of
# the cleaned ticker list) from the company and macro frames.
X_train, X_val, X_test, y_train, y_val, y_test = format_tensors_it(
    data,
    macro,
    tickers[10],
    lookback=21 * 36,
    pred_horizon=22,  # Since ultimately, we want to predict 22 days ahead (may 2023 has 22 trading days)
    multistep=False,
    multicolinearity_threshold=None,
    debug=False,
    start_train_at=None,
)

  0%|          | 0/2534 [00:00<?, ?it/s]

In [7]:
# Shapes in the order X_train, X_val, X_test, y_train, y_val, y_test.
tuple(split.shape for split in (X_train, X_val, X_test, y_train, y_val, y_test))

((2469, 756, 144), (1, 756, 144), (1, 756, 144), (2469, 1), (1,), (1,))

In [8]:
# (min, max) of the features for train, validation and test, flattened into one tuple.
tuple(bound for split in (X_train, X_val, X_test) for bound in (split.min(), split.max()))

(-25.98394310594216,
 21.566602034336963,
 -13.62578561152848,
 21.566602034336963,
 -13.62578561152848,
 21.566602034336963)

In [9]:
# (min, max) of the targets for train, validation and test, flattened into one tuple.
tuple(bound for split in (y_train, y_val, y_test) for bound in (split.min(), split.max()))

(0.8, 27.18, 6.31, 6.31, 5.89, 5.89)

In [10]:
# NaN flags for the three feature splits, followed by the infinity flags for the same splits.
tuple(
    check(split).any()
    for check in (np.isnan, np.isinf)
    for split in (X_train, X_val, X_test)
)

(False, False, False, False, False, False)

In [11]:
# NaN flags for the three target splits, followed by the infinity flags for the
# same three splits (the y_val infinity check was previously missing).
np.isnan(y_train).any(), np.isnan(y_val).any(), np.isnan(y_test).any(), np.isinf(y_train).any(), np.isinf(y_val).any(), np.isinf(y_test).any()

(False, False, False, False, False)

## Simple LSTM

In [12]:
# Pick the best available backend: CUDA first, then Apple-silicon MPS, then CPU.
# CUDA has to be considered here because the Trainer may run on it; otherwise
# inputs moved to `device` would land on a different device than the model.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
device

device(type='mps')

In [13]:
def _to_float_tensor(array):
    """Wrap a numpy array as a float32 torch tensor."""
    return torch.from_numpy(array).float()


# Features first, then targets, each in train / validation / test order.
X_train_tensor, X_val_tensor, X_test_tensor = map(_to_float_tensor, (X_train, X_val, X_test))
y_train_tensor, y_val_tensor, y_test_tensor = map(_to_float_tensor, (y_train, y_val, y_test))

In [14]:
# Tensor shapes in the order X_train, X_val, X_test, y_train, y_val, y_test.
tuple(
    tensor.shape
    for tensor in (
        X_train_tensor,
        X_val_tensor,
        X_test_tensor,
        y_train_tensor,
        y_val_tensor,
        y_test_tensor,
    )
)

(torch.Size([2469, 756, 144]),
 torch.Size([1, 756, 144]),
 torch.Size([1, 756, 144]),
 torch.Size([2469, 1]),
 torch.Size([1]),
 torch.Size([1]))

In [15]:
# Pair each feature tensor with its target tensor, one dataset per split.
train_dataset, val_dataset, test_dataset = (
    TensorDataset(features, targets)
    for features, targets in (
        (X_train_tensor, y_train_tensor),
        (X_val_tensor, y_val_tensor),
        (X_test_tensor, y_test_tensor),
    )
)

In [16]:
# Training hyper-parameters.
LEARNING_RATE = 1e-4  # NOTE(review): original note read "1e-4 ind standard" — presumably "is standard"; confirm
# Misspelled legacy name, kept as an alias because LSTM.configure_optimizers reads it.
LEARING_RATE = LEARNING_RATE
EPOCHS = 300
BATCH_SIZE = 32 # Small batch size since we are using a small dataset

In [17]:
# DataLoaders: only the training loader shuffles; validation and test keep their order.
eval_loader_options = {"batch_size": BATCH_SIZE, "shuffle": False}
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, **eval_loader_options)
test_loader = DataLoader(test_dataset, **eval_loader_options)

In [18]:
# Look at the shapes of the first training batch only; the loop exits after one
# iteration, so no index counter is needed.
for X, y in train_loader:
    print(X.shape, y.shape)
    break

torch.Size([32, 756, 144]) torch.Size([32, 1])


In [19]:
from typing import Any
from torch import nn
import lightning.pytorch as pl
from pytorch_ranger import Ranger
# NOTE(review): `criterion` is not referenced anywhere else in the visible notebook
# (the LSTM steps call nn.functional.mse_loss directly) — candidate for removal.
criterion = nn.MSELoss()


class LSTM(pl.LightningModule):
    """Stacked LSTM regressor that predicts from the last time step.

    The network is a batch-first, unidirectional ``nn.LSTM`` followed by one
    linear layer applied to the LSTM output at the final sequence position.
    Optimisation is performed manually (``automatic_optimization = False``)
    with the Ranger optimizer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the hidden state of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per sample.
        dropout: Dropout probability applied between stacked LSTM layers.
            Treated as 0 when ``num_layers == 1``, since there is no
            inter-layer connection for ``nn.LSTM`` to apply it to.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout):
        super().__init__()

        self.automatic_optimization = False
        self.save_hyperparameters()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Honour the `dropout` argument (it used to be accepted but replaced by a
        # hard-coded 0). nn.LSTM only applies dropout between layers, so it is
        # zeroed for a single-layer network to avoid a pointless warning.
        lstm_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=lstm_dropout,
            bidirectional=False,
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, output_size),
        )

    def forward(self, x):
        """Map a ``(batch, seq_len, input_size)`` tensor to ``(batch, output_size)``."""
        out, _ = self.lstm(x)
        # Only the output at the last time step feeds the regression head.
        out = self.fc(out[:, -1, :])
        return out

    def _mse_loss(self, batch):
        """Compute the MSE between predictions and targets for one batch.

        Targets are reshaped to the prediction's shape so that ``(batch,)`` and
        ``(batch, output_size)`` targets are handled identically and
        ``mse_loss`` can never silently broadcast mismatched shapes. An
        incompatible number of elements raises instead.
        """
        X, y = batch
        y_hat = self(X)
        return nn.functional.mse_loss(y_hat, y.reshape(y_hat.shape))

    def training_step(self, batch, batch_idx):
        """Run one manually optimised training step and return its loss."""
        loss = self._mse_loss(batch)
        self.log("train_loss", loss, prog_bar=True)

        # Ranger requires manual backward pass since it is designed/executed differently to base torch optimizers
        optimizer = self.optimizers()
        optimizer.zero_grad()
        self.manual_backward(loss)
        optimizer.step()

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._mse_loss(batch)
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Create the Ranger optimizer using the module-level ``LEARING_RATE``."""
        optimizer = Ranger(self.parameters(), lr=LEARING_RATE)
        return optimizer


In [20]:
# Shape of the training targets, for comparison with the model's output shape.
y_train_tensor.shape

torch.Size([2469, 1])

In [21]:
# Network dimensions: the input width is the feature axis of the training
# tensor, and the hidden state is twice as wide.
input_size = X_train_tensor.size(2)
hidden_size = 2 * input_size
num_layers = 2
# 1 if multi_step set to false, 2 for true
output_size = 1
dropout = 0
(input_size, hidden_size, num_layers, output_size)

(144, 288, 2, 1)

In [22]:
# Instantiate the network from the dimensions chosen in the previous cell.
model_options = {
    "input_size": input_size,
    "hidden_size": hidden_size,
    "num_layers": num_layers,
    "output_size": output_size,
    "dropout": dropout,
}
model = LSTM(**model_options)

In [23]:
# Get first iter of train loader using next
x_in, y_in = next(iter(train_loader))

In [24]:
# Shapes of the sampled feature and target batch.
x_in.shape, y_in.shape

(torch.Size([32, 756, 144]), torch.Size([32, 1]))

In [25]:
from torchinfo import summary

# Layer-by-layer summary for one batch: (batch, sequence length, features).
summary_input_shape = (BATCH_SIZE, *X_train_tensor.shape[1:])
summary(
    model,
    input_size=summary_input_shape,
    col_names=["input_size", "output_size", "num_params", "trainable"],
)

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Trainable
LSTM                                     [32, 756, 144]            [32, 1]                   --                        True
├─LSTM: 1-1                              [32, 756, 144]            [32, 756, 288]            1,165,824                 True
├─Sequential: 1-2                        [32, 288]                 [32, 1]                   --                        True
│    └─Linear: 2-1                       [32, 288]                 [32, 1]                   289                       True
Total params: 1,166,113
Trainable params: 1,166,113
Non-trainable params: 0
Total mult-adds (G): 28.20
Input size (MB): 13.93
Forward/backward pass size (MB): 55.74
Params size (MB): 4.66
Estimated Total Size (MB): 74.34

In [26]:
# Stop when val_loss has not improved for 3 validation checks, and keep only
# the checkpoint with the lowest val_loss.
early_stopping = pl.callbacks.EarlyStopping(monitor="val_loss", patience=3, mode="min")
checkpoint_callback = pl.callbacks.ModelCheckpoint(save_top_k=1, monitor="val_loss", mode="min")
trainer = pl.Trainer(
    # "auto" lets Lightning pick the available accelerator, so the notebook
    # also runs on machines without a GPU instead of failing at construction.
    accelerator="auto",
    max_epochs=EPOCHS,
    log_every_n_steps=1,
    callbacks=[early_stopping, checkpoint_callback],
    enable_checkpointing=True,
    enable_progress_bar=True,
)
trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=val_loader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name | Type       | Params
------------------------------------
0 | lstm | LSTM       | 1.2 M 
1 | fc   | Sequential | 289   
------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.664     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

	addcmul_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
	addcmul_(Tensor tensor1, Tensor tensor2, *, Number value) (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/python_arg_parser.cpp:1485.)
  exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [32]:
# %reload_ext loads the extension when it is not loaded yet and reloads it
# otherwise, so re-running this cell no longer prints the "already loaded" notice.
%reload_ext tensorboard
%tensorboard --logdir lightning_logs/

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 32663), started 0:03:27 ago. (Use '!kill 32663' to kill it.)

In [33]:
# Report the best checkpoint found by the callback, then restore the model from it.
best_checkpoint_path = checkpoint_callback.best_model_path
best_checkpoint_score = checkpoint_callback.best_model_score
print(f"Best model path: {best_checkpoint_path}")
print(f"Best model score: {best_checkpoint_score}")

best_model = LSTM.load_from_checkpoint(checkpoint_path=best_checkpoint_path)

Best model path: /Users/johnbergmann/Developer/Master-Thesis/lightning_logs/version_31/checkpoints/epoch=4-step=390.ckpt
Best model score: 0.02511233650147915


In [34]:
# Predict on the validation sample with the restored model.
best_model.eval()
with torch.inference_mode():
    # Move the input to whichever device the restored weights live on, rather
    # than to the global `device`, so model and input can never end up on
    # different devices.
    y_pred = best_model(X_val_tensor.to(best_model.device)).cpu().detach().numpy()

In [35]:
# Prediction next to the validation target for a side-by-side look.
y_pred, y_val

(array([[6.151531]], dtype=float32), array([6.31]))

In [36]:
# MAPE and MAE on the validation sample
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
# Compute each error measure once, then print it in its own format.
val_mape = mean_absolute_percentage_error(y_val_tensor, y_pred)
val_mae = mean_absolute_error(y_val_tensor, y_pred)
print(f"MAPE: {val_mape:.4%}")
print(f"MAE: {val_mae:.4f}")

MAPE: 2.5114%
MAE: 0.1585


# BAZINGA!