In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
from torch import optim
from torch import mps
from torch.nn.functional import mse_loss
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer, minmax_scale, scale, minmax_scale, robust_scale, label_binarize
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from tqdm.auto import tqdm
# from tqdm import tqdm
import os
from ranger21 import Ranger21
from pytorch_ranger import Ranger
#import hdbscan

import pytorch_lightning as pl

from helpers.cross_sectorial import CS_DATAMODULE_2D, CS_DATAMODULE_1D, _format_tensors_cs_1D, _format_tensors_cs_2D
from models.cross_sectorial import P_MH_CNN_2D_LSTM, P_CNN_2D_LSTM, MH_CNN_2D_LSTM, Vanilla_LSTM, CNN_1D_LSTM

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
def inference(model_dir, tickers, batch_size, mode="test"):
    """Run a trained ``P_CNN_2D_LSTM`` checkpoint on one data split and collect the predictions.

    Parameters
    ----------
    model_dir : str
        Path to the Lightning checkpoint to load (it is passed to ``torch.load`` and
        ``load_from_checkpoint``, so it is a checkpoint file path).
    tickers : list of str
        Tickers forwarded to ``CS_DATAMODULE_2D`` and used as the ``ticker`` column
        of the result.
    batch_size : int
        Batch size forwarded to ``CS_DATAMODULE_2D``.
    mode : {"test", "val"}, default "test"
        Which split of the data module to run the model on.

    Returns
    -------
    pandas.DataFrame
        Columns ``ticker``, ``preds`` and ``trues``.
        NOTE(review): this assumes the chosen split holds exactly one sample per
        ticker, in the same order as ``tickers`` -- confirm against the data module.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``"test"`` nor ``"val"``.
    """
    # Fail fast: previously any value other than "test" (e.g. a typo such as
    # "Test") silently selected the validation split.
    if mode not in ("test", "val"):
        raise ValueError(f"mode must be 'test' or 'val', got {mode!r}")

    if torch.cuda.is_available():
        DEVICE = "cuda"
        torch.set_float32_matmul_precision("medium")
    elif torch.backends.mps.is_available():
        DEVICE = "mps"
    else:
        DEVICE = "cpu"

    # The raw checkpoint is only read for its stored hyper-parameters (lookback).
    ckpt = torch.load(model_dir, map_location=DEVICE)
    params = ckpt["hyper_parameters"]

    data = CS_DATAMODULE_2D(
        batch_size=batch_size,
        lookback=params["lookback"],
        pred_horizon=1,
        multistep=False,
        data_type="monthly",
        train_workers=0,
        overwrite_cache=True,
        pred_target="price",
        scaling_fn=robust_scale,
        cluster=0,
        goal="regression",
        tickers=tickers,
    )
    data.prepare_data()
    data.setup()

    # map_location lets a checkpoint saved on another device (e.g. CUDA) be
    # restored on whatever device is available here.
    model = P_CNN_2D_LSTM.load_from_checkpoint(model_dir, map_location=DEVICE, optimizer=Ranger)
    model.to(DEVICE)
    model.eval()

    with torch.inference_mode():
        if mode == "test":
            inputs, targets = data.X_test_tensor, data.y_test_tensor
        else:
            inputs, targets = data.X_val_tensor, data.y_val_tensor

        y_pred = model(inputs.to(DEVICE))

        pred = y_pred.cpu().detach().numpy()
        true = targets.cpu().detach().numpy()

    # reshape(-1) instead of squeeze(): a single-sample split stays 1-D rather
    # than collapsing to a 0-d array.
    return pd.DataFrame({"ticker": tickers, "preds": pred.reshape(-1), "trues": true.reshape(-1)})

In [3]:
# Load the cluster-0 ticker list: the file is split on newlines, one entry per line.
with open("./DATA/Tickers/month_tickers_clean_cluster0.txt", "r") as ticker_file:
    ticker_text = ticker_file.read()
tickers = ticker_text.strip().split("\n")

In [4]:
# Root folder of the hyper-parameter-search logs. The default is the original
# hard-coded location; set the THESIS_LOG_ROOT environment variable to run the
# notebook on another machine without editing every path below.
LOG_ROOT = os.environ.get("THESIS_LOG_ROOT", "D:/Development/Master-Thesis/logs")

# Checkpoint used for each third of the ticker list.
BATCH_1 = f"{LOG_ROOT}/hp_opt_cs_price_monthly_1mo_cluster0batch1/trial_244/version_0/checkpoints/epoch=132-step=1197.ckpt" # batch size 32 first 136 tickers
BATCH_2 = f"{LOG_ROOT}/hp_opt_cs_price_monthly_1mo_cluster0batch2/trial_114/version_0/checkpoints/epoch=691-step=11072.ckpt" # batch size 16 second 136 tickers
BATCH_3 = f"{LOG_ROOT}/hp_opt_cs_price_monthly_1mo_cluster0batch3/trial_493/version_0/checkpoints/epoch=300-step=4816.ckpt" # batch size 16 last 135 tickers

In [5]:
# Validation-split predictions: each checkpoint is run on its own slice of the
# ticker list with the batch size noted next to the checkpoint constants.
b1val, b2val, b3val = (
    inference(checkpoint, tickers[ticker_slice], batch_size=size, mode="val")
    for checkpoint, ticker_slice, size in (
        (BATCH_1, slice(None, 136), 32),
        (BATCH_2, slice(136, 272), 16),
        (BATCH_3, slice(272, None), 16),
    )
)
val = pd.concat([b1val, b2val, b3val]).reset_index(drop=True)

Preparing Tensors:   0%|          | 0/136 [00:00<?, ?it/s]

Preparing Tensors:   0%|          | 0/136 [00:00<?, ?it/s]

Preparing Tensors:   0%|          | 0/135 [00:00<?, ?it/s]

In [6]:
# Validation metrics per batch and for all batches combined.
# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
for split_label, frame in (("Batch 1", b1val), ("Batch 2", b2val), ("Batch 3", b3val), ("Total", val)):
    mse = mean_squared_error(frame.trues, frame.preds)
    print(f"{split_label} Val MAPE: {mean_absolute_percentage_error(frame.trues, frame.preds):.2%}")
    print(f"{split_label} Val MAE: {mean_absolute_error(frame.trues, frame.preds):.2f}")
    print(f"{split_label} Val MSE: {mse:.2f}")
    print(f"{split_label} Val RMSE: {np.sqrt(mse):.2f}")
    if split_label != "Total":
        # Separator between groups; none after the final "Total" group.
        print("---------------------------")

Batch 1 Val MAPE: 15.11%
Batch 1 Val MAE: 4.37
Batch 1 Val MSE: 40.64
Batch 1 Val RMSE: 6.37
---------------------------
Batch 2 Val MAPE: 31.35%
Batch 2 Val MAE: 2.37
Batch 2 Val MSE: 32.67
Batch 2 Val RMSE: 5.72
---------------------------
Batch 3 Val MAPE: 112.70%
Batch 3 Val MAE: 6.29
Batch 3 Val MSE: 136.50
Batch 3 Val RMSE: 11.68
---------------------------
Total Val MAPE: 52.91%
Total Val MAE: 4.34
Total Val MSE: 69.77
Total Val RMSE: 8.35


In [7]:
# Test-split predictions: same checkpoints, ticker slices and batch sizes as the
# validation run, evaluated with mode="test".
b1test, b2test, b3test = (
    inference(checkpoint, tickers[ticker_slice], batch_size=size, mode="test")
    for checkpoint, ticker_slice, size in (
        (BATCH_1, slice(None, 136), 32),
        (BATCH_2, slice(136, 272), 16),
        (BATCH_3, slice(272, None), 16),
    )
)
test = pd.concat([b1test, b2test, b3test]).reset_index(drop=True)

Preparing Tensors:   0%|          | 0/136 [00:00<?, ?it/s]

Preparing Tensors:   0%|          | 0/136 [00:00<?, ?it/s]

Preparing Tensors:   0%|          | 0/135 [00:00<?, ?it/s]

In [8]:
# Test metrics per batch and for all batches combined.
# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
for split_label, frame in (("Batch 1", b1test), ("Batch 2", b2test), ("Batch 3", b3test), ("Total", test)):
    mse = mean_squared_error(frame.trues, frame.preds)
    print(f"{split_label} Test MAPE: {mean_absolute_percentage_error(frame.trues, frame.preds):.2%}")
    print(f"{split_label} Test MAE: {mean_absolute_error(frame.trues, frame.preds):.2f}")
    print(f"{split_label} Test MSE: {mse:.2f}")
    print(f"{split_label} Test RMSE: {np.sqrt(mse):.2f}")
    if split_label != "Total":
        # Separator between groups; none after the final "Total" group.
        print("---------------------------")

Batch 1 Test MAPE: 16.21%
Batch 1 Test MAE: 4.79
Batch 1 Test MSE: 57.83
Batch 1 Test RMSE: 7.60
---------------------------
Batch 2 Test MAPE: 37.90%
Batch 2 Test MAE: 2.49
Batch 2 Test MSE: 31.52
Batch 2 Test RMSE: 5.61
---------------------------
Batch 3 Test MAPE: 121.83%
Batch 3 Test MAE: 6.27
Batch 3 Test MSE: 147.18
Batch 3 Test RMSE: 12.13
---------------------------
Total Test MAPE: 58.49%
Total Test MAE: 4.51
Total Test MSE: 78.68
Total Test RMSE: 8.87


In [9]:
def prep_dfs(val, test):
    """Add realised and predicted percentage returns to the prediction frames.

    Prices are read from ``month_data_fin_tec.parquet``, using the column
    ``"<ticker>_CP"`` for each ticker (presumably the closing price -- verify
    against the data pipeline).

    Parameters
    ----------
    val, test : pandas.DataFrame
        Frames with ``ticker``, ``preds`` and ``trues`` columns. Neither input
        is modified; copies are returned.

    Returns
    -------
    (val_df, test_df) : tuple of pandas.DataFrame
        ``val_df`` gains ``act_return`` (percent change at row -2 of the price
        column), ``pred_return`` (percent difference of ``preds`` against the
        price at row -3) and ``mape`` (per-row absolute percentage error).
        ``test_df`` gains ``act_return`` (row -1) and ``pred_return`` (against
        row -2); no ``mape`` column is added to it.
    """
    prices = pd.read_parquet("./DATA/Monthly/Processed/month_data_fin_tec.parquet")

    def price_series(ticker):
        # Price column of one ticker.
        return prices[f"{ticker}_CP"]

    def realised_return(ticker, position):
        # Percent change of the price at the given row position.
        return price_series(ticker).pct_change().iloc[position]*100

    def forecast_return(row, base_position):
        # Percent difference between the predicted price and the base-row price.
        base_price = price_series(row['ticker']).iloc[base_position]
        return ((row["preds"]) - (base_price)) / (base_price)*100

    def row_mape(row):
        # MAPE of a single (true, predicted) pair.
        return mean_absolute_percentage_error(np.array([row["trues"]]), np.array([row["preds"]]))

    val_df = val.copy()
    test_df = test.copy()

    val_df["act_return"] = val_df["ticker"].apply(lambda ticker: realised_return(ticker, -2))
    val_df["pred_return"] = val_df.apply(lambda row: forecast_return(row, -3), axis=1)
    val_df["mape"] = val_df.apply(row_mape, axis=1)

    test_df["act_return"] = test_df["ticker"].apply(lambda ticker: realised_return(ticker, -1))
    test_df["pred_return"] = test_df.apply(lambda row: forecast_return(row, -2), axis=1)

    return val_df, test_df

In [10]:
# Enrich the combined frames and each per-batch pair with the return columns.
# The input pairs are captured before any name is rebound, and the calls run in
# the order listed.
(val, test), (b1val, b1test), (b2val, b2test), (b3val, b3test) = (
    prep_dfs(val_frame, test_frame)
    for val_frame, test_frame in (
        (val, test),
        (b1val, b1test),
        (b2val, b2test),
        (b3val, b3test),
    )
)

In [11]:
# Display the combined validation frame returned by prep_dfs.
val

Unnamed: 0,ticker,preds,trues,act_return,pred_return,mape
0,SLF,65.876801,66.459999,5.258156,4.334496,0.008775
1,ENGH,38.527027,37.290001,-2.509804,0.724254,0.033173
2,HR-U,9.471850,11.750000,-6.746032,-24.826584,0.193885
3,IVN,12.057223,11.750000,-3.767404,-1.251242,0.026147
4,WFG,112.881378,98.000000,1.659751,17.096865,0.151851
...,...,...,...,...,...,...
402,CFX,10.372426,1.980000,-33.333333,249.239934,4.238599
403,FN,25.925940,37.810001,-0.813221,-31.988616,0.314310
404,BPF-U,16.246813,15.610000,3.789894,8.024021,0.040795
405,ADEN,17.984587,26.020000,0.930954,-30.238221,0.308817


In [12]:
# Display the combined test frame returned by prep_dfs.
test

Unnamed: 0,ticker,preds,trues,act_return,pred_return
0,SLF,64.679565,65.750000,-1.068312,-2.678957
1,ENGH,39.712414,37.689999,1.072674,6.496149
2,HR-U,9.398281,10.130000,-13.787234,-20.014629
3,IVN,11.495772,9.940000,-15.404255,-2.163639
4,WFG,108.841515,91.610001,-6.520408,11.062770
...,...,...,...,...,...
402,CFX,10.372337,1.900000,-4.040404,423.855421
403,FN,25.924307,38.349998,1.428194,-31.435316
404,BPF-U,16.246832,16.309999,4.484305,4.079641
405,ADEN,17.982759,28.090000,7.955419,-30.888703


In [32]:
# Validation portfolio: batch-1 tickers whose validation MAPE is below 0.05 and
# whose predicted return is positive, keeping the 10 largest predicted returns.
os.makedirs("./results/CS", exist_ok=True)  # to_csv does not create missing directories
val_portfolio = (
    b1val.loc[(b1val["mape"] < 0.05) & (b1val["pred_return"] > 0)]
    .sort_values(by="pred_return", ascending=False)
    .head(10)
)
val_portfolio.to_csv("./results/CS/val_portfolio.csv", index=False)
val_portfolio

Unnamed: 0,ticker,preds,trues,act_return,pred_return,mape
15,RCH,42.012596,40.740002,9.105517,12.513648,0.031237
60,SEA,19.60544,18.99,8.825215,12.352092,0.032409
129,CPX,45.557152,44.16,6.051873,9.407185,0.031638
74,TCL/A,15.234717,14.59,3.991447,8.586724,0.044189
71,CRR-U,16.165813,15.42,1.181102,6.074891,0.048367
85,GRT-U,88.18782,84.260002,0.681085,5.374382,0.046615
0,SLF,65.876801,66.459999,5.258156,4.334496,0.008775
105,CAR-U,49.431515,49.630001,4.682556,4.263899,0.003999
113,EMA,57.835129,57.650002,3.836455,4.169901,0.003211
92,FM,32.333527,32.919998,5.954297,4.066709,0.017815


In [33]:
# Unweighted mean of the act_return column over the selected validation portfolio.
val_portfolio["act_return"].mean()

4.956770416969258

In [34]:
# Test portfolio: rows of b1test selected with a mask built from b1val
# (validation MAPE below 0.05 and positive validation predicted return), then
# ranked by the test-period predicted return, keeping the top 10.
# The boolean mask is matched to b1test by index label, so both frames must
# share the same index.
# NOTE(review): the `pred_return > 0` screen reads b1val, not b1test -- confirm
# this is intended and not a copy-paste from the validation cell.
os.makedirs("./results/CS", exist_ok=True)  # to_csv does not create missing directories
test_portfolio = (
    b1test.loc[(b1val["mape"] < 0.05) & (b1val["pred_return"] > 0)]
    .sort_values(by="pred_return", ascending=False)
    .head(10)
)
test_portfolio.to_csv("./results/CS/test_portfolio.csv", index=False)
test_portfolio

Unnamed: 0,ticker,preds,trues,act_return,pred_return
1,ENGH,39.712414,37.689999,1.072674,6.496149
55,SSL,8.167051,7.27,-6.675225,4.840197
60,SEA,19.714163,18.42,-3.00158,3.81339
74,TCL/A,15.098315,14.15,-3.015764,3.483998
71,CRR-U,15.850718,14.3,-7.263294,2.79324
85,GRT-U,86.462196,79.660004,-5.459293,2.613573
123,PBH,103.159851,100.0,-1.681251,1.425475
41,SII,49.349018,46.0,-5.602298,1.270302
86,TIH,110.348991,105.0,-4.092072,0.793744
15,RCH,40.974804,42.540001,4.418262,0.576347


In [35]:
# Unweighted mean of the act_return column over the selected test portfolio.
test_portfolio["act_return"].mean()

-3.1299840551571902