## AI Finance Project - Loda Enrico

In [106]:
#!pip install yfinance
!pip show yfinance

Name: yfinance
Version: 0.2.61
Summary: Download market data from Yahoo! Finance API
Home-page: https://github.com/ranaroussi/yfinance
Author: Ran Aroussi
Author-email: ran@aroussi.com
License: Apache
Location: /usr/local/lib/python3.11/dist-packages
Requires: beautifulsoup4, curl_cffi, frozendict, multitasking, numpy, pandas, peewee, platformdirs, protobuf, pytz, requests, websockets
Required-by: 


In [107]:
# import
import pandas as pd
import yfinance as yf
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler

We want to calculate the daily simple rate of return of our assets. To do so, we define:

---

**Simple rate of return**  
Given:  
- **P**: the price of the asset at the time of investment  
- **T**: maturity (number of periods)  
- **C**: the price of the asset at maturity **T**

The formula is:
$$
r = \frac{\frac{C}{P} - 1}{T}
$$

---

**Daily simple rate of return:**  
For daily returns, considering $P_t$ as the close price of the asset at time t, the formula is:
$$
r_t = \frac{P_t - P_{t-1}}{P_{t-1}} = \frac{P_t}{P_{t-1}} - 1
$$

In [108]:
df = yf.download(["SPY","NVDA"], start="2010-01-01", end="2025-05-17")
spy = df.xs('SPY', axis=1, level=1)
nvda = df.xs('NVDA', axis=1, level=1)

prices = df["Close"]
returns = (prices / prices.shift(1)) - 1
returns = returns.dropna()
SPY_r = returns['SPY']
NVDA_r = returns['NVDA']

[                       0%                       ][*********************100%***********************]  2 of 2 completed


In [109]:
df

Price,Close,Close,High,High,Low,Low,Open,Open,Volume,Volume
Ticker,NVDA,SPY,NVDA,SPY,NVDA,SPY,NVDA,SPY,NVDA,SPY
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2010-01-04,0.423884,85.768448,0.426864,85.813854,0.415172,84.391067,0.424342,85.041918,800204000,118944600
2010-01-05,0.430073,85.995499,0.434658,86.033341,0.422279,85.405193,0.422279,85.715485,728648000,111579900
2010-01-06,0.432824,86.056030,0.433741,86.267934,0.425718,85.844126,0.429844,85.912236,649168000,116074400
2010-01-07,0.424342,86.419304,0.432366,86.525256,0.421133,85.654932,0.430532,85.897108,547792000,131091100
2010-01-08,0.425259,86.706894,0.428239,86.744736,0.418382,86.018206,0.420903,86.192268,478168000,126402800
...,...,...,...,...,...,...,...,...,...,...
2025-05-12,123.000000,582.989990,123.000000,583.000000,120.279999,577.039978,121.970001,581.469971,225023300,78993600
2025-05-13,129.929993,586.840027,131.220001,589.080017,124.470001,582.840027,124.980003,583.409973,330430100,67947200
2025-05-14,135.339996,587.590027,135.440002,588.979980,131.679993,585.539978,133.199997,587.809998,281180800,66283500
2025-05-15,134.830002,590.460022,136.300003,590.969971,132.660004,585.099976,134.289993,585.559998,226632600,71268100


In [110]:
returns[:]

Ticker,NVDA,SPY
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-05,0.014603,0.002647
2010-01-06,0.006397,0.000704
2010-01-07,-0.019598,0.004221
2010-01-08,0.002161,0.003328
2010-01-11,-0.014016,0.001397
...,...,...
2025-05-12,0.054436,0.033047
2025-05-13,0.056341,0.006604
2025-05-14,0.041638,0.001278
2025-05-15,-0.003768,0.004884


#dataset for training
The features, for a total of 24, are the following:

*   $\mu_1$, $\mu_2$ of $r_1$, $r_2$ calculated on the last 10 lags
*   $[r_1^t,\dots ,r_1^{t-10}]$, $[r_2^t,\dots ,r_2^{t-10}]$

The target is the values of my covariance matrix:
* $\Sigma_{t+1}$

In [111]:
rNVDA_series = returns['NVDA']
rSPY_series = returns['SPY']
df = pd.DataFrame()
df['rNVDA_today'] = rNVDA_series
df['rSPY_today'] = rSPY_series

for i in range(1, 11):
    df[f'rNVDA_t-{i}'] = rNVDA_series.shift(i)
    df[f'rSPY_t-{i}'] = rSPY_series.shift(i)

df.head(11)

Unnamed: 0_level_0,rNVDA_today,rSPY_today,rNVDA_t-1,rSPY_t-1,rNVDA_t-2,rSPY_t-2,rNVDA_t-3,rSPY_t-3,rNVDA_t-4,rSPY_t-4,...,rNVDA_t-6,rSPY_t-6,rNVDA_t-7,rSPY_t-7,rNVDA_t-8,rSPY_t-8,rNVDA_t-9,rSPY_t-9,rNVDA_t-10,rSPY_t-10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-05,0.014603,0.002647,,,,,,,,,...,,,,,,,,,,
2010-01-06,0.006397,0.000704,0.014603,0.002647,,,,,,,...,,,,,,,,,,
2010-01-07,-0.019598,0.004221,0.006397,0.000704,0.014603,0.002647,,,,,...,,,,,,,,,,
2010-01-08,0.002161,0.003328,-0.019598,0.004221,0.006397,0.000704,0.014603,0.002647,,,...,,,,,,,,,,
2010-01-11,-0.014016,0.001397,0.002161,0.003328,-0.019598,0.004221,0.006397,0.000704,0.014603,0.002647,...,,,,,,,,,,
2010-01-12,-0.033898,-0.009326,-0.014016,0.001397,0.002161,0.003328,-0.019598,0.004221,0.006397,0.000704,...,,,,,,,,,,
2010-01-13,0.013582,0.008446,-0.033898,-0.009326,-0.014016,0.001397,0.002161,0.003328,-0.019598,0.004221,...,0.014603,0.002647,,,,,,,,
2010-01-14,-0.015634,0.002705,0.013582,0.008446,-0.033898,-0.009326,-0.014016,0.001397,0.002161,0.003328,...,0.006397,0.000704,0.014603,0.002647,,,,,,
2010-01-15,-0.029495,-0.011225,-0.015634,0.002705,0.013582,0.008446,-0.033898,-0.009326,-0.014016,0.001397,...,-0.019598,0.004221,0.006397,0.000704,0.014603,0.002647,,,,
2010-01-19,0.018703,0.012496,-0.029495,-0.011225,-0.015634,0.002705,0.013582,0.008446,-0.033898,-0.009326,...,0.002161,0.003328,-0.019598,0.004221,0.006397,0.000704,0.014603,0.002647,,


In [112]:
cols = [f'rNVDA_t-{i}' for i in range(1, 11)]
df['mean_NVDA'] = df[cols].mean(axis=1)
cols.append('rNVDA_today')
df['var_NVDA'] = df[cols].var(axis=1)

cols = [f'rSPY_t-{i}' for i in range(1, 11)]
df['mean_SPY'] = df[cols].mean(axis=1)
cols.append('rSPY_today')
df['var_SPY'] = df[cols].var(axis=1)
df = df.dropna()
df#["r2_t-10"]

Unnamed: 0_level_0,rNVDA_today,rSPY_today,rNVDA_t-1,rSPY_t-1,rNVDA_t-2,rSPY_t-2,rNVDA_t-3,rSPY_t-3,rNVDA_t-4,rSPY_t-4,...,rNVDA_t-8,rSPY_t-8,rNVDA_t-9,rSPY_t-9,rNVDA_t-10,rSPY_t-10,mean_NVDA,var_NVDA,mean_SPY,var_SPY
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-20,-0.004016,-0.010168,0.018703,0.012496,-0.029495,-0.011225,-0.015634,0.002705,0.013582,0.008446,...,-0.019598,0.004221,0.006397,0.000704,0.014603,0.002647,-0.005720,0.000331,0.001539,0.000059
2010-01-21,-0.017857,-0.019229,-0.004016,-0.010168,0.018703,0.012496,-0.029495,-0.011225,-0.015634,0.002705,...,0.002161,0.003328,-0.019598,0.004221,0.006397,0.000704,-0.007581,0.000296,0.000258,0.000093
2010-01-22,-0.034604,-0.022292,-0.017857,-0.019229,-0.004016,-0.010168,0.018703,0.012496,-0.029495,-0.011225,...,-0.014016,0.001397,0.002161,0.003328,-0.019598,0.004221,-0.010007,0.000327,-0.001736,0.000131
2010-01-25,0.017011,0.005128,-0.034604,-0.022292,-0.017857,-0.019229,-0.004016,-0.010168,0.018703,0.012496,...,-0.033898,-0.009326,-0.014016,0.001397,0.002161,0.003328,-0.011507,0.000395,-0.004387,0.000132
2010-01-26,-0.031661,-0.004190,0.017011,0.005128,-0.034604,-0.022292,-0.017857,-0.019229,-0.004016,-0.010168,...,0.013582,0.008446,-0.033898,-0.009326,-0.014016,0.001397,-0.010022,0.000424,-0.004207,0.000127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-12,0.054436,0.033047,-0.006134,-0.001274,0.002648,0.006968,0.031002,0.004206,-0.002460,-0.008358,...,-0.000917,0.000397,0.002667,0.006299,-0.020539,0.000381,0.005092,0.000471,0.002482,0.000127
2025-05-13,0.056341,0.006604,0.054436,0.033047,-0.006134,-0.001274,0.002648,0.006968,0.031002,0.004206,...,0.024697,0.007087,-0.000917,0.000397,0.002667,0.006299,0.012589,0.000545,0.005748,0.000124
2025-05-14,0.041638,0.001278,0.056341,0.006604,0.054436,0.033047,-0.006134,-0.001274,0.002648,0.006968,...,0.025894,0.014844,0.024697,0.007087,-0.000917,0.000397,0.017957,0.000575,0.005779,0.000126
2025-05-15,-0.003768,0.004884,0.041638,0.001278,0.056341,0.006604,0.054436,0.033047,-0.006134,-0.001274,...,-0.005939,-0.005734,0.025894,0.014844,0.024697,0.007087,0.022212,0.000588,0.005867,0.000124


In [113]:
def compute_covariance_row(row):
    r1 = [row[f'rNVDA_t-{i}'] for i in range(1, 11)]
    r1.append(row['rNVDA_today'])

    r2 = [row[f'rSPY_t-{i}'] for i in range(1, 11)]
    r2.append(row['rSPY_today'])

    return np.cov(r1, r2)[0, 1]

df['cov'] = df.apply(compute_covariance_row, axis=1)
df

Unnamed: 0_level_0,rNVDA_today,rSPY_today,rNVDA_t-1,rSPY_t-1,rNVDA_t-2,rSPY_t-2,rNVDA_t-3,rSPY_t-3,rNVDA_t-4,rSPY_t-4,...,rSPY_t-8,rNVDA_t-9,rSPY_t-9,rNVDA_t-10,rSPY_t-10,mean_NVDA,var_NVDA,mean_SPY,var_SPY,cov
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-20,-0.004016,-0.010168,0.018703,0.012496,-0.029495,-0.011225,-0.015634,0.002705,0.013582,0.008446,...,0.004221,0.006397,0.000704,0.014603,0.002647,-0.005720,0.000331,0.001539,0.000059,0.000097
2010-01-21,-0.017857,-0.019229,-0.004016,-0.010168,0.018703,0.012496,-0.029495,-0.011225,-0.015634,0.002705,...,0.003328,-0.019598,0.004221,0.006397,0.000704,-0.007581,0.000296,0.000258,0.000093,0.000111
2010-01-22,-0.034604,-0.022292,-0.017857,-0.019229,-0.004016,-0.010168,0.018703,0.012496,-0.029495,-0.011225,...,0.001397,0.002161,0.003328,-0.019598,0.004221,-0.010007,0.000327,-0.001736,0.000131,0.000153
2010-01-25,0.017011,0.005128,-0.034604,-0.022292,-0.017857,-0.019229,-0.004016,-0.010168,0.018703,0.012496,...,-0.009326,-0.014016,0.001397,0.002161,0.003328,-0.011507,0.000395,-0.004387,0.000132,0.000184
2010-01-26,-0.031661,-0.004190,0.017011,0.005128,-0.034604,-0.022292,-0.017857,-0.019229,-0.004016,-0.010168,...,0.008446,-0.033898,-0.009326,-0.014016,0.001397,-0.010022,0.000424,-0.004207,0.000127,0.000175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-12,0.054436,0.033047,-0.006134,-0.001274,0.002648,0.006968,0.031002,0.004206,-0.002460,-0.008358,...,0.000397,0.002667,0.006299,-0.020539,0.000381,0.005092,0.000471,0.002482,0.000127,0.000202
2025-05-13,0.056341,0.006604,0.054436,0.033047,-0.006134,-0.001274,0.002648,0.006968,0.031002,0.004206,...,0.007087,-0.000917,0.000397,0.002667,0.006299,0.012589,0.000545,0.005748,0.000124,0.000190
2025-05-14,0.041638,0.001278,0.056341,0.006604,0.054436,0.033047,-0.006134,-0.001274,0.002648,0.006968,...,0.014844,0.024697,0.007087,-0.000917,0.000397,0.017957,0.000575,0.005779,0.000126,0.000181
2025-05-15,-0.003768,0.004884,0.041638,0.001278,0.056341,0.006604,0.054436,0.033047,-0.006134,-0.001274,...,-0.005734,0.025894,0.014844,0.024697,0.007087,0.022212,0.000588,0.005867,0.000124,0.000172


In [114]:
df.drop(['rNVDA_today', 'rSPY_today'], axis=1, inplace=True)
df

Unnamed: 0_level_0,rNVDA_t-1,rSPY_t-1,rNVDA_t-2,rSPY_t-2,rNVDA_t-3,rSPY_t-3,rNVDA_t-4,rSPY_t-4,rNVDA_t-5,rSPY_t-5,...,rSPY_t-8,rNVDA_t-9,rSPY_t-9,rNVDA_t-10,rSPY_t-10,mean_NVDA,var_NVDA,mean_SPY,var_SPY,cov
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-20,0.018703,0.012496,-0.029495,-0.011225,-0.015634,0.002705,0.013582,0.008446,-0.033898,-0.009326,...,0.004221,0.006397,0.000704,0.014603,0.002647,-0.005720,0.000331,0.001539,0.000059,0.000097
2010-01-21,-0.004016,-0.010168,0.018703,0.012496,-0.029495,-0.011225,-0.015634,0.002705,0.013582,0.008446,...,0.003328,-0.019598,0.004221,0.006397,0.000704,-0.007581,0.000296,0.000258,0.000093,0.000111
2010-01-22,-0.017857,-0.019229,-0.004016,-0.010168,0.018703,0.012496,-0.029495,-0.011225,-0.015634,0.002705,...,0.001397,0.002161,0.003328,-0.019598,0.004221,-0.010007,0.000327,-0.001736,0.000131,0.000153
2010-01-25,-0.034604,-0.022292,-0.017857,-0.019229,-0.004016,-0.010168,0.018703,0.012496,-0.029495,-0.011225,...,-0.009326,-0.014016,0.001397,0.002161,0.003328,-0.011507,0.000395,-0.004387,0.000132,0.000184
2010-01-26,0.017011,0.005128,-0.034604,-0.022292,-0.017857,-0.019229,-0.004016,-0.010168,0.018703,0.012496,...,0.008446,-0.033898,-0.009326,-0.014016,0.001397,-0.010022,0.000424,-0.004207,0.000127,0.000175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-12,-0.006134,-0.001274,0.002648,0.006968,0.031002,0.004206,-0.002460,-0.008358,-0.005939,-0.005734,...,0.000397,0.002667,0.006299,-0.020539,0.000381,0.005092,0.000471,0.002482,0.000127,0.000202
2025-05-13,0.054436,0.033047,-0.006134,-0.001274,0.002648,0.006968,0.031002,0.004206,-0.002460,-0.008358,...,0.007087,-0.000917,0.000397,0.002667,0.006299,0.012589,0.000545,0.005748,0.000124,0.000190
2025-05-14,0.056341,0.006604,0.054436,0.033047,-0.006134,-0.001274,0.002648,0.006968,0.031002,0.004206,...,0.014844,0.024697,0.007087,-0.000917,0.000397,0.017957,0.000575,0.005779,0.000126,0.000181
2025-05-15,0.041638,0.001278,0.056341,0.006604,0.054436,0.033047,-0.006134,-0.001274,0.002648,0.006968,...,-0.005734,0.025894,0.014844,0.024697,0.007087,0.022212,0.000588,0.005867,0.000124,0.000172


### we are trying to scale in order to have numerical stability, we are not sure it is correct since we need to have the non-scaled version of the variance at inference time

In [115]:
train_df = df.loc["2010-01-01":"2022-12-31"]
validation_df = df.loc["2023-01-01":"2023-12-31"]
test_df  = df.loc["2024-01-01":]
print((len(train_df) + len(validation_df) + len(test_df)) == len(df))


##CARE FOR SCALER
#scaler = StandardScaler()
#scaler.fit(train_df)
#train_df = pd.DataFrame(scaler.transform(train_df), columns=train_df.columns, index=train_df.index)
#test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns, index=test_df.index)

True


Flip the columns in order to have a temporal sequence from past to present

In [116]:
train_df = train_df[train_df.columns[::-1]]
validation_df = validation_df[validation_df.columns[::-1]]
test_df = test_df[test_df.columns[::-1]]

In [117]:
train_df

Unnamed: 0_level_0,cov,var_SPY,mean_SPY,var_NVDA,mean_NVDA,rSPY_t-10,rNVDA_t-10,rSPY_t-9,rNVDA_t-9,rSPY_t-8,...,rSPY_t-5,rNVDA_t-5,rSPY_t-4,rNVDA_t-4,rSPY_t-3,rNVDA_t-3,rSPY_t-2,rNVDA_t-2,rSPY_t-1,rNVDA_t-1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-20,0.000097,0.000059,0.001539,0.000331,-0.005720,0.002647,0.014603,0.000704,0.006397,0.004221,...,-0.009326,-0.033898,0.008446,0.013582,0.002705,-0.015634,-0.011225,-0.029495,0.012496,0.018703
2010-01-21,0.000111,0.000093,0.000258,0.000296,-0.007581,0.000704,0.006397,0.004221,-0.019598,0.003328,...,0.008446,0.013582,0.002705,-0.015634,-0.011225,-0.029495,0.012496,0.018703,-0.010168,-0.004016
2010-01-22,0.000153,0.000131,-0.001736,0.000327,-0.010007,0.004221,-0.019598,0.003328,0.002161,0.001397,...,0.002705,-0.015634,-0.011225,-0.029495,0.012496,0.018703,-0.010168,-0.004016,-0.019229,-0.017857
2010-01-25,0.000184,0.000132,-0.004387,0.000395,-0.011507,0.003328,0.002161,0.001397,-0.014016,-0.009326,...,-0.011225,-0.029495,0.012496,0.018703,-0.010168,-0.004016,-0.019229,-0.017857,-0.022292,-0.034604
2010-01-26,0.000175,0.000127,-0.004207,0.000424,-0.010022,0.001397,-0.014016,-0.009326,-0.033898,0.008446,...,0.012496,0.018703,-0.010168,-0.004016,-0.019229,-0.017857,-0.022292,-0.034604,0.005128,0.017011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-23,0.000335,0.000156,-0.003457,0.000964,-0.010718,-0.007470,-0.009785,0.014417,0.031410,0.007570,...,-0.011806,-0.022475,-0.008480,-0.019130,0.001368,-0.010397,0.014953,0.025862,-0.014266,-0.070420
2022-12-27,0.000345,0.000154,-0.002135,0.001299,-0.010607,0.014417,0.031410,0.007570,0.030625,-0.006394,...,-0.008480,-0.019130,0.001368,-0.010397,0.014953,0.025862,-0.014266,-0.070420,0.005752,-0.008671
2022-12-28,0.000246,0.000129,-0.003971,0.001070,-0.020883,0.007570,0.030625,-0.006394,-0.022023,-0.024462,...,0.001368,-0.010397,0.014953,0.025862,-0.014266,-0.070420,0.005752,-0.008671,-0.003944,-0.071353
2022-12-29,0.000320,0.000165,-0.005971,0.001177,-0.024548,-0.006394,-0.022023,-0.024462,-0.040851,-0.011806,...,0.014953,0.025862,-0.014266,-0.070420,0.005752,-0.008671,-0.003944,-0.071353,-0.012428,-0.006019


In [118]:
scaler_feature  = StandardScaler()
scaler_target = StandardScaler()

class LSTMdataset(Dataset):
    def __init__(self, dataframe, train):
        super().__init__()

        feature_dataframe = dataframe.drop(['cov', 'var_NVDA', 'var_SPY'], axis=1)
        target_dataframe = dataframe[['cov', 'var_NVDA', 'var_SPY']]

        if train:
          feature_dataframe = scaler_feature.fit_transform(feature_dataframe)
          target_dataframe = scaler_target.fit_transform(target_dataframe)
        else:
          feature_dataframe = scaler_feature.transform(feature_dataframe)
          target_dataframe = scaler_target.transform(target_dataframe)

        self.X = torch.tensor(feature_dataframe, dtype=torch.float32)
        self.y = torch.tensor(target_dataframe, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [119]:
# pre normalization
col_names = ['cov', 'var_NVDA', 'var_SPY']
for name in col_names:
    print(f"{name}: min={train_df[name].min():.6f}, max={train_df[name].max():.6f}")

cov: min=-0.000185, max=0.007358
var_NVDA: min=0.000019, max=0.011850
var_SPY: min=0.000002, max=0.004897


In [120]:
# post normalization
train_dataset = LSTMdataset(train_df, train=True)
train_dataset[3]


# Get all targets from the dataset as a tensor
all_targets = train_dataset[:][1]  # shape: (num_samples, 3)

# Convert to numpy array
all_targets_np = all_targets.numpy()

# Print min and max for each column (cov, var_NVDA, var_SPY)
col_names = ['cov', 'var_NVDA', 'var_SPY']
for i, name in enumerate(col_names):
    print(f"{name}: min={all_targets_np[:, i].min():.6f}, max={all_targets_np[:, i].max():.6f}")

cov: min=-0.866729, max=16.036287
var_NVDA: min=-0.747650, max=10.675799
var_SPY: min=-0.411307, max=15.820754


In [121]:
# batch first lets us have the tensor in the shape (batch=32, sequence=10, feature=2)
class LSTM(torch.nn.Module):
    def __init__(self, input_size=2, hidden_size=20, num_layers=2, output_size=3, dropout=0.4):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm  = torch.nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
          )
        # hidden size + 2 since the fc will take in input the mean of each asset
        self.fc_1 = torch.nn.Linear (hidden_size+2, (hidden_size+2)//2)
        self.dropout = torch.nn.Dropout(dropout)
        self.fc_2 = torch.nn.Linear ((hidden_size+2)//2, output_size)

    # x_lstm = (32, 10, 2) [N,L,H_in]
    # x_means = (32,2)
    def forward (self, x_lstm, x_means):

      output_lstm, _ = self.lstm(x_lstm)

      # now x_lstm = (32, 10, hidden_size)[N,L,Hout​]
      # here we take only the last hidden state
      last_hidden_state = output_lstm[:,-1,:]



      # now last_hidden_state should be (32,hidden_size)

      x_cat = torch.cat([last_hidden_state, x_means], dim=1)
      x_out = self.fc_1(x_cat)
      x_out = torch.relu(x_out)
      x_out = self.dropout(x_out)
      x_out = self.fc_2(x_out)
      return x_out

# We use train and validation to fix hyperparameters

In [122]:
# Hyperparameters
batch_size = 32
learning_rate = 1e-3
#learning_rate = 1e-4
weight_decay=1e-4
epochs = 50

# Prepare datasets and dataloaders
#print(train_df[-1:])
train_dataset = LSTMdataset(train_df, train=True)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

# Device setup: CUDA > MPS (Mac) > CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Model, loss, optimizer
model_val = LSTM(input_size=2, hidden_size=30, num_layers=3, output_size=3, dropout=0.5).to(device)
criterion = torch.nn.MSELoss()
#criterion = torch.nn.L1Loss()

# weight_decay implement L2 norm
optimizer = torch.optim.Adam(model_val.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Training loop
for epoch in range(epochs):
    model_val.train()
    running_loss = 0.0
    for X, y in train_loader:
        X = X.to(device)
        y = y.to(device)
        # X: (batch, features), need to reshape for LSTM: (batch, seq_len, input_size)
        # X shape: (batch, 24): first 2 are means, next 20 are lags
        x_lstm = X[:, 2:].reshape(-1, 10, 2)
        x_means = X[:, :2]

        optimizer.zero_grad()
        outputs = model_val(x_lstm, x_means)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * X.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.6f}")

Using device: cuda
Epoch 1/50, Loss: 1.018902
Epoch 2/50, Loss: 1.000630
Epoch 3/50, Loss: 0.984790
Epoch 4/50, Loss: 0.872859
Epoch 5/50, Loss: 0.719619
Epoch 6/50, Loss: 0.651192
Epoch 7/50, Loss: 0.591632
Epoch 8/50, Loss: 0.543140
Epoch 9/50, Loss: 0.548713
Epoch 10/50, Loss: 0.410292
Epoch 11/50, Loss: 0.467113
Epoch 12/50, Loss: 0.478142
Epoch 13/50, Loss: 0.442496
Epoch 14/50, Loss: 0.375759
Epoch 15/50, Loss: 0.484368
Epoch 16/50, Loss: 0.454023
Epoch 17/50, Loss: 0.355901
Epoch 18/50, Loss: 0.428440
Epoch 19/50, Loss: 0.352901
Epoch 20/50, Loss: 0.364831
Epoch 21/50, Loss: 0.311400
Epoch 22/50, Loss: 0.341499
Epoch 23/50, Loss: 0.315421
Epoch 24/50, Loss: 0.313185
Epoch 25/50, Loss: 0.378530
Epoch 26/50, Loss: 0.334285
Epoch 27/50, Loss: 0.308726
Epoch 28/50, Loss: 0.265739
Epoch 29/50, Loss: 0.295227
Epoch 30/50, Loss: 0.259567
Epoch 31/50, Loss: 0.315480
Epoch 32/50, Loss: 0.379041
Epoch 33/50, Loss: 0.353726
Epoch 34/50, Loss: 0.334645
Epoch 35/50, Loss: 0.298037
Epoch 36/5

## Validation

In [123]:
mse = torch.nn.MSELoss()
mae = torch.nn.L1Loss(reduction='sum')

validation_dataset = LSTMdataset(validation_df, train=False)
val_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

model_val.eval()
val_mse = 0.0
val_mae = 0.0

with torch.no_grad():
    for X, y in val_loader:
        X = X.to(device)
        y = y.to(device)
        x_lstm = X[:, 2:].reshape(-1, 10, 2)
        x_means = X[:, :2]
        outputs = model_val(x_lstm, x_means)
        loss_mse = mse(outputs, y)
        loss_mae = mae(outputs, y)
        val_mse += loss_mse.item() * X.size(0)
        val_mae += loss_mae.item()

avg_val_mse = val_mse / len(val_loader.dataset)
avg_val_mae = val_mae / len(val_loader.dataset)
print(f"Validation MSE: {avg_val_mse:.6f}")
print(f"Validation MAE: {avg_val_mae:.6f}")

Validation MSE: 0.185582
Validation MAE: 0.583566


#now train on whole dataset

In [124]:
train_df = pd.concat([train_df, validation_df])

train_dataset = LSTMdataset(train_df, train=True)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

# Model, loss, optimizer
model = LSTM(input_size=2, hidden_size=30, num_layers=3, output_size=3, dropout=0.5).to(device)
criterion = torch.nn.MSELoss()
#criterion = torch.nn.L1Loss()

# weight_decay implement L2 norm
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Training loop
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for X, y in train_loader:
        X = X.to(device)
        y = y.to(device)
        # X: (batch, features), need to reshape for LSTM: (batch, seq_len, input_size)
        # X shape: (batch, 24): first 2 are means, next 20 are lags
        x_lstm = X[:, 2:].reshape(-1, 10, 2)
        x_means = X[:, :2]

        optimizer.zero_grad()
        outputs = model(x_lstm, x_means)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * X.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.6f}")

Epoch 1/50, Loss: 1.026322
Epoch 2/50, Loss: 0.955220
Epoch 3/50, Loss: 0.793394
Epoch 4/50, Loss: 0.675159
Epoch 5/50, Loss: 0.627585
Epoch 6/50, Loss: 0.565183
Epoch 7/50, Loss: 0.510656
Epoch 8/50, Loss: 0.459053
Epoch 9/50, Loss: 0.536078
Epoch 10/50, Loss: 0.400887
Epoch 11/50, Loss: 0.336881
Epoch 12/50, Loss: 0.412855
Epoch 13/50, Loss: 0.363743
Epoch 14/50, Loss: 0.388303
Epoch 15/50, Loss: 0.355825
Epoch 16/50, Loss: 0.286664
Epoch 17/50, Loss: 0.316428
Epoch 18/50, Loss: 0.303511
Epoch 19/50, Loss: 0.348436
Epoch 20/50, Loss: 0.338778
Epoch 21/50, Loss: 0.291976
Epoch 22/50, Loss: 0.406617
Epoch 23/50, Loss: 0.424492
Epoch 24/50, Loss: 0.256935
Epoch 25/50, Loss: 0.333565
Epoch 26/50, Loss: 0.279153
Epoch 27/50, Loss: 0.323450
Epoch 28/50, Loss: 0.267667
Epoch 29/50, Loss: 0.288271
Epoch 30/50, Loss: 0.275265
Epoch 31/50, Loss: 0.258157
Epoch 32/50, Loss: 0.301534
Epoch 33/50, Loss: 0.294533
Epoch 34/50, Loss: 0.305420
Epoch 35/50, Loss: 0.259774
Epoch 36/50, Loss: 0.281353
E

In [125]:
mse = torch.nn.MSELoss()
mae = torch.nn.L1Loss(reduction='sum')
#drop the last year, from
#test_df = test_df.loc[:"2024-12-31"]
#print last element
print(test_df[-1:])
test_dataset = LSTMdataset(test_df, train=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model.eval()
test_mse = 0.0
test_mae = 0.0

with torch.no_grad():
    for X, y in test_loader:
        X = X.to(device)
        y = y.to(device)
        x_lstm = X[:, 2:].reshape(-1, 10, 2)
        x_means = X[:, :2]
        outputs = model(x_lstm, x_means)
        loss_mse = mse(outputs, y)
        loss_mae = mae(outputs, y)
        test_mse += loss_mse.item() * X.size(0)
        test_mae += loss_mae.item()

avg_test_mse = test_mse / len(test_loader.dataset)
avg_test_mae = test_mae / len(test_loader.dataset)
print(f"Test MSE: {avg_test_mse:.6f}")
print(f"Test MAE: {avg_test_mae:.6f}")

                cov   var_SPY  mean_SPY  var_NVDA  mean_NVDA  rSPY_t-10  \
Date                                                                      
2025-05-16  0.00017  0.000123  0.005646  0.000606   0.019366   0.014844   

            rNVDA_t-10  rSPY_t-9  rNVDA_t-9  rSPY_t-8  ...  rSPY_t-5  \
Date                                                   ...             
2025-05-16    0.025894 -0.005734  -0.005939 -0.008358  ... -0.001274   

            rNVDA_t-5  rSPY_t-4  rNVDA_t-4  rSPY_t-3  rNVDA_t-3  rSPY_t-2  \
Date                                                                        
2025-05-16  -0.006134  0.033047   0.054436  0.006604   0.056341  0.001278   

            rNVDA_t-2  rSPY_t-1  rNVDA_t-1  
Date                                        
2025-05-16   0.041638  0.004884  -0.003768  

[1 rows x 25 columns]
Test MSE: 0.433551
Test MAE: 0.936814


In [126]:

# Suppose scaler_target is already fitted on your target columns (['cov', 'var_NVDA', 'var_SPY'])
# Get one sample from the test set
for i in range (3):
  X_sample, y_true = test_dataset[i]  # X_sample: features, y_true: normalized target

  # Prepare input for model (add batch dimension)
  X_sample = X_sample.unsqueeze(0).to(device)
  x_lstm = X_sample[:, 2:].reshape(-1, 10, 2)
  x_means = X_sample[:, :2]

  # Get model prediction (normalized)
  model.eval()
  with torch.no_grad():
      y_pred = model(x_lstm, x_means).cpu().numpy()  # shape: (1, 3)

  # Inverse transform to get un-normalized prediction
  y_pred_unscaled = scaler_target.inverse_transform(y_pred)[0]  # shape: (3,)

  # Inverse transform the true value for comparison
  y_true_unscaled = scaler_target.inverse_transform(y_true.cpu().numpy().reshape(1, -1))[0]

  print(f"Sample num. {i}. ['cov', 'var_NVDA', 'var_SPY']")
  print("Predicted (un-normalized):", [f"{v:.10f}" for v in y_pred_unscaled])
  print("True (un-normalized):     ", [f"{v:.10f}" for v in y_true_unscaled])
  #print("True (before normalization happens):")
  #print(test_df[col_names].iloc[i].apply(lambda x: f"{x:.10f}"))
  print("-----------------------------------------------------")

Sample num. 0. ['cov', 'var_NVDA', 'var_SPY']
Predicted (un-normalized): ['0.0000734396', '0.0004419770', '0.0000470199']
True (un-normalized):      ['0.0000843383', '0.0002897779', '0.0000415377']
-----------------------------------------------------
Sample num. 1. ['cov', 'var_NVDA', 'var_SPY']
Predicted (un-normalized): ['0.0000829931', '0.0004548163', '0.0000512241']
True (un-normalized):      ['0.0000960009', '0.0002867111', '0.0000482147']
-----------------------------------------------------
Sample num. 2. ['cov', 'var_NVDA', 'var_SPY']
Predicted (un-normalized): ['0.0000947603', '0.0004852623', '0.0000561091']
True (un-normalized):      ['0.0000760009', '0.0002265895', '0.0000452304']
-----------------------------------------------------


In [127]:
# Collect all predictions and true values
all_preds = []
all_preds_unscaled = []

model.eval()
with torch.no_grad():
    for X, y in test_loader:
        X = X.to(device)
        x_lstm = X[:, 2:].reshape(-1, 10, 2)
        x_means = X[:, :2]
        outputs = model(x_lstm, x_means).cpu().numpy()  # normalized predictions
        all_preds.append(outputs)
        # Un-normalize predictions
        outputs_unscaled = scaler_target.inverse_transform(outputs)
        all_preds_unscaled.append(outputs_unscaled)

# Stack all batches
all_preds = np.vstack(all_preds)  # shape: (num_samples, 3)
all_preds_unscaled = np.vstack(all_preds_unscaled)  # shape: (num_samples, 3)

col_names = ['cov', 'var_NVDA', 'var_SPY']

print("=== Normalized predictions ===")
for i, name in enumerate(col_names):
    print(f"{name}: min={all_preds[:, i].min():.6f}, max={all_preds[:, i].max():.6f}")

print("\n=== Un-normalized predictions ===")
for i, name in enumerate(col_names):
    print(f"{name}: min={all_preds_unscaled[:, i].min():.6f}, max={all_preds_unscaled[:, i].max():.6f}")

=== Normalized predictions ===
cov: min=-0.333730, max=4.486005
var_NVDA: min=-0.418008, max=3.620297
var_SPY: min=-0.300878, max=4.470698

=== Un-normalized predictions ===
cov: min=0.000054, max=0.002131
var_NVDA: min=0.000367, max=0.004607
var_SPY: min=0.000034, max=0.001424
