In [38]:
# import
import pandas as pd
import yfinance as yf
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn 

In [2]:
data = yf.download(["SPY","NVDA"], start="2010-01-01", end="2025-01-01")
spy = data.xs('SPY', axis=1, level=1)
nvda = data.xs('NVDA', axis=1, level=1)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  2 of 2 completed


**Simple returns for every day:**
$$
R_t = \frac{P_t - P_{t-1}}{P_{t-1}} = \frac{P_t}{P_{t-1}} - 1
$$

In [3]:
prices = data["Close"]
returns = (prices / prices.shift(1)) - 1
returns = returns.dropna()
spy_r = returns['SPY']
nvda_r = returns['NVDA']

In [4]:
mu1 = returns['NVDA'].mean()
mu2 = returns['SPY'].mean()

print(mu1, mu2)

0.0019328877523807743 0.0005665853817745405


## BUILD DATASET

In [5]:
returns.head(5)

Ticker,NVDA,SPY
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-05,0.014602,0.002647
2010-01-06,0.006396,0.000704
2010-01-07,-0.019597,0.004221
2010-01-08,0.002161,0.003327
2010-01-11,-0.014016,0.001396


In [6]:
r1_series = returns['NVDA']
r1_series.head(5)

Date
2010-01-05    0.014602
2010-01-06    0.006396
2010-01-07   -0.019597
2010-01-08    0.002161
2010-01-11   -0.014016
Name: NVDA, dtype: float64

In [7]:
r2_series = returns['SPY']
r2_series.head(5)

Date
2010-01-05    0.002647
2010-01-06    0.000704
2010-01-07    0.004221
2010-01-08    0.003327
2010-01-11    0.001396
Name: SPY, dtype: float64

In [8]:
df = pd.DataFrame()
df['rNVDA_today'] = r1_series
df['rSPY_today'] = r2_series

df.head(5)

Unnamed: 0_level_0,rNVDA_today,rSPY_today
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-05,0.014602,0.002647
2010-01-06,0.006396,0.000704
2010-01-07,-0.019597,0.004221
2010-01-08,0.002161,0.003327
2010-01-11,-0.014016,0.001396


In [9]:
for i in range(1, 11):
    df[f'rNVDA_t-{i}'] = r1_series.shift(i)
    df[f'rSPY_t-{i}'] = r2_series.shift(i)

df.head(11)

Unnamed: 0_level_0,rNVDA_today,rSPY_today,rNVDA_t-1,rSPY_t-1,rNVDA_t-2,rSPY_t-2,rNVDA_t-3,rSPY_t-3,rNVDA_t-4,rSPY_t-4,...,rNVDA_t-6,rSPY_t-6,rNVDA_t-7,rSPY_t-7,rNVDA_t-8,rSPY_t-8,rNVDA_t-9,rSPY_t-9,rNVDA_t-10,rSPY_t-10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-05,0.014602,0.002647,,,,,,,,,...,,,,,,,,,,
2010-01-06,0.006396,0.000704,0.014602,0.002647,,,,,,,...,,,,,,,,,,
2010-01-07,-0.019597,0.004221,0.006396,0.000704,0.014602,0.002647,,,,,...,,,,,,,,,,
2010-01-08,0.002161,0.003327,-0.019597,0.004221,0.006396,0.000704,0.014602,0.002647,,,...,,,,,,,,,,
2010-01-11,-0.014016,0.001396,0.002161,0.003327,-0.019597,0.004221,0.006396,0.000704,0.014602,0.002647,...,,,,,,,,,,
2010-01-12,-0.033898,-0.009326,-0.014016,0.001396,0.002161,0.003327,-0.019597,0.004221,0.006396,0.000704,...,,,,,,,,,,
2010-01-13,0.013582,0.008446,-0.033898,-0.009326,-0.014016,0.001396,0.002161,0.003327,-0.019597,0.004221,...,0.014602,0.002647,,,,,,,,
2010-01-14,-0.015634,0.002704,0.013582,0.008446,-0.033898,-0.009326,-0.014016,0.001396,0.002161,0.003327,...,0.006396,0.000704,0.014602,0.002647,,,,,,
2010-01-15,-0.029495,-0.011224,-0.015634,0.002704,0.013582,0.008446,-0.033898,-0.009326,-0.014016,0.001396,...,-0.019597,0.004221,0.006396,0.000704,0.014602,0.002647,,,,
2010-01-19,0.018702,0.012496,-0.029495,-0.011224,-0.015634,0.002704,0.013582,0.008446,-0.033898,-0.009326,...,0.002161,0.003327,-0.019597,0.004221,0.006396,0.000704,0.014602,0.002647,,


In [10]:
cols = [f'rNVDA_t-{i}' for i in range(1, 11)]
df['mean_NVDA'] = df[cols].mean(axis=1)
cols.append('rNVDA_today')
df['var_NVDA'] = df[cols].var(axis=1)

cols = [f'rSPY_t-{i}' for i in range(1, 11)]
df['mean_SPY'] = df[cols].mean(axis=1)
cols.append('rSPY_today')
df['var_SPY'] = df[cols].var(axis=1)

In [11]:
# remove nan => remove first ten days 
df = df.dropna()

In [12]:
df

Unnamed: 0_level_0,rNVDA_today,rSPY_today,rNVDA_t-1,rSPY_t-1,rNVDA_t-2,rSPY_t-2,rNVDA_t-3,rSPY_t-3,rNVDA_t-4,rSPY_t-4,...,rNVDA_t-8,rSPY_t-8,rNVDA_t-9,rSPY_t-9,rNVDA_t-10,rSPY_t-10,mean_NVDA,var_NVDA,mean_SPY,var_SPY
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-20,-0.004016,-0.010169,0.018702,0.012496,-0.029495,-0.011224,-0.015634,0.002704,0.013582,0.008446,...,-0.019597,0.004221,0.006396,0.000704,0.014602,0.002647,-0.005720,0.000331,0.001539,0.000059
2010-01-21,-0.017857,-0.019229,-0.004016,-0.010169,0.018702,0.012496,-0.029495,-0.011224,-0.015634,0.002704,...,0.002161,0.003327,-0.019597,0.004221,0.006396,0.000704,-0.007581,0.000296,0.000258,0.000093
2010-01-22,-0.034604,-0.022292,-0.017857,-0.019229,-0.004016,-0.010169,0.018702,0.012496,-0.029495,-0.011224,...,-0.014016,0.001396,0.002161,0.003327,-0.019597,0.004221,-0.010007,0.000327,-0.001736,0.000131
2010-01-25,0.017011,0.005127,-0.034604,-0.022292,-0.017857,-0.019229,-0.004016,-0.010169,0.018702,0.012496,...,-0.033898,-0.009326,-0.014016,0.001396,0.002161,0.003327,-0.011507,0.000395,-0.004387,0.000132
2010-01-26,-0.031661,-0.004190,0.017011,0.005127,-0.034604,-0.022292,-0.017857,-0.019229,-0.004016,-0.010169,...,0.013582,0.008446,-0.033898,-0.009326,-0.014016,0.001396,-0.010022,0.000424,-0.004207,0.000127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-24,0.003938,0.011115,0.036897,0.005988,0.030762,0.012011,0.013730,-0.000307,-0.011351,-0.029803,...,-0.014141,-0.005153,0.031391,0.007731,-0.026943,-0.003109,0.000889,0.000547,-0.001269,0.000132
2024-12-26,-0.002068,0.000067,0.003938,0.011115,0.036897,0.005988,0.030762,0.012011,0.013730,-0.000307,...,-0.022499,-0.000199,-0.014141,-0.005153,0.031391,0.007731,0.003977,0.000464,0.000153,0.000131
2024-12-27,-0.020868,-0.010527,-0.002068,0.000067,0.003938,0.011115,0.036897,0.005988,0.030762,0.012011,...,-0.016760,0.004270,-0.022499,-0.000199,-0.014141,-0.005153,0.000631,0.000420,-0.000613,0.000134
2024-12-30,0.003503,-0.011412,-0.020868,-0.010527,-0.002068,0.000067,0.003938,0.011115,0.036897,0.005988,...,-0.012197,-0.004120,-0.016760,0.004270,-0.022499,-0.000199,-0.000041,0.000403,-0.001150,0.000142


## SPLIT DATASET

In [13]:
train_df = df.loc["2010-01-01":"2023-12-31"]
test_df  = df.loc["2024-01-01":]

In [14]:
# check consistency of the split
print((len(train_df) + len(test_df)) == len(df))

True


In [15]:
def compute_covariance_row(row):
    r1 = [row[f'rNVDA_t-{i}'] for i in range(1, 11)]
    r1.append(row['rNVDA_today'])
    r2 = [row[f'rSPY_t-{i}'] for i in range(1, 11)]
    r2.append(row['rSPY_today'])
    return np.cov(r1, r2)[0, 1]

train_df['cov'] = train_df.apply(compute_covariance_row, axis=1)
train_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['cov'] = train_df.apply(compute_covariance_row, axis=1)


Unnamed: 0_level_0,rNVDA_today,rSPY_today,rNVDA_t-1,rSPY_t-1,rNVDA_t-2,rSPY_t-2,rNVDA_t-3,rSPY_t-3,rNVDA_t-4,rSPY_t-4,...,rSPY_t-8,rNVDA_t-9,rSPY_t-9,rNVDA_t-10,rSPY_t-10,mean_NVDA,var_NVDA,mean_SPY,var_SPY,cov
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-20,-0.004016,-0.010169,0.018702,0.012496,-0.029495,-0.011224,-0.015634,0.002704,0.013582,0.008446,...,0.004221,0.006396,0.000704,0.014602,0.002647,-0.005720,0.000331,0.001539,0.000059,0.000097
2010-01-21,-0.017857,-0.019229,-0.004016,-0.010169,0.018702,0.012496,-0.029495,-0.011224,-0.015634,0.002704,...,0.003327,-0.019597,0.004221,0.006396,0.000704,-0.007581,0.000296,0.000258,0.000093,0.000111
2010-01-22,-0.034604,-0.022292,-0.017857,-0.019229,-0.004016,-0.010169,0.018702,0.012496,-0.029495,-0.011224,...,0.001396,0.002161,0.003327,-0.019597,0.004221,-0.010007,0.000327,-0.001736,0.000131,0.000153
2010-01-25,0.017011,0.005127,-0.034604,-0.022292,-0.017857,-0.019229,-0.004016,-0.010169,0.018702,0.012496,...,-0.009326,-0.014016,0.001396,0.002161,0.003327,-0.011507,0.000395,-0.004387,0.000132,0.000184
2010-01-26,-0.031661,-0.004190,0.017011,0.005127,-0.034604,-0.022292,-0.017857,-0.019229,-0.004016,-0.010169,...,0.008446,-0.033898,-0.009326,-0.014016,0.001396,-0.010022,0.000424,-0.004207,0.000127,0.000175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,-0.003266,0.002010,0.018270,0.009482,-0.030098,-0.013857,-0.009445,0.006081,0.024279,0.005625,...,0.004567,-0.018503,0.003890,0.019530,0.004299,0.005178,0.000316,0.003544,0.000048,0.000073
2023-12-26,0.009195,0.004223,-0.003266,0.002010,0.018270,0.009482,-0.030098,-0.013857,-0.009445,0.006081,...,0.013790,0.022090,0.004567,-0.018503,0.003890,0.002899,0.000295,0.003315,0.000048,0.000072
2023-12-27,0.002800,0.001808,0.009195,0.004223,-0.003266,0.002010,0.018270,0.009482,-0.030098,-0.013857,...,0.003209,0.009044,0.013790,0.022090,0.004567,0.005669,0.000242,0.003348,0.000049,0.000074
2023-12-28,0.002125,0.000378,0.002800,0.001808,0.009195,0.004223,-0.003266,0.002010,0.018270,0.009482,...,-0.001647,0.005448,0.003209,0.009044,0.013790,0.003740,0.000212,0.003072,0.000049,0.000072


In [16]:
train_df.drop(['rNVDA_today', 'rSPY_today'], axis=1, inplace=True)
test_df.drop(['rNVDA_today', 'rSPY_today', 'var_NVDA', 'var_SPY'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop(['rNVDA_today', 'rSPY_today'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.drop(['rNVDA_today', 'rSPY_today', 'var_NVDA', 'var_SPY'], axis=1, inplace=True)


In [17]:
train_df = train_df[train_df.columns[::-1]]
test_df = test_df[test_df.columns[::-1]]

train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)

In [30]:
train_df

Unnamed: 0_level_0,cov,var_SPY,mean_SPY,var_NVDA,mean_NVDA,rSPY_t-10,rNVDA_t-10,rSPY_t-9,rNVDA_t-9,rSPY_t-8,...,rSPY_t-5,rNVDA_t-5,rSPY_t-4,rNVDA_t-4,rSPY_t-3,rNVDA_t-3,rSPY_t-2,rNVDA_t-2,rSPY_t-1,rNVDA_t-1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-20,0.000097,0.000059,0.001539,0.000331,-0.005720,0.002647,0.014602,0.000704,0.006396,0.004221,...,-0.009326,-0.033898,0.008446,0.013582,0.002704,-0.015634,-0.011224,-0.029495,0.012496,0.018702
2010-01-21,0.000111,0.000093,0.000258,0.000296,-0.007581,0.000704,0.006396,0.004221,-0.019597,0.003327,...,0.008446,0.013582,0.002704,-0.015634,-0.011224,-0.029495,0.012496,0.018702,-0.010169,-0.004016
2010-01-22,0.000153,0.000131,-0.001736,0.000327,-0.010007,0.004221,-0.019597,0.003327,0.002161,0.001396,...,0.002704,-0.015634,-0.011224,-0.029495,0.012496,0.018702,-0.010169,-0.004016,-0.019229,-0.017857
2010-01-25,0.000184,0.000132,-0.004387,0.000395,-0.011507,0.003327,0.002161,0.001396,-0.014016,-0.009326,...,-0.011224,-0.029495,0.012496,0.018702,-0.010169,-0.004016,-0.019229,-0.017857,-0.022292,-0.034604
2010-01-26,0.000175,0.000127,-0.004207,0.000424,-0.010022,0.001396,-0.014016,-0.009326,-0.033898,0.008446,...,0.012496,0.018702,-0.010169,-0.004016,-0.019229,-0.017857,-0.022292,-0.034604,0.005127,0.017011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,0.000073,0.000048,0.003544,0.000316,0.005178,0.004299,0.019530,0.003890,-0.018503,0.004567,...,-0.001647,0.011169,0.005625,0.024279,0.006081,-0.009445,-0.013857,-0.030098,0.009482,0.018270
2023-12-26,0.000072,0.000048,0.003315,0.000295,0.002899,0.003890,-0.018503,0.004567,0.022090,0.013790,...,0.005625,0.024279,0.006081,-0.009445,-0.013857,-0.030098,0.009482,0.018270,0.002010,-0.003266
2023-12-27,0.000074,0.000049,0.003348,0.000242,0.005669,0.004567,0.022090,0.013790,0.009044,0.003209,...,0.006081,-0.009445,-0.013857,-0.030098,0.009482,0.018270,0.002010,-0.003266,0.004223,0.009195
2023-12-28,0.000072,0.000049,0.003072,0.000212,0.003740,0.013790,0.009044,0.003209,0.005448,-0.001647,...,-0.013857,-0.030098,0.009482,0.018270,0.002010,-0.003266,0.004223,0.009195,0.001808,0.002800


In [18]:
class LSTMdataset(Dataset):
    def __init__(self, dataframe):
        super().__init__()
        self.X = torch.tensor(dataframe.drop(['cov', 'var_NVDA', 'var_SPY'], axis=1).values, dtype=torch.float32) 
        self.y = torch.tensor(dataframe[['cov', 'var_NVDA', 'var_SPY']].values, dtype=torch.float32) 

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [31]:
train_dataset = LSTMdataset(train_df)

In [32]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [None]:
# check shape in input
x, y = next(iter(train_loader))
print(x.shape, y.shape)
# need resize during training for lstm

torch.Size([32, 22]) torch.Size([32, 3])


In [None]:
class NeuralNet(nn.Module):
    def __init__(self, input_size=2, hidden_size=64, num_layers=1, output_size=3):
        super().__init__()

        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True)
        
        self.fc = nn.Linear(hidden_size+2, output_size)


    def forward(self,x):

        means = torch.cat([x[:, 0, :], x[:, 1, :] ], dim=1)
        x_lstm = x[:, 2:, :]  
        lstm_out, _ = self.lstm(x_lstm)
        last_output = lstm_out[:, -1, :]

        new_input = torch.cat([last_output, means], dim=1)
        out = self.fc(new_input)

        return out