In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h_0 = torch.zeros(num_layers, x.size(0), hidden_size).to(x.device)
        c_0 = torch.zeros(num_layers, x.size(0), hidden_size).to(x.device)
        out, _ = self.lstm(x, (h_0, c_0))
        out = self.fc(out[:, -1, :])  # Take the output from the last time step
        return out

# Custom dataset class
class TimeSeriesDataset(Dataset):
    def __init__(self, df, seq_length, target_length, add_cols, target_col):
        self.seq_length = seq_length
        self.target_length = target_length
        self.add_cols = add_cols
        self.target_col = target_col
        self.scaler = StandardScaler()

        self.data = []
        self.targets = []

        for fund in df['PRODUCTREFERENCE'].unique():
            fund_data = df[df['PRODUCTREFERENCE'] == fund].sort_values('date')
            if len(fund_data) < seq_length + target_length:
                continue
            # Normalize the data
            scaled_data = self.scaler.fit_transform(fund_data[add_cols])

            for i in range(len(fund_data) - seq_length - target_length + 1):
                seq_x = scaled_data[i:i+seq_length]
                seq_y = fund_data[target_col].iloc[i+seq_length:i+seq_length+target_length].values
                self.data.append(seq_x)
                self.targets.append(seq_y)
        
        self.data = np.array(self.data)
        self.targets = np.array(self.targets)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.targets[idx], dtype=torch.float32)




In [2]:
add_cols = [
    'aum', 
    'SENT_', 
    'SMB', 'HML', 'RF', 'mom', 
    'confeature', 'tfpfeature', 'ipgfeature', 
    'termfeature', 'deffeature', 'deifeature', 
    'mktfeature', 'labfeature', 
    'exret'  # Assuming this is the predictor and 'excret' is the target
]

target_col = 'exret'

In [48]:
df = pd.read_csv('dataset/hf/hf.csv')

In [102]:
data_df = df.copy()
test_df = df.loc[df.date >= '2023-10-15', ['date', 'PRODUCTREFERENCE', 'exret']]
data_df = data_df.dropna()
train_df = data_df[data_df.date <= '2023-09-15']
test_df

Unnamed: 0,date,PRODUCTREFERENCE,exret
504,2023-10-15,21,0.003200
505,2023-11-15,21,0.039600
506,2023-12-15,21,0.015800
939,2023-10-15,29,-0.059800
940,2023-11-15,29,0.054700
...,...,...,...
433590,2023-11-15,106517,0.023429
433591,2023-12-15,106517,-0.029457
433883,2023-10-15,107151,0.126500
433884,2023-11-15,107151,-0.114500


In [76]:
# Hyperparameters
input_size = len(add_cols)
hidden_size = 128
num_layers = 2
output_size = 3  
seq_length = 36  # Length of input sequences
target_length = 3
batch_size = 32
num_epochs = 20
learning_rate = 0.001

# Assuming df is your DataFrame
dataset = TimeSeriesDataset(train_df, seq_length, target_length, add_cols, target_col)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Model, loss, optimizer
model = LSTMModel(input_size, hidden_size, num_layers, output_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [11]:

# Training loop
model.train()
for epoch in range(num_epochs):
    for batch_x, batch_y in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# After training, you can use model.eval() to switch to evaluation mode and make predictions

Epoch [1/20], Loss: 0.0012
Epoch [2/20], Loss: 0.0007
Epoch [3/20], Loss: 0.0011
Epoch [4/20], Loss: 0.0007
Epoch [5/20], Loss: 0.0003
Epoch [6/20], Loss: 0.0011
Epoch [7/20], Loss: 0.0006
Epoch [8/20], Loss: 0.0006
Epoch [9/20], Loss: 0.0004
Epoch [10/20], Loss: 0.0004
Epoch [11/20], Loss: 0.0008
Epoch [12/20], Loss: 0.0005
Epoch [13/20], Loss: 0.0008
Epoch [14/20], Loss: 0.0004
Epoch [15/20], Loss: 0.0005
Epoch [16/20], Loss: 0.0004
Epoch [17/20], Loss: 0.0004
Epoch [18/20], Loss: 0.0005
Epoch [19/20], Loss: 0.0004
Epoch [20/20], Loss: 0.0004


In [28]:
torch.save(model.state_dict(), 'hidden_128_seqlen_36_20epoch_new.pth')

In [85]:
# test_df = df[df.date >= '2023-10-15']

# predict_df = data_df[(data_df.date >= '2020-10-15') & ( data_df.date <= '2023-09-15')]
predict_df = data_df[data_df.date <= '2023-09-15']
count_df = predict_df.groupby(['PRODUCTREFERENCE']).agg({'date': 'count'}).reset_index()
predict_df.merge(count_df.loc[count_df.date >= seq_length], on='PRODUCTREFERENCE') 
funds_to_eval = list(predict_df.merge(test_df, on='PRODUCTREFERENCE', how='inner')['PRODUCTREFERENCE'].unique())
predict_df = predict_df[predict_df.PRODUCTREFERENCE.isin(funds_to_eval)]
predict_df['series_id'] = predict_df['PRODUCTREFERENCE']

# if only including the products with enough history
predict_fund_history_count = predict_df.groupby(['PRODUCTREFERENCE']).agg({'date':'count'}).reset_index()
predict_funds = list(predict_fund_history_count.loc[predict_fund_history_count.date >= seq_length, 'PRODUCTREFERENCE'])
predict_df = predict_df[predict_df.PRODUCTREFERENCE.isin(predict_funds)]

In [94]:
# test on training data
test_df = data_df[(data_df.date < '2023-10-15') & (data_df.date >= '2023-07-15')]

predict_df = data_df[(data_df.date >= '2017-10-15') & ( data_df.date <= '2023-06-15')]
count_df = predict_df.groupby(['PRODUCTREFERENCE']).agg({'date': 'count'}).reset_index()
predict_df.merge(count_df.loc[count_df.date >= seq_length], on='PRODUCTREFERENCE') 
funds_to_eval = list(predict_df.merge(test_df, on='PRODUCTREFERENCE', how='inner')['PRODUCTREFERENCE'].unique())
predict_df = predict_df[predict_df.PRODUCTREFERENCE.isin(funds_to_eval)]
predict_df['series_id'] = predict_df['PRODUCTREFERENCE']

# if only including the products with enough history
predict_fund_history_count = predict_df.groupby(['PRODUCTREFERENCE']).agg({'date':'count'}).reset_index()
predict_funds = list(predict_fund_history_count.loc[predict_fund_history_count.date >= seq_length, 'PRODUCTREFERENCE'])
predict_df = predict_df[predict_df.PRODUCTREFERENCE.isin(predict_funds)]

In [103]:
predict_df

Unnamed: 0,PRODUCTREFERENCE,PRIMARYCATEGORY,date,exret,aum,aum24m,aumrec,PTFSBD,PTFSFX,PTFSCOM,...,labbeta,confeature,tfpfeature,ipgfeature,termfeature,deffeature,deifeature,mktfeature,labfeature,series_id
867,29,Long/Short Equity Hedge,2017-10-15,-0.008000,14962687.0,1.486074e+07,1.769472e+07,-0.1554,-0.0239,-0.0592,...,1.566517,-0.007233,-0.000705,0.000230,8.322672e-18,-0.000151,0.002828,0.018559,0.005830,29
868,29,Long/Short Equity Hedge,2017-11-15,0.019000,15220348.0,1.484563e+07,1.768204e+07,-0.0806,-0.2678,-0.1222,...,2.731222,-0.006302,-0.003527,0.002271,-6.658714e-03,0.003209,-0.000032,0.023714,0.009531,29
869,29,Long/Short Equity Hedge,2017-12-15,0.011500,15402024.0,1.486273e+07,1.767040e+07,-0.1131,-0.1190,0.0084,...,2.792083,-0.001598,-0.003258,-0.001630,-3.375621e-03,-0.001530,0.000903,0.008126,0.009785,29
870,29,Long/Short Equity Hedge,2018-01-15,0.019200,15759775.0,1.492182e+07,1.766070e+07,0.2099,0.5127,0.0045,...,2.262481,0.002334,0.001401,0.003760,1.352823e-04,0.000000,-0.008841,0.039748,0.012361,29
871,29,Long/Short Equity Hedge,2018-02-15,-0.011400,13859601.0,1.492035e+07,1.764151e+07,-0.0921,-0.0844,0.0392,...,1.468317,0.004777,0.002624,0.001939,1.460951e-03,0.001742,-0.001518,-0.023943,0.003629,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433580,106517,Multi-Strategy,2023-01-15,0.007359,19200000.0,1.899633e+07,3.666113e+07,-0.1830,-0.1239,-0.1639,...,0.038081,0.004457,0.008856,0.011854,6.185814e-03,-0.006282,0.000627,0.010210,0.000272,106517
433581,106517,Multi-Strategy,2023-02-15,-0.022241,19100000.0,1.897007e+07,3.640662e+07,0.0588,-0.1890,-0.0560,...,1.046249,0.004883,-0.003087,0.000270,2.874895e-03,0.009152,0.004531,-0.003530,0.007482,106517
433583,106517,Multi-Strategy,2023-04-15,0.027229,17400000.0,1.877012e+07,3.613510e+07,-0.1711,-0.0147,-0.0459,...,0.713583,0.004134,0.000197,0.002232,1.730460e-04,0.006472,0.005818,0.000497,0.003052,106517
433584,106517,Multi-Strategy,2023-05-15,-0.056724,16800000.0,1.867162e+07,3.586277e+07,-0.1270,-0.0919,0.0491,...,0.675415,0.000000,0.001681,-0.001138,3.759342e-04,-0.004719,-0.002436,0.000357,0.002889,106517


In [66]:
model = LSTMModel(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, output_size=output_size)

# Load the saved model state dict
model.load_state_dict(torch.load('hidden_128_seqlen_36_50epoch_best.pth'))

<All keys matched successfully>

In [95]:
# Initialize a list to store results
results = []

# Iterate over each unique PRODUCTREFERENCE
for product_reference in predict_df['PRODUCTREFERENCE'].unique():
    
    # Extract the latest sequence for this PRODUCTREFERENCE
    fund_data = predict_df[predict_df['PRODUCTREFERENCE'] == product_reference].sort_values('date')

    # Skip this PRODUCTREFERENCE if there's not enough data
    if len(fund_data) < seq_length:
        continue
    
    scaler = StandardScaler()

    # option 1
    # scaled_data = scaler.fit_transform(fund_data[add_cols])
    # scaled_fund_data = scaled_data[-seq_length:]
    
    # option 2
    scaler.fit(fund_data[add_cols])
    scaled_fund_data = scaler.transform(fund_data[add_cols].tail(seq_length))

    # option 3


    # input data
    input_data = np.expand_dims(scaled_fund_data, axis=0)  # Add batch dimension
    # Convert to PyTorch tensor
    input_tensor = torch.tensor(input_data, dtype=torch.float32)


    # Switch model to evaluation mode
    model.eval()

    # Make prediction
    with torch.no_grad():
        prediction = model(input_tensor)

    # Convert prediction to numpy array
    predicted_values = prediction.numpy().flatten()
    
    print(predicted_values)

    # (Optional) Inverse scale the prediction if you scaled the data
    # predicted_values_original_scale = scaler.inverse_transform(predicted_values.reshape(-1, 1)).flatten()
    predicted_values_original_scale = predicted_values

    # Store the result
    results.append({
        'PRODUCTREFERENCE': product_reference,
        'Prediction_Period_1': predicted_values_original_scale[0],
        'Prediction_Period_2': predicted_values_original_scale[1],
        'Prediction_Period_3': predicted_values_original_scale[2]
    })

# Convert the results to a DataFrame for easier handling
predictions_df = pd.DataFrame(results)

# Display or save the predictions
print(predictions_df)

[0.01923302 0.06568377 0.04887139]
[0.0011267  0.07217118 0.04881562]
[0.02153692 0.06475216 0.04931376]
[0.00981168 0.07111658 0.04862896]
[0.02090641 0.06410188 0.05006267]
[0.00926523 0.07136723 0.04808789]
[0.00946084 0.07017653 0.04558769]
[0.00490828 0.07195754 0.04732652]
[0.00989357 0.06664696 0.04700595]
[0.01735542 0.06495791 0.04776597]
[0.00374131 0.06888176 0.04708175]
[0.00989173 0.06947912 0.04338474]
[-0.01707276  0.07447472  0.04697167]
[0.00934545 0.0652249  0.04677149]
[-0.00076534  0.06912328  0.0470513 ]
[-0.00161177  0.069077    0.04783442]
[-0.00158765  0.07276247  0.05108691]
[-0.00107143  0.06777108  0.0477992 ]
[0.00898386 0.06476824 0.04767713]
[0.01289042 0.06526019 0.04598769]
[0.01768419 0.06342503 0.04722811]
[0.0064601  0.06910073 0.04794075]
[0.00783306 0.07008462 0.04805417]
[0.0369009  0.06462419 0.05740889]
[0.00487005 0.07056535 0.04642818]
[0.0079183  0.06989478 0.04826371]
[0.00439057 0.07260107 0.05080303]
[0.01582255 0.06918787 0.04963469]
[0.01

In [96]:
predictions_df.dropna()

Unnamed: 0,PRODUCTREFERENCE,Prediction_Period_1,Prediction_Period_2,Prediction_Period_3
0,29,0.019233,0.065684,0.048871
1,35,0.001127,0.072171,0.048816
2,441,0.021537,0.064752,0.049314
3,727,0.009812,0.071117,0.048629
4,814,0.020906,0.064102,0.050063
...,...,...,...,...
290,106446,-0.007023,0.079680,0.044720
291,106455,0.010205,0.065948,0.048407
292,106484,0.000270,0.066112,0.046670
293,106485,-0.000731,0.065273,0.046860


In [97]:
# pred_1 = test_df.loc[test_df.date == '2023-10-15', ['PRODUCTREFERENCE', 'exret']]
# pred_2 = test_df.loc[test_df.date == '2023-11-15', ['PRODUCTREFERENCE', 'exret']]
# pred_3 = test_df.loc[test_df.date == '2023-12-15', ['PRODUCTREFERENCE', 'exret']]
pred_1 = test_df.loc[test_df.date == '2023-07-15', ['PRODUCTREFERENCE', 'exret']]
pred_2 = test_df.loc[test_df.date == '2023-08-15', ['PRODUCTREFERENCE', 'exret']]
pred_3 = test_df.loc[test_df.date == '2023-09-15', ['PRODUCTREFERENCE', 'exret']]



true_df = pred_1.merge(pred_2, on="PRODUCTREFERENCE", how="inner").merge(pred_3, on="PRODUCTREFERENCE", how="inner")
true_df

Unnamed: 0,PRODUCTREFERENCE,exret_x,exret_y,exret
0,29,0.046100,0.002400,-0.064300
1,35,0.002600,0.010800,0.002610
2,441,0.000800,-0.004000,-0.014400
3,727,-0.014238,-0.005774,0.005192
4,814,0.054600,0.018400,-0.000400
...,...,...,...,...
274,105997,0.009600,-0.014700,-0.017900
275,106455,0.003100,-0.014100,-0.019300
276,106484,0.059200,-0.062100,0.053100
277,106485,0.059200,-0.059300,-0.062400


In [98]:
combined_df = predictions_df.dropna().merge(true_df, on="PRODUCTREFERENCE", how="inner")
combined_df

Unnamed: 0,PRODUCTREFERENCE,Prediction_Period_1,Prediction_Period_2,Prediction_Period_3,exret_x,exret_y,exret
0,29,0.019233,0.065684,0.048871,0.046100,0.002400,-0.064300
1,35,0.001127,0.072171,0.048816,0.002600,0.010800,0.002610
2,441,0.021537,0.064752,0.049314,0.000800,-0.004000,-0.014400
3,727,0.009812,0.071117,0.048629,-0.014238,-0.005774,0.005192
4,814,0.020906,0.064102,0.050063,0.054600,0.018400,-0.000400
...,...,...,...,...,...,...,...
264,105997,0.014962,0.067656,0.049218,0.009600,-0.014700,-0.017900
265,106455,0.010205,0.065948,0.048407,0.003100,-0.014100,-0.019300
266,106484,0.000270,0.066112,0.046670,0.059200,-0.062100,0.053100
267,106485,-0.000731,0.065273,0.046860,0.059200,-0.059300,-0.062400


In [99]:
preds = combined_df.iloc[:, 1:3].to_numpy().flatten()
trues = combined_df.iloc[:, 4:6].to_numpy().flatten()

In [100]:
from torcheval.metrics import R2Score

In [101]:
metrics = R2Score()
# input = torch.tensor(preds[:,:,0].flatten())
# target = torch.tensor(trues[:,:,0].flatten())

input = torch.tensor(preds)
target = torch.tensor(trues)

metrics.update(input, target)
print(metrics.compute())

tensor(-1.7813)
