In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h_0 = torch.zeros(num_layers, x.size(0), hidden_size).to(x.device)
        c_0 = torch.zeros(num_layers, x.size(0), hidden_size).to(x.device)
        out, _ = self.lstm(x, (h_0, c_0))
        out = self.fc(out[:, -1, :])  # Take the output from the last time step
        return out

# Custom dataset class
class TimeSeriesDataset(Dataset):
    def __init__(self, df, seq_length, target_length, add_cols, target_col):
        self.seq_length = seq_length
        self.target_length = target_length
        self.add_cols = add_cols
        self.target_col = target_col
        self.scaler = StandardScaler()

        self.data = []
        self.targets = []

        for fund in df['PRODUCTREFERENCE'].unique():
            fund_data = df[df['PRODUCTREFERENCE'] == fund].sort_values('date')
            if len(fund_data) < seq_length + target_length:
                continue
            # Normalize the data
            scaled_data = self.scaler.fit_transform(fund_data[add_cols])

            for i in range(len(fund_data) - seq_length - target_length + 1):
                seq_x = scaled_data[i:i+seq_length]
                seq_y = fund_data[target_col].iloc[i+seq_length:i+seq_length+target_length].values
                self.data.append(seq_x)
                self.targets.append(seq_y)
        
        self.data = np.array(self.data)
        self.targets = np.array(self.targets)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.targets[idx], dtype=torch.float32)




In [2]:
add_cols = [
    'aum', 
    'SENT_', 
    'SMB', 'HML', 'RF', 'mom', 
    'confeature', 'tfpfeature', 'ipgfeature', 
    'termfeature', 'deffeature', 'deifeature', 
    'mktfeature', 'labfeature', 
    'exret'  # Assuming this is the predictor and 'excret' is the target
]

target_col = 'exret'

In [3]:
df = pd.read_csv('dataset/hf/hf.csv')

In [4]:
data_df = df.copy()
data_df = data_df.dropna()
train_df = data_df[data_df.date <= '2023-09-15']
test_df = data_df[data_df.date >= '2023-10-15']

In [6]:
# Hyperparameters
input_size = len(add_cols)
hidden_size = 128
num_layers = 2
output_size = 3  # Predicting the next 3 periods
seq_length = 36  # Length of input sequences
target_length = 3
batch_size = 32
num_epochs = 50
learning_rate = 0.001

# Assuming df is your DataFrame
dataset = TimeSeriesDataset(train_df, seq_length, target_length, add_cols, target_col)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Model, loss, optimizer
model = LSTMModel(input_size, hidden_size, num_layers, output_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [7]:


# Training loop
model.train()
for epoch in range(num_epochs):
    for batch_x, batch_y in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# After training, you can use model.eval() to switch to evaluation mode and make predictions

Epoch [1/50], Loss: 0.0010
Epoch [2/50], Loss: 0.0006
Epoch [3/50], Loss: 0.0007
Epoch [4/50], Loss: 0.0006
Epoch [5/50], Loss: 0.0005
Epoch [6/50], Loss: 0.0005
Epoch [7/50], Loss: 0.0004
Epoch [8/50], Loss: 0.0002
Epoch [9/50], Loss: 0.0004
Epoch [10/50], Loss: 0.0003
Epoch [11/50], Loss: 0.0008
Epoch [12/50], Loss: 0.0003
Epoch [13/50], Loss: 0.0002
Epoch [14/50], Loss: 0.0002
Epoch [15/50], Loss: 0.0004
Epoch [16/50], Loss: 0.0004
Epoch [17/50], Loss: 0.0002
Epoch [18/50], Loss: 0.0002
Epoch [19/50], Loss: 0.0002
Epoch [20/50], Loss: 0.0002
Epoch [21/50], Loss: 0.0001
Epoch [22/50], Loss: 0.0001
Epoch [23/50], Loss: 0.0001
Epoch [24/50], Loss: 0.0002
Epoch [25/50], Loss: 0.0001
Epoch [26/50], Loss: 0.0001
Epoch [27/50], Loss: 0.0002
Epoch [28/50], Loss: 0.0001
Epoch [29/50], Loss: 0.0001
Epoch [30/50], Loss: 0.0001
Epoch [31/50], Loss: 0.0002
Epoch [32/50], Loss: 0.0001
Epoch [33/50], Loss: 0.0001
Epoch [34/50], Loss: 0.0001
Epoch [35/50], Loss: 0.0001
Epoch [36/50], Loss: 0.0001
E

In [7]:
torch.save(model.state_dict(), 'hidden_128_seqlen_36_20epoch.pth')

In [7]:
test_df = data_df[data_df.date >= '2023-10-15']

predict_df = data_df[(data_df.date >= '2018-01-15') & ( data_df.date <= '2023-09-15')]
count_df = predict_df.groupby(['PRODUCTREFERENCE']).agg({'date': 'count'}).reset_index()
predict_df.merge(count_df.loc[count_df.date >= seq_length], on='PRODUCTREFERENCE') 
funds_to_eval = list(predict_df.merge(test_df, on='PRODUCTREFERENCE', how='inner')['PRODUCTREFERENCE'].unique())
predict_df = predict_df[predict_df.PRODUCTREFERENCE.isin(funds_to_eval)]
predict_df['series_id'] = predict_df['PRODUCTREFERENCE']

# if only including the products with enough history
# predict_fund_history_count = predict_df.groupby(['PRODUCTREFERENCE']).agg({'date':'count'}).reset_index()
# predict_funds = list(predict_fund_history_count.loc[predict_fund_history_count.date >= seq_length, 'PRODUCTREFERENCE'])
# predict_df = predict_df[predict_df.PRODUCTREFERENCE.isin(predict_funds)]

In [38]:
# test on training data
test_df = data_df[(data_df.date < '2023-10-15') & (data_df.date >= '2023-07-15')]

predict_df = data_df[(data_df.date >= '2017-10-15') & ( data_df.date <= '2023-06-15')]
count_df = predict_df.groupby(['PRODUCTREFERENCE']).agg({'date': 'count'}).reset_index()
predict_df.merge(count_df.loc[count_df.date >= seq_length], on='PRODUCTREFERENCE') 
funds_to_eval = list(predict_df.merge(test_df, on='PRODUCTREFERENCE', how='inner')['PRODUCTREFERENCE'].unique())
predict_df = predict_df[predict_df.PRODUCTREFERENCE.isin(funds_to_eval)]
predict_df['series_id'] = predict_df['PRODUCTREFERENCE']

# if only including the products with enough history
predict_fund_history_count = predict_df.groupby(['PRODUCTREFERENCE']).agg({'date':'count'}).reset_index()
predict_funds = list(predict_fund_history_count.loc[predict_fund_history_count.date >= seq_length, 'PRODUCTREFERENCE'])
predict_df = predict_df[predict_df.PRODUCTREFERENCE.isin(predict_funds)]

In [9]:
model = LSTMModel(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, output_size=output_size)

# Load the saved model state dict
model.load_state_dict(torch.load('hidden_128_seqlen_36_50epoch_best.pth'))

<All keys matched successfully>

In [10]:
# Initialize a list to store results
results = []

# Iterate over each unique PRODUCTREFERENCE
for product_reference in predict_df['PRODUCTREFERENCE'].unique():
    
    # Extract the latest sequence for this PRODUCTREFERENCE
    fund_data = predict_df[predict_df['PRODUCTREFERENCE'] == product_reference].sort_values('date')

    # Skip this PRODUCTREFERENCE if there's not enough data
    if len(fund_data) < seq_length:
        continue
    
    scaler = StandardScaler()

    # option 1
    # scaled_data = scaler.fit_transform(fund_data[add_cols])
    # scaled_fund_data = scaled_data[-seq_length:]
    
    # option 2
    scaler.fit(fund_data[add_cols])
    scaled_fund_data = scaler.transform(fund_data[add_cols].tail(seq_length))
    print(scaled_fund_data)

    # option 3


    # input data
    input_data = np.expand_dims(scaled_fund_data, axis=0)  # Add batch dimension
    # Convert to PyTorch tensor
    input_tensor = torch.tensor(input_data, dtype=torch.float32)


    # Switch model to evaluation mode
    model.eval()

    # Make prediction
    with torch.no_grad():
        prediction = model(input_tensor)

    # Convert prediction to numpy array
    predicted_values = prediction.numpy().flatten()
    
    print(predicted_values)

    # (Optional) Inverse scale the prediction if you scaled the data
    # predicted_values_original_scale = scaler.inverse_transform(predicted_values.reshape(-1, 1)).flatten()
    predicted_values_original_scale = predicted_values

    # Store the result
    results.append({
        'PRODUCTREFERENCE': product_reference,
        'Prediction_Period_1': predicted_values_original_scale[0],
        'Prediction_Period_2': predicted_values_original_scale[1],
        'Prediction_Period_3': predicted_values_original_scale[2]
    })

# Convert the results to a DataFrame for easier handling
predictions_df = pd.DataFrame(results)

# Display or save the predictions
print(predictions_df)

[[-7.51145774e-01 -3.90578341e-01  1.62480922e+00  1.03346296e+00
  -9.66404769e-01 -7.17155933e-01 -7.45065733e-02  1.92864827e-01
   4.71369578e-02  5.28910005e-01 -5.37801667e-02 -2.59332697e-01
  -5.90847784e-01 -6.04953951e-01  5.07292424e-02]
 [-1.38926922e-01 -2.84740911e-01  2.46121201e+00  5.41095475e-01
  -9.66404769e-01 -2.87653097e+00 -9.71758047e-02  1.78248130e-01
   5.76283113e-02  1.64864501e-01  5.53572928e-01  8.10316408e-02
   2.29423492e+00 -5.85805813e-01  2.19639085e+00]
 [ 1.81777501e-01 -6.80539867e-02  1.67560291e+00 -2.79516992e-01
  -9.66404769e-01 -5.49102226e-01 -2.07152904e-01  3.27429528e-01
   4.01890283e-02  1.81296174e+00  6.69001618e-01  1.51021902e-01
   7.68828540e-01 -5.22923260e-01  8.25156778e-01]
 [ 2.66252147e-01  4.10187590e-01  2.40364583e+00  7.50807550e-01
  -9.66404769e-01  1.02093378e+00 -3.69134477e-01  1.56766987e-01
   5.60431574e-02  1.37471621e+00  7.08842387e-01 -4.90080481e-01
  -1.81951498e-01  2.08621921e+00  1.83352918e-01]
 [ 7

In [11]:
predictions_df.dropna()

Unnamed: 0,PRODUCTREFERENCE,Prediction_Period_1,Prediction_Period_2,Prediction_Period_3
0,29,-0.040580,0.040314,0.002238
1,35,-0.023929,0.009857,-0.000618
2,441,-0.015323,0.004444,0.012471
3,727,0.002532,0.001800,0.017026
4,814,-0.010991,0.016613,-0.019120
...,...,...,...,...
285,106446,0.009624,0.024002,0.002572
286,106455,-0.007025,0.037528,0.017425
287,106484,-0.014801,0.003710,0.000080
288,106485,-0.002320,0.035080,0.013769


In [12]:
pred_1 = test_df.loc[test_df.date == '2023-10-15', ['PRODUCTREFERENCE', 'exret']]
pred_2 = test_df.loc[test_df.date == '2023-11-15', ['PRODUCTREFERENCE', 'exret']]

true_df = pred_1.merge(pred_2, on="PRODUCTREFERENCE", how="inner")
true_df

Unnamed: 0,PRODUCTREFERENCE,exret_x,exret_y
0,29,-0.059800,0.054700
1,35,-0.017070,0.025500
2,441,-0.021400,0.025700
3,727,-0.005193,-0.015797
4,814,-0.020300,0.035200
...,...,...,...
275,105997,-0.015300,0.020800
276,106455,0.041700,0.045000
277,106484,-0.027600,0.098700
278,106485,-0.027600,0.098600


In [13]:
combined_df = predictions_df.dropna().merge(true_df, on="PRODUCTREFERENCE", how="inner")
combined_df

Unnamed: 0,PRODUCTREFERENCE,Prediction_Period_1,Prediction_Period_2,Prediction_Period_3,exret_x,exret_y
0,29,-0.040580,0.040314,0.002238,-0.059800,0.054700
1,35,-0.023929,0.009857,-0.000618,-0.017070,0.025500
2,441,-0.015323,0.004444,0.012471,-0.021400,0.025700
3,727,0.002532,0.001800,0.017026,-0.005193,-0.015797
4,814,-0.010991,0.016613,-0.019120,-0.020300,0.035200
...,...,...,...,...,...,...
265,105997,0.011878,0.047895,0.015964,-0.015300,0.020800
266,106455,-0.007025,0.037528,0.017425,0.041700,0.045000
267,106484,-0.014801,0.003710,0.000080,-0.027600,0.098700
268,106485,-0.002320,0.035080,0.013769,-0.027600,0.098600


In [14]:
preds = combined_df.iloc[:, 1:3].to_numpy().flatten()
trues = combined_df.iloc[:, 4:].to_numpy().flatten()

In [15]:
from torcheval.metrics import R2Score

In [16]:
metrics = R2Score()
# input = torch.tensor(preds[:,:,0].flatten())
# target = torch.tensor(trues[:,:,0].flatten())

input = torch.tensor(preds)
target = torch.tensor(trues)

metrics.update(input, target)
print(metrics.compute())

tensor(0.1992)
