# Modeling

In [165]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split

In [166]:
# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Preparing Data

In [167]:
df_time_series = pd.read_csv('../../../data/df_monthly_returns_complete_percentage.csv', index_col='Date')

df_time_series = df_time_series.loc[:, ~df_time_series.columns.str.contains('^Unnamed')]


In [168]:
df_time_series = df_time_series - 1

### Normalisation

In [169]:
df_ts_torch = torch.from_numpy(df_time_series.values)
# Reshape to (num_samples, num_features) for normalization
df_ts_flat = df_ts_torch.view(-1, df_ts_torch.shape[-1])  # Shape: (1000*300, 5)
print(df_ts_flat)

# Calculate min and max per feature
df_min = df_ts_flat.min(dim=0, keepdim=True)[0]
df_max = df_ts_flat.max(dim=0, keepdim=True)[0]

# Apply Min-Max normalization
df_ts_normalised = (df_ts_flat - df_min) / (df_max - df_min)

# Reshape back to original shape
df_time_series_torch = df_ts_normalised.view(df_ts_torch.shape)


tensor([[    nan,     nan,     nan,  ...,     nan,     nan,     nan],
        [ 0.1300,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.1600],
        [ 0.1100,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0700],
        ...,
        [ 0.1900,  0.0800,  0.0200,  ...,  0.2800, -0.1000,  0.0900],
        [-0.0400, -0.2200,  0.0600,  ..., -0.0700, -0.1100,  0.0400],
        [-0.0100, -0.0500,  0.1000,  ..., -0.0500, -0.0600, -0.0100]],
       dtype=torch.float64)


### Create Sequence

In [170]:

# Parameters
T = 12  # Sequence length (months)
num_tickers = len(df_time_series.columns)

# Preparing training data
sequences = []
targets = []

# Create sequences and targets
for i in range(len(df_time_series) - T):
    # Extract sequence of T months
    sequence = df_time_series.iloc[i:i + T].values  # Shape: (T, num_tickers)
    target = df_time_series.iloc[i + T].values      # Shape: (num_tickers)

    sequences.append(sequence)
    targets.append(target)

# Convert to numpy arrays
X = np.array(sequences)  # Shape: (number of sequences, T, num_tickers)
y = np.array(targets)     # Shape: (number of sequences, num_tickers)

# Print shapes of the training and target data
print("Training data shape (X):", X.shape)  # Should be (288, 12, 5)
print("Target data shape (y):", y.shape)     # Should be (288, 5)


Training data shape (X): (288, 12, 1653)
Target data shape (y): (288, 1653)


### Train-Test Split

In [171]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = (torch.tensor(X_train, dtype=torch.float32),
                                    torch.tensor(X_test, dtype=torch.float32),
                                    torch.tensor(y_train, dtype=torch.float32),
                                    torch.tensor(y_test, dtype=torch.float32))
# Check the shapes of the training and test data
print("Shape of X_train:", X_train.shape)  # Should be (230, 12, 5) for 80% of 288
print("Shape of y_train:", y_train.shape)  # Should be (230, 5)
print("Shape of X_test:", X_test.shape)    # Should be (58, 12, 5) for 20% of 288
print("Shape of y_test:", y_test.shape)    # Should be (58, 5)

Shape of X_train: torch.Size([230, 12, 1653])
Shape of y_train: torch.Size([230, 1653])
Shape of X_test: torch.Size([58, 12, 1653])
Shape of y_test: torch.Size([58, 1653])


### LSTM Model

In [172]:
# Define LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size=1, hidden_size=128, output_size=1):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=False)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)  # (seq_len, batch, hidden_size)
        final_output = lstm_out[:, -1, :]  # Last time step output
        return self.fc(final_output)  # (batch, output_size)

# Model, Loss, Optimizer
model = LSTMModel(input_size=len(df_time_series.columns), output_size=len(df_time_series.columns)).to(device)
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [173]:
EPOCHS = 100 # @TODO increase to 100
batch_size = 32

train_loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=False, batch_size=batch_size)

y_pred_all = torch.tensor([])

# Training Loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for index, (X_batch, y_batch) in enumerate(train_loader):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        if (torch.any(torch.isnan(X_batch)))|(torch.any(torch.isnan(y_batch))):
            print("NaN values found in", epoch, i)
            continue
        # Forward pass
        optimizer.zero_grad()
        y_pred = model(X_batch)
        # @TODO check here
        y_pred = y_pred.squeeze(-1)  # Remove last dim for (batch, 1653)

        # Compute loss
        loss = criterion(y_pred, y_batch)
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()
        print(y_pred_all.shape, y_pred.shape)
        if epoch == EPOCHS - 1:
            y_pred_all = torch.cat([y_pred_all, y_pred], dim=0)

    # Print epoch loss
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss / len(train_loader):.6f}")

# Save the trained model
torch.save(model.state_dict(), "lstm_univariate.pth")
print("Model training complete and saved.")


torch.Size([0]) torch.Size([32, 1653])
torch.Size([0]) torch.Size([32, 1653])
torch.Size([0]) torch.Size([32, 1653])
NaN values found in 0 287
torch.Size([0]) torch.Size([32, 1653])
torch.Size([0]) torch.Size([32, 1653])
torch.Size([0]) torch.Size([32, 1653])
torch.Size([0]) torch.Size([6, 1653])
Epoch 1/100, Loss: 0.123437
torch.Size([0]) torch.Size([32, 1653])
torch.Size([0]) torch.Size([32, 1653])
torch.Size([0]) torch.Size([32, 1653])
NaN values found in 1 287
torch.Size([0]) torch.Size([32, 1653])
torch.Size([0]) torch.Size([32, 1653])
torch.Size([0]) torch.Size([32, 1653])
torch.Size([0]) torch.Size([6, 1653])
Epoch 2/100, Loss: 0.120447
torch.Size([0]) torch.Size([32, 1653])
torch.Size([0]) torch.Size([32, 1653])
torch.Size([0]) torch.Size([32, 1653])
NaN values found in 2 287
torch.Size([0]) torch.Size([32, 1653])
torch.Size([0]) torch.Size([32, 1653])
torch.Size([0]) torch.Size([32, 1653])
torch.Size([0]) torch.Size([6, 1653])
Epoch 3/100, Loss: 0.118839
torch.Size([0]) torch.

In [174]:
pd.DataFrame(pd.DataFrame(y_pred_all.T.detach().numpy())).mean()

0     -0.007377
1      0.008797
2      0.042023
3      0.023645
4      0.060715
         ...   
193   -0.029992
194    0.028650
195    0.052810
196    0.003812
197    0.153400
Length: 198, dtype: float32

In [175]:
pd.DataFrame(torch.tensor(y, dtype=torch.float32))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1643,1644,1645,1646,1647,1648,1649,1650,1651,1652
0,0.00,0.00,0.00,0.00,0.06,-0.02,-0.05,-0.01,0.00,0.07,...,0.00,0.00,0.00,-0.08,-0.09,0.33,0.00,0.00,0.00,0.21
1,0.00,0.00,-0.10,0.00,-0.23,0.01,-0.34,0.00,0.00,-0.06,...,0.00,0.00,0.00,0.07,0.00,-0.34,0.00,0.00,0.00,-0.01
2,0.00,0.00,0.00,-0.06,0.03,0.05,0.22,0.00,-0.04,0.05,...,0.00,0.00,0.00,0.09,0.00,-0.16,0.00,0.00,0.00,0.09
3,0.01,0.00,0.55,0.04,0.09,-0.02,0.28,-0.08,0.02,-0.06,...,0.00,0.29,-0.09,-0.02,-0.04,0.69,-0.14,-0.24,0.00,0.05
4,0.00,-0.03,0.09,-0.13,-0.05,-0.01,-0.11,-0.02,-0.10,-0.08,...,-0.48,0.00,0.00,0.01,0.00,-0.26,0.00,0.00,0.00,0.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,-0.05,0.10,0.06,0.02,0.14,0.12,0.12,0.06,0.06,0.06,...,0.09,0.07,0.16,0.05,-0.01,0.08,0.03,-0.02,-0.14,-0.02
284,0.00,-0.04,-0.04,-0.03,-0.05,-0.04,-0.06,-0.06,-0.05,0.11,...,0.07,-0.09,0.18,-0.11,0.01,-0.10,0.19,0.01,0.14,-0.02
285,0.19,0.08,0.02,0.06,0.12,0.22,0.05,0.09,0.07,-0.08,...,0.10,0.07,0.06,0.16,-0.06,-0.04,0.14,0.28,-0.10,0.09
286,-0.04,-0.22,0.06,0.08,-0.02,-0.02,0.03,0.07,0.10,0.06,...,-0.05,0.04,0.16,-0.13,-0.04,0.00,-0.03,-0.07,-0.11,0.04


### Returns vs Predicted

In [176]:
# Compute average portfolio returns over all assets (per time step)
true_avg = pd.DataFrame(torch.tensor(y, dtype=torch.float32)).mean(axis=1)
pred_avg = pd.DataFrame(pd.DataFrame(y_pred_all.detach().numpy())).mean(axis=1)

print(len(true_avg), len(pred_avg))
pred_avg = pred_avg.reindex(range(len(true_avg)))

# Time indices
time_steps = np.arange(len(df_time_series))
print(len(pred_avg))
table = pd.DataFrame( {"Predicted returns": pred_avg.tolist(), "Actual returns": true_avg.tolist()})
table

288 198
288


Unnamed: 0,Predicted returns,Actual returns
0,-0.007377,-0.010690
1,0.008797,-0.028258
2,0.042023,0.009407
3,0.023645,0.039165
4,0.060715,-0.031022
...,...,...
283,,0.034162
284,,-0.006757
285,,0.055251
286,,-0.002801


In [177]:
# Plotly Visualization
fig = go.Figure()

fig.add_trace(go.Scatter(x=df_time_series.index.tolist(), y=true_avg, mode='lines', name='Actual Returns',
                         line=dict(color='#5c839f', width=2)))
fig.add_trace(go.Scatter(x=df_time_series.index.tolist(), y=pred_avg, mode='lines', name='Predicted Returns',
                         line=dict(color='green')))

# Layout settings
fig.update_layout(
    title="Portfolio Monthly Returns: Predicted vs Actual",
    legend_title="Legend",
    template="plotly_white",
    xaxis=dict(
        title='Date'
    ),
    yaxis=dict(
        title='Average Monthly Portfolio Return (%)',
        tickformat='.0%',
        range=[-0.2,0.2]
    ),
    legend=dict(title="Legend")
)

# Show plot
fig.show()