In [143]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
#
from pypfopt import EfficientFrontier
from pypfopt import risk_models
from pypfopt import expected_returns
from pypfopt.discrete_allocation import DiscreteAllocation, get_latest_prices
#
import importlib
import utilities.train_test as train_test

In [144]:
# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Preparing Data

In [145]:
df_time_series = pd.read_csv('../../../data/df_monthly_returns_complete_percentage.csv', index_col='Date')

df_time_series = df_time_series.loc[:, ~df_time_series.columns.str.contains('^Unnamed')]

In [146]:
df_time_series = df_time_series

In [147]:
# 
df_time_series_plus1 = df_time_series
df_time_series = df_time_series - 1

### Normalisation

In [148]:
''' 
df_ts_torch = torch.from_numpy(df_time_series.values)
# Reshape to (num_samples, num_features) for normalization
df_ts_flat = df_ts_torch.view(-1, df_ts_torch.shape[-1])  # Shape: (1000*300, 5)

# Calculate min and max per feature
df_min = df_ts_flat.min(dim=0, keepdim=True)[0]
df_max = df_ts_flat.max(dim=0, keepdim=True)[0]

# Apply Min-Max normalization
df_ts_normalised = (df_ts_flat - df_min) / (df_max - df_min)

# Reshape back to original shape
df_time_series_torch = df_ts_normalised.view(df_ts_torch.shape)
'''


' \ndf_ts_torch = torch.from_numpy(df_time_series.values)\n# Reshape to (num_samples, num_features) for normalization\ndf_ts_flat = df_ts_torch.view(-1, df_ts_torch.shape[-1])  # Shape: (1000*300, 5)\n\n# Calculate min and max per feature\ndf_min = df_ts_flat.min(dim=0, keepdim=True)[0]\ndf_max = df_ts_flat.max(dim=0, keepdim=True)[0]\n\n# Apply Min-Max normalization\ndf_ts_normalised = (df_ts_flat - df_min) / (df_max - df_min)\n\n# Reshape back to original shape\ndf_time_series_torch = df_ts_normalised.view(df_ts_torch.shape)\n'

### Split the data into training and testing sets

### Train-Test Split

In [149]:
importlib.reload(train_test)
# Set sequence length (e.g., 10 time points)
in_seq_length = 12
out_seq_length = 12

# Create sequences for the normalized data
X, X_static, y = train_test.create_sequences(df_time_series, [], in_seq_length, out_seq_length)
print(X.shape, y.shape)
test_months = 5 * 12
# skip first item
X_train = X[:len(X) - test_months]
X_test = X[(len(X) - test_months):]

y_train = y[:(len(X) - test_months)]
y_test = y[(len(X) - test_months):]

# Check the shapes of the training and test data
print("Shape of X_train:", X_train.shape)  # Should be (230, 12, 5) for 80% of 288
print("Shape of y_train:", y_train.shape)  # Should be (230, 5)
print("Shape of X_test:", X_test.shape)    # Should be (58, 12, 5) for 20% of 288
print("Shape of y_test:", y_test.shape)    # Should be (58, 5)

torch.Size([276, 1653, 12]) torch.Size([276, 1653, 12])
Shape of X_train: torch.Size([216, 1653, 12])
Shape of y_train: torch.Size([216, 1653, 12])
Shape of X_test: torch.Size([60, 1653, 12])
Shape of y_test: torch.Size([60, 1653, 12])


## 1 Month

### LSTM Model

In [225]:
# Define LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size=128, num_layers=1, output_size=1, learning_rate=0.001, dropout=0.2): # , hidden_size=128
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        # LSTM for time-series data (stock returns)
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=hidden_size,
                            # num_layers=num_layers,
                            batch_first=True)

        # FC layer for final prediction
        self.fc_final = nn.Linear(hidden_size, 12)

    def forward(self, ts_batch): # ts_batch (64, 1653, 10), static_data (64, 1653, 44)
        # Time-Series Data
        # Reshape dynamic data for LSTM (requires time-step as 2nd dimension)
        batch_size, num_stocks, sequence_length = ts_batch.shape[0], ts_batch.shape[1], ts_batch.shape[2]
        ts_batch_reshaped = ts_batch.view(batch_size * num_stocks, sequence_length)
        print('input 1', ts_batch_reshaped.shape)
        #
        ts_output_1, (hidden, cell)  = self.lstm(ts_batch_reshaped) # ts_batch_reshaped
        print('output 1', ts_output_1.shape)
        ts_output = ts_output_1.view(batch_size, num_stocks, self.hidden_size) # , -1 # 64, 1653, 10, 128
        #
        # ts_output_2 = self.fc_lstm(ts_output)
        print(self.fc_final(ts_output))
        #fc_final = nn.Linear(sequence_length, 1)
        # prediction =   # (64, 1653, 10)

        return self.fc_final(ts_output)#.squeeze(-1) # ts_output_2

# Model, Loss, Optimizer
model = LSTMModel(input_size=in_seq_length, output_size=out_seq_length).to(device)
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [226]:
EPOCHS = 1 # 100
batch_size = 32

loss_fn = nn.MSELoss()
train_loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=False, batch_size=batch_size)
test_loader = data.DataLoader(data.TensorDataset(X_test, y_test), shuffle=False, batch_size=batch_size)

y_train_pred_all = torch.tensor([])
y_test_pred_all = torch.tensor([])

# Training Loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for index, (X_batch, y_batch) in enumerate(train_loader):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        if torch.any(torch.isnan(X_batch)):
            print("NaN values found during training: X_batch: ", epoch, index, X_batch[0][0])
            continue
        if torch.any(torch.isnan(y_batch)):
            print("NaN values found during training: y_batch: ", epoch, index, y_batch[0][0])
            continue
        # Forward pass
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        y_pred = y_pred.squeeze(-1)  # @TODO check here - Remove last dim for (batch, 1653)

        # Compute loss
        loss = criterion(y_pred, y_batch)
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()
        # Join all batch predictions together - only last epoch where model is fully trained
        if epoch == EPOCHS - 1:
            y_train_pred_all = torch.cat([y_train_pred_all, y_pred], dim=0)

    # Validation - Root-mean-square-error
    # if epoch != 1 and epoch % 100 != 0:
    #    continue
    model.eval()
    y_test_pred = torch.tensor([])
    with torch.no_grad():
        for index, (X_batch, y_batch) in enumerate(test_loader):
            #print('test', X_batch.shape, y_batch.shape)
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            if torch.any(torch.isnan(X_batch)):
                print("NaN values found during validation. X_batch: ", epoch, index, X_batch[0][0])
                continue
            if torch.any(torch.isnan(y_batch)):
                print("NaN values found during validation. y_batch: ", epoch, index, y_batch[0][0])
                continue
                
                y_test_pred = model(X_batch)
                y_test_pred = y_test_pred.squeeze(-1)  # @TODO check here - Remove last dim for (batch, 1653)
            # Join all batch predictions together
            if epoch == EPOCHS - 1:
                y_test_pred_all = torch.cat([y_test_pred_all, y_test_pred], dim=0)
        #
        y_train_pred = model(X_train)
        train_rmse = np.sqrt(loss_fn(y_train_pred, y_train))
        y_test_pred = model(X_test)
        test_rmse = np.sqrt(loss_fn(y_test_pred, y_test))
    # Print epoch loss
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss / len(train_loader):.6f}")

# Save the trained model
torch.save(model.state_dict(), "lstm_univariate.pth")
print("Model training complete and saved.")

input 1 torch.Size([52896, 12])
output 1 torch.Size([52896, 128])
tensor([[[ 0.0319,  0.0091, -0.0426,  ..., -0.0731,  0.0806,  0.0826],
         [ 0.0308,  0.0210, -0.0372,  ..., -0.0679,  0.0901,  0.0852],
         [ 0.0303,  0.0278, -0.0285,  ..., -0.0635,  0.0981,  0.0895],
         ...,
         [ 0.0351,  0.0391, -0.0324,  ..., -0.0630,  0.1019,  0.0867],
         [ 0.0377,  0.0395, -0.0155,  ..., -0.0615,  0.1019,  0.0770],
         [ 0.0326,  0.0376, -0.0209,  ..., -0.0608,  0.1086,  0.0871]],

        [[ 0.0294,  0.0403, -0.0237,  ..., -0.0595,  0.1082,  0.0875],
         [ 0.0288,  0.0412, -0.0250,  ..., -0.0584,  0.1079,  0.0887],
         [ 0.0296,  0.0407, -0.0301,  ..., -0.0584,  0.1085,  0.0841],
         ...,
         [ 0.0286,  0.0426, -0.0251,  ..., -0.0603,  0.1035,  0.0969],
         [ 0.0320,  0.0379, -0.0397,  ..., -0.0679,  0.1030,  0.0871],
         [ 0.0327,  0.0423, -0.0345,  ..., -0.0613,  0.1032,  0.0848]],

        [[ 0.0315,  0.0419, -0.0314,  ..., -0.0597

In [184]:
y_train_pred.shape

torch.Size([216, 1653, 12, 1])

In [185]:
y_train.shape

torch.Size([216, 1653, 12])

### Returns vs Predicted

In [None]:
# Compute average portfolio returns over all assets (per time step)
true_avg = pd.DataFrame(torch.tensor(y, dtype=torch.float32)).mean(axis=1)
pred_train_avg = pd.DataFrame(pd.DataFrame(y_train_pred_all.detach().numpy())).mean(axis=1)
pred_test_avg = pd.DataFrame(pd.DataFrame(y_test_pred_all.detach().numpy())).mean(axis=1)

'''pred_avg = pred_avg.reindex(range(len(true_avg)))

# Time indices
time_steps = np.arange(len(df_time_series))
print(len(pred_avg))
table = pd.DataFrame( {"Predicted returns": pred_avg.tolist(), "Actual returns": true_avg.tolist()})
table'''

In [None]:
# Plotly Visualization
fig = go.Figure()

fig.add_trace(go.Scatter(x=df_time_series.index.tolist(), y=true_avg, mode='lines', name='Actual Returns',
                         line=dict(color='#5c839f', width=2)))

# Add the training plot in red
fig.add_trace(go.Scatter(x=df_time_series.index.tolist(), y=pred_train_avg, mode='lines', name='Train returns',
                         line=dict(color='red', width=2)))

fig.add_trace(go.Scatter(x=df_time_series.index.tolist()[len(pred_train_avg):], y=pred_test_avg, mode='lines', name='Predicted Returns',
                         line=dict(color='green')))

# Layout settings
fig.update_layout(
    title="Portfolio Monthly Returns: Predicted vs Actual",
    legend_title="Legend",
    template="plotly_white",
    xaxis=dict(
        title='Date'
    ),
    yaxis=dict(
        title='Average Monthly Portfolio Return (%)',
        tickformat='.0%',
        range=[-0.2,0.2]
    ),
    legend=dict(title="Legend")
)

# Show plot
fig.show()

In [None]:
y_test_pred_all

## Sharpe Ratio

### Prediction to Dataframe

In [None]:
y_test_pred_all = y_test_pred_all + 1
#
df_pred = pd.DataFrame(y_test_pred_all)
df_pred.columns = df_time_series.columns
df_pred

In [None]:
def build_efficient_frontier(df_pred):
    # Calculate expected returns and sample covariance
    mu_0 = expected_returns.mean_historical_return(df_pred, frequency=12)
    # Get only tickers with a mean historical return of at least 5% 
    optimal_tickers = mu_0[mu_0 > 0.05].index

    df_optimal = df_pred[optimal_tickers]
    
    mu = expected_returns.mean_historical_return(df_optimal)
    S = risk_models.CovarianceShrinkage(df_optimal).ledoit_wolf() # Exponential Covariance

    # Optimize for maximal Sharpe ratio
    ef = EfficientFrontier(mu, S)
    ef_new = EfficientFrontier(mu, S)

    raw_weights = ef.max_sharpe()
    cleaned_weights = ef.clean_weights()
    ef.save_weights_to_file("weights.csv")  # saves to file
    #
    ef.portfolio_performance(verbose=True)

    return df_optimal
# @TODO to check results - df_pred is just the test results, ok?
build_efficient_frontier( df_pred )

### Optimal Allocation