In [37]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
#
import utilities.train_test.train_test as train_test

## Preparing Data

In [38]:
df_static = pd.read_csv('../../../data/df_overview.csv', index_col=0)
df_time_series = pd.read_csv('../../../data/df_monthly_returns_complete.csv', index_col='Date')

### Define Source and TargetColumns:
1. Static fields:
    * Industry
    * Market capital
    * Trailing P/E ratio
    * Beta
    * Return on equity
2. Time series data:
    * Market returns

In [39]:
df_static_columns = []
# Industry
df_static_columns = [s for s in df_static.columns.to_list() if "industry_" in s]
# Stock ticker
#df_static_columns.append('stock_ticker_symbol')
df_static_columns.append('stock_ticker_label')
#
df_static_columns.append('company_esg_score')
#
df_static_columns.append('market_capital_scale')
df_static_columns.append('trailing_pe')
df_static_columns.append('beta')
df_static_columns.append('return_on_equity')

In [40]:
df_time_series = df_time_series.loc[:, ~df_time_series.columns.str.contains('^Unnamed')]

## Normalisation

In [41]:
df_ts_torch = torch.from_numpy(df_time_series.values)
# Reshape to (num_samples, num_features) for normalization
df_ts_flat = df_ts_torch.view(-1, df_ts_torch.shape[-1])  # Shape: (1000*300, 5)
print(df_ts_flat)

# Calculate min and max per feature
df_min = df_ts_flat.min(dim=0, keepdim=True)[0]
df_max = df_ts_flat.max(dim=0, keepdim=True)[0]

# Apply Min-Max normalization
df_ts_normalised = (df_ts_flat - df_min) / (df_max - df_min)

# Reshape back to original shape
df_time_series_torch = df_ts_normalised.view(df_ts_torch.shape)


tensor([[1.9242e+02, 1.1570e+01, 4.4820e+01,  ..., 4.0000e-01, 5.6376e+01,
         8.5888e+00],
        [2.1815e+02, 1.1570e+01, 4.4820e+01,  ..., 4.0000e-01, 5.6376e+01,
         7.2504e+00],
        [2.4159e+02, 1.1570e+01, 4.4820e+01,  ..., 4.0000e-01, 5.6376e+01,
         6.7305e+00],
        ...,
        [8.1800e+02, 2.3700e+01, 1.3950e+01,  ..., 1.7730e+01, 1.9980e+03,
         2.4542e+02],
        [7.8300e+02, 1.8450e+01, 1.4830e+01,  ..., 1.6520e+01, 1.7740e+03,
         2.5473e+02],
        [7.7800e+02, 1.7590e+01, 1.6310e+01,  ..., 1.5620e+01, 1.6610e+03,
         2.5258e+02]], dtype=torch.float64)


## Prepare static shape

In [42]:
df_static_sel = df_static[df_static_columns]
df_static_torch = torch.tensor(df_static_sel.to_numpy(), dtype=torch.float32)
df_static_sel

Unnamed: 0,industry_Auto Components,industry_Automobiles,industry_Banks,industry_Building Products,industry_Chemicals,industry_Commercial Services,industry_Construction Materials,industry_Consumer Durables,industry_Consumer Services,industry_Containers & Packaging,...,industry_Traders & Distributors,industry_Transportation,industry_Transportation Infrastructure,industry_Utilities,stock_ticker_label,company_esg_score,market_capital_scale,trailing_pe,beta,return_on_equity
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1371,4.5,0.000273,19.858974,0.863,0.132260
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1018,4.5,0.000024,21.370369,1.258,0.038540
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1485,4.6,0.000173,60.117207,1.037,-0.037660
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1056,5.1,0.000434,60.117207,0.957,-0.081260
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1400,5.3,0.000086,17.210526,1.314,0.106330
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1657,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1040,20.0,0.000006,56.325687,0.594,-0.145360
1658,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1130,20.0,0.000044,2035.000000,1.238,0.080959
1659,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1165,20.0,0.000121,50.163414,1.191,-0.083910
1660,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,141,20.0,0.010681,27.933996,0.585,0.135850


In [43]:
# Add a batch dimension of similar size as time series
batch_size = df_ts_torch.shape[0]
expanded_data = np.expand_dims(df_static_torch, axis=0)  # Shape becomes (1, 1653, 44)
df_static_torch = np.tile(expanded_data, (batch_size, 1, 1))  # Shape becomes (300, 1653, 44)

print(df_static_torch.shape)  # Output: (300, 1653, 44)

(300, 1653, 44)


### Split the data into training and testing sets

In [44]:
train_size = int(len(df_time_series) * 0.8)
test_size = len(df_time_series) - train_size
train, test = df_time_series[:train_size], df_time_series[train_size:]

# Set sequence length (e.g., 10 time points)
in_seq_length = 12
out_seq_length = 12

# Create sequences for the normalized data
X_ts, X_static, y_ts = train_test.create_sequences(df_time_series, in_seq_length, out_seq_length)

test_months = 60
X_ts_train = X_ts.head(len(df_time_series) - test_months)
X_ts_test = X_ts.tail(test_months)

# @TODO recheck if we need it, it might help
X_static_train = X_static.head(len(df_time_series) - test_months)
X_static_test = X_static.tail(test_months)

y_train = y_ts.head(len(df_time_series) - test_months)
y_test = y_ts.tail(test_months)

test_size = len(df_time_series) - train_size

#X_ts_train, X_static_train, y_train = train_test.create_sequences(train, in_seq_length, out_seq_length)
#X_ts_test, X_static_test, y_test = train_test.create_sequences(test, in_seq_length, out_seq_length)


# Check the shapes
print('---------------------------------------------------------------')
print(f"X_ts shape: {X_ts.shape}, X_static shape: {X_static.shape}, y_ts shape: {y_ts.shape}")
print(f"X_ts_train shape: {X_ts_train.shape}, X_static_train shape: {X_static_train.shape}, y_train shape: {y_train.shape}")
print(f"X_ts_test shape: {X_ts_test.shape}, X_static_test shape: {X_static_test.shape}, y_test shape: {y_test.shape}")

TypeError: 'int' object is not subscriptable

In [35]:
X_ts[287][0]

tensor([713.7233, 657.2387, 725.3696, 804.2289, 770.8666, 739.4667, 713.1694,
        723.1781, 690.3063, 687.8532, 818.0000, 783.0000])

In [36]:
y_ts[287-12][0]

tensor([713.7233, 657.2387, 725.3696, 804.2289, 770.8666, 739.4667, 713.1694,
        723.1781, 690.3063, 687.8532, 818.0000, 783.0000])

## LSTM Multivariate - model

In [9]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, static_feature_size, hidden_size=128, num_layers=1, output_size=1, learning_rate=0.001, dropout=0.2): # , hidden_size=128
        super(LSTMModel, self).__init__()
        # 4.3. LSTM for time-series data (stock returns)
        self.lstm = nn.LSTM(input_size=1,
                            hidden_size=hidden_size,
                            # num_layers=num_layers,
                            batch_first=True)

        self.fc_lstm = nn.Linear(128, 1)
        # FC layer for final prediction
        self.fc_final = nn.Linear(256, 1)  # 256 for combined, 128 for only time-series

        # Final output layer
        self.fc = nn.Linear(1 + static_feature_size, output_size)

    def forward(self, ts_batch, static_data): # ts_batch (64, 1653, 10), static_data (64, 1653, 44)
        # Time-Series Data
        # Reshape dynamic data for 4.3. LSTM (requires time-step as 2nd dimension)
        batch_size, num_stocks, sequence_length = ts_batch.shape[0], ts_batch.shape[1], ts_batch.shape[2]
        ts_batch_reshaped = ts_batch.view(batch_size * num_stocks, sequence_length, 1)
        #
        #print('before lstm', ts_batch_reshaped.shape)
        ts_output_1, (hidden, cell)  = self.lstm(ts_batch_reshaped) # ts_batch_reshaped
        #print('after lstm', ts_output_1.shape)
        ts_output = ts_output_1.view(batch_size, num_stocks, sequence_length, -1) # 64, 1653, 10, 128

        ts_output_2 = self.fc_lstm(ts_output) # ts_output
        #print('after linear', ts_output_2.shape)
        
        # Static Data - Fully connected layer for static data
        # fc_static = nn.Sequential(nn.Linear(44, 128), nn.ReLU())
        # static_output = fc_static(static_data) # 64, 1653, 128

        # Expand static features to match sequence length
        # static_expanded = static_output.unsqueeze(2).expand(-1, -1, sequence_length, -1)  # (64, 1653, 10, 128)

        # Combine dynamic and static outputs
        # combined_output = static_output # torch.cat([ts_output, static_expanded], dim=-1)  # (64, 1653, 10, 256)

        #
        #fc_final = nn.Linear(sequence_length, 1)
        # prediction = fc_final(ts_output_2).squeeze(-1)  # (64, 1653, 10)
        #
        # print(prediction.shape)
        return ts_output_2 # prediction # ts_output 

## Training and Testing

### Train

In [10]:
# Input size: (num_stocks + num_static_features)
hidden_size = 128
batch_size = 64
seq_length = 10
#
input_size = batch_size + df_static_torch.shape[1]  # stocks number + 3 static features
output_size = df_static_torch.shape[1]  # Predicting returns for each stock

# Instantiate the model
model = LSTMModel(input_size=input_size,
                  hidden_size=hidden_size,
                  static_feature_size=len(df_static_torch[0]))
#
optimizer = optim.Adam(model.parameters())
loss_fn = nn.MSELoss()
train_loader = data.DataLoader(data.TensorDataset(X_ts_train, X_static_train, y_train), shuffle=False, batch_size=batch_size)

model.train()
train_loss = 0.0
#
n_epochs = 100 # @todo test with 100 epochs
for epoch in range(n_epochs):
    for i, (X_batch, X_static, y_batch) in enumerate(train_loader):
        y_pred = model(X_batch, X_static)
        #print(combined_output.shape)
        # Sequence-level loss
        #y_pred_sequence = fc_final(combined_output).squeeze(-1)  # (64, 1653, 10)
        #loss_sequence = loss_fn(y_pred_sequence, y_batch)  # Target shape: (64, 1653, 10)
        # Final-step loss

        # TODO resume here, sequence of y_pred_final or combined_output is buggy, it leads to wrong loss
        if i == 0:
            print('X″', X_batch.shape, X_batch[0][0], 
                  'pred', y_pred.shape, y_pred[0][0], 
                  'y', y_batch.shape, y_batch[0][0])
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Validation
        if epoch % 100 != 0:
            continue
        model.eval()
        with torch.no_grad():
            y_pred = model(X_ts_train, X_static_train)
            train_rmse = np.sqrt(loss_fn(y_pred, y_train))
            y_pred = model(X_ts_test, X_static_test)
            test_rmse = np.sqrt(loss_fn(y_pred, y_test))
        print("Epoch %d: train RMSE %.4f, test RMSE %.4f" % (epoch, train_rmse, test_rmse))

# Average training loss
# train_loss /= len(train_loader)
# print(f"Training Loss: {train_loss:.4f}")

after view torch.Size([64, 1653, 10, 128])
X″ torch.Size([64, 1653, 10]) tensor([192.4249, 218.1522, 241.5886, 246.5859, 246.5859, 246.5859, 246.5859,
        246.5859, 246.5859, 249.5621]) pred torch.Size([64, 1653, 10, 1]) tensor([[-0.2708],
        [-0.3014],
        [-0.3067],
        [-0.3075],
        [-0.3073],
        [-0.3069],
        [-0.3064],
        [-0.3060],
        [-0.3055],
        [-0.3053]], grad_fn=<SelectBackward0>) y torch.Size([64, 1653, 1]) tensor([249.5621])


  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (10) must match the size of tensor b (1653) at non-singleton dimension 2

### Test

In [None]:
test_loader = data.DataLoader(data.TensorDataset(X_ts_test, X_static_test, y_test), shuffle=False, batch_size=batch_size)
# Calculate test loss
model.eval()  # Ensure the model is in evaluation mode
test_loss = 0.0

with torch.no_grad():
    for x_batch, x_static, y_batch in test_loader:
        # print(x_batch.shape, y_batch.shape)
        combined_output = model(x_batch, x_static)
        # print(combined_output.shape)
        # Forward pass
        y_pred_final = combined_output # fc_final(combined_output[:, :, -1, :]).squeeze(-1)  # (64, 1653)
        y_batch_final = y_batch[:, :, -1]

        # Compute loss
        loss = loss_fn(y_pred_final, y_batch_final)  # Target shape: (64, 1653)
        test_loss += loss.item()
        optimizer.zero_grad()

# Average test loss
test_loss /= len(test_loader)
print(f"Testing Loss: {test_loss:.4f}")


## Plot

In [None]:
timeseries = df_time_series.values.astype('float32')
train_size = X_ts_train.shape[0]
# shift train predictions for plotting
with torch.no_grad():
    train_plot = np.ones_like(timeseries) * np.nan
    y_train_pred = model(X_ts_train, df_static_torch)
    y_train_pred = y_train_pred[:, -1, :]
    #
    train_plot[0: train_size] = y_train_pred
    # shift test predictions for plotting
    test_plot = np.ones_like(timeseries) * np.nan
    y_test_pred = model(X_ts_test, df_static_torch)[:, -1, :]
    #
    test_plot[train_size+seq_length: len(timeseries)] = y_test_pred

In [None]:
timeseries_plt = np.hstack(timeseries) - 1
train_plt = np.hstack(train_plot) - 1
test_plt = np.hstack(test_plot) - 1

# Create the plot
fig = go.Figure()

# Add the timeseries line
fig.add_trace(go.Scatter(y=timeseries_plt, x=df_time_series.index.tolist(), mode='lines', name='Timeseries',
                         line=dict(color='#5c839f', width=2)))  #, line=dict(color='red'))
# Add the training plot in red
fig.add_trace(go.Scatter(y=train_plt, x=df_time_series.index.tolist(), mode='lines', name='Train Plot',
                         line=dict(color='green', width=2)))  #, line=dict(color='red')
# Add the testing plot in green
fig.add_trace(go.Scatter(y=test_plt, x=df_time_series.index.tolist(), mode='lines', name='Test Plot',
                         line=dict(color='red', width=2)))  # , line=dict(color='green')

# Add Vertical line
fig.add_vline(x=df_time_series.index.tolist()[train_size], line_color='red', line_dash='dash', line_width=1) #  line=dict(color="red", width=2, dash="dash")

# Update layout with labels
fig.update_layout(
    title='Timeseries plot with train and test',
    xaxis=dict(
        title='Date',
        tickformat='.0%',
    ),
    yaxis=dict(
        title='Investment Return',
        tickformat='.0%',
    ),
    legend=dict(title="Legend")
)

# Show plot
fig.show()

## Training

In [None]:
look_back = 4
with torch.no_grad():
    # shift train predictions for plotting
    train_plot = np.ones_like(df_time_series) * np.nan
    y_pred = model(X_ts_train)
    y_pred = y_pred[:, -1, :]
    train_plot[look_back: len(X_ts_train)] = model(X_ts_train)[:, -1, :]
    # shift test predictions for plotting
    test_plot = np.ones_like(df_time_series) * np.nan
    test_plot[len(X_ts_train) + look_back: len(df_time_series)] = model(X_ts_test)[:, -1, :]

### Evaluation

In [None]:
with torch.no_grad():
    # shift train predictions for plotting
    train_plot = np.ones_like(df_time_series) * np.nan
    y_pred = model(X_ts_train)
    y_pred = y_pred[:, -1, :]
    train_plot[look_back: len(X_ts_train)] = model(X_ts_train)[:, -1, :]
    # shift test predictions for plotting
    test_plot = np.ones_like(df_time_series) * np.nan
    test_plot[len(X_ts_train) + look_back: len(df_time_series)] = model(X_ts_test)[:, -1, :]

### Plotting

In [None]:
plt.scatter(X_ts_train, y_train, color="blue", label="Actual")
plt.scatter(X_ts_test, y_test, color="red", label="Predicted")
plt.title("DecisionTreeRegressor: Actual vs Predicted")
plt.xlabel("X")
plt.ylabel("y")
plt.legend()
plt.show()