### Prepare data for the LSTM

In [1]:
import numpy as np
import pandas as pd

In [12]:
# Read the horizontally transformed stock table and discard its 'Unnamed: 0'
# column (presumably a row index that was written into the CSV when it was saved).
STOCK_DATA_CSV = 'data/all_stock_data_transformed_horizontally.csv'
data_yf = (
    pd.read_csv(STOCK_DATA_CSV)
    .drop(columns=['Unnamed: 0'])
)

In [13]:
# Show the loaded frame; the notebook renders a truncated first/last-rows preview.
data_yf

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker,macd,macd_signal,...,WILLR_14_lag_3,WILLR_14_lag_4,WILLR_14_lag_5,OBV_lag_1,OBV_lag_2,OBV_lag_3,OBV_lag_4,OBV_lag_5,gain_loss_pct,win
0,2005-01-01,-0.497876,-0.496862,-0.497650,-0.496733,-0.450456,36.113739,AAPL,-0.805879,0.583648,...,0.800159,0.954954,0.967344,-6.249355,-6.249623,-6.250280,-6.248141,-6.246777,1.443757,1
1,2005-01-08,-0.497349,-0.496377,-0.497575,-0.496634,-0.450371,75.058095,AAPL,-2.215210,-0.005819,...,0.970425,0.800166,0.954961,-6.298288,-6.249074,-6.249342,-6.250000,-6.247860,0.054783,1
2,2005-01-15,-0.497347,-0.496663,-0.497097,-0.496604,-0.450346,23.887793,AAPL,-3.294118,-0.707636,...,1.034995,0.970432,0.800173,-6.196910,-6.298005,-6.248793,-6.249062,-6.249719,0.139038,1
3,2005-01-22,-0.497229,-0.496422,-0.496797,-0.496242,-0.450036,25.660756,AAPL,-4.101576,-1.441405,...,-1.908937,1.035001,0.970439,-6.164441,-6.196632,-6.297723,-6.248513,-6.248781,0.860787,1
4,2005-01-29,-0.496854,-0.495821,-0.496372,-0.495736,-0.449605,31.220490,AAPL,-4.687041,-2.153361,...,-1.908791,-1.908927,1.035009,-6.129584,-6.164164,-6.196353,-6.297440,-6.248232,1.184667,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393423,2023-11-25,1.300387,1.248745,1.299685,1.283046,1.344140,-0.214561,BIO,-1.762781,-1.712488,...,-0.939853,-1.357087,-1.820190,1.223875,1.223835,1.223817,1.223761,1.223713,-0.200466,-1
393424,2023-12-02,1.277312,1.280437,1.313742,1.266341,1.327306,-0.214341,BIO,-1.719633,-1.736968,...,-1.046805,-0.939844,-1.357077,1.223863,1.223840,1.223800,1.223782,1.223726,-0.125076,-1
393425,2023-12-09,1.278071,1.335034,1.255892,1.277924,1.338978,-0.208835,BIO,-1.656665,-1.743115,...,-0.776454,-1.046796,-0.939834,1.223851,1.223828,1.223805,1.223764,1.223747,0.007847,0
393426,2023-12-16,1.303308,1.336508,1.331884,1.363023,1.424733,-0.212919,BIO,-1.518174,-1.718477,...,-0.872208,-0.776445,-1.046787,1.223871,1.223816,1.223793,1.223770,1.223729,0.732549,1


In [None]:
# DataLoader is otherwise only imported in a later cell of this notebook;
# importing it here keeps this cell runnable on a fresh top-to-bottom run.
from torch.utils.data import DataLoader

# Mini-batch size shared by all three loaders.
BATCH_SIZE = 32

# NOTE(review): TextDataset is not defined or imported in the visible part of
# this notebook, and the original validation/test calls were left unfinished
# ("TextDataset(" with no arguments and no closing parenthesis).
# TODO: define/import TextDataset and pass each split's data to the calls below.
train_dataset = TextDataset()
valid_dataset = TextDataset()
test_dataset = TextDataset()

# Define a dataloader per split. Only the training loader shuffles, so
# validation/test batches come out in a stable order.
trainloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
testloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
import torch.nn.functional as F
from torch import nn
import torch 
class StockLSTM(nn.Module):
    """Embedding -> LSTM -> linear head that scores every time step.

    Args:
        vocab_size: number of rows in the embedding table and width of the
            output layer.
        embedding_dim: size of each embedding vector (the LSTM input size).
        hidden_dim: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.

    NOTE(review): ``nn.Embedding`` looks up integer indices, so ``forward``
    expects an integer index tensor rather than continuous features --
    confirm this matches the data fed to the model.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        # batch_first=True: batched inputs/outputs are laid out (batch, seq, feature).
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return one ``vocab_size``-wide score vector per input position.

        Args:
            x: integer index tensor, e.g. of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, vocab_size) for batched input.
        """
        embedded = self.embedding(x)
        # Keep the per-step hidden outputs; the final (h, c) state is not needed.
        per_step_hidden, _ = self.lstm(embedded)
        # The linear head is applied to the hidden output of every time step.
        return self.fc(per_step_hidden)

In [None]:
import gurobipy as gp
from gurobipy import GRB
import numpy as np
import pyepo
from pyepo.model.grb import optGrbModel
import torch
from torch import nn
from torch.utils.data import DataLoader

m = 50 # change based on number of assets

# Placeholder covariance: estimated from synthetic standard-normal draws, not
# from the stock data loaded earlier in this notebook.
# The generator is seeded so the same matrix (and therefore the same
# optimization model) is rebuilt after every kernel restart.
COV_SEED = 0
NUM_SYNTHETIC_OBS = 10  # rows of synthetic observations; each column is one asset
_cov_rng = np.random.default_rng(COV_SEED)
cov = np.cov(_cov_rng.standard_normal((NUM_SYNTHETIC_OBS, m)), rowvar=False) # (m, m) covariance matrix
optmodel = pyepo.model.grb.portfolioModel(m, cov) # build model

In [None]:
import time

# train model
def trainModel(reg, loss_func, method_name, num_epochs=20, lr=1e-2,
               loader_train=None, loader_test=None):
    """Train ``reg`` with Adam and log training loss and test regret per epoch.

    Args:
        reg: torch module mapping a feature batch ``x`` to predicted costs.
        loss_func: loss callable, invoked as ``loss_func(cp, c, w, z)`` when
            ``method_name`` is "spo+" and as ``loss_func(cp, c)`` when it is "mse".
        method_name: either "spo+" or "mse".
        num_epochs: number of passes over ``loader_train``.
        lr: learning rate handed to ``torch.optim.Adam``.
        loader_train: iterable yielding ``(x, c, w, z)`` batches. Defaults to
            the notebook-level ``trainloader``.
        loader_test: loader handed to ``pyepo.metric.regret``. Defaults to the
            notebook-level ``testloader``.

    Returns:
        ``(train_loss_log, loss_log_regret)``: the mean batch loss of each
        epoch, and the regret measured once before training plus once after
        every epoch (``num_epochs + 1`` entries).

    Raises:
        ValueError: if ``method_name`` is not "spo+" or "mse".
    """
    # fail fast on an unknown method instead of inside the first batch
    if method_name not in ("spo+", "mse"):
        raise ValueError("Method name {} not supported".format(method_name))
    # fall back to the loaders this notebook defines
    if loader_train is None:
        loader_train = trainloader
    if loader_test is None:
        loader_test = testloader
    # set adam optimizer
    optimizer = torch.optim.Adam(reg.parameters(), lr=lr)
    # Batches follow the model's device; moving them to CUDA unconditionally
    # breaks when CUDA is available but the model is still on the CPU.
    device = next(reg.parameters()).device
    # train mode
    reg.train()
    # init log
    train_loss_log = []
    loss_log_regret = [pyepo.metric.regret(reg, optmodel, loader_test)]
    # init elapsed time
    elapsed = 0
    for epoch in range(num_epochs):
        # re-assert train mode in case the regret evaluation changed it
        reg.train()
        # start timing
        tick = time.perf_counter()
        train_loss = 0
        num_batches = 0
        # load data
        for x, c, w, z in loader_train:
            x, c, w, z = x.to(device), c.to(device), w.to(device), z.to(device)
            # forward pass
            cp = reg(x)
            if method_name == "spo+":
                loss = loss_func(cp, c, w, z)
            else:
                loss = loss_func(cp, c)
            # backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            num_batches += 1
        # Record time once per epoch. Adding (now - epoch start) after every
        # batch counted the early batches of an epoch many times over.
        elapsed += time.perf_counter() - tick
        # mean batch loss; the guard avoids dividing by zero on an empty loader
        train_loss /= max(num_batches, 1)
        train_loss_log.append(train_loss)
        regret = pyepo.metric.regret(reg, optmodel, loader_test)
        loss_log_regret.append(regret)
        print("Epoch {:2},  Loss: {:9.4f},  Regret: {:7.4f}%".format(epoch+1, train_loss, regret*100))
    print("Total Elapsed Time: {:.2f} Sec.".format(elapsed))
    return train_loss_log, loss_log_regret

### Create a Predict-then-Optimize Model

In [None]:
# Build PyEPO's SPOPlus loss module around the notebook's optimization model `optmodel`.
# NOTE(review): processes=1 presumably means solving on a single process -- confirm against the PyEPO docs.
spop = pyepo.func.SPOPlus(optmodel, processes=1)

In [None]:
# Hyperparameters
# The placeholders below were originally written as "NAME = #comment", which is
# a SyntaxError. They are now explicit None values that must be filled in; the
# check further down reports any that are still unset.
VOCAB_SIZE = None      # TODO: fill in once data is imported
EMBEDDING_DIM = None   # TODO: param we can optimize
HIDDEN_DIM = None      # TODO: also a param to optimize
num_layers = 1         # tunable; 1 is StockLSTM's own default
epochs = 20            # tunable (replaces the duplicate, unfinished `epoch` placeholder)
learning_rate = 2e-3   # tunable (replaces the duplicate, unfinished placeholder)
method_name = "spo+"

# Stop with a readable message rather than an opaque error from inside torch.
_unset_hyperparameters = [
    name
    for name, value in (
        ("VOCAB_SIZE", VOCAB_SIZE),
        ("EMBEDDING_DIM", EMBEDDING_DIM),
        ("HIDDEN_DIM", HIDDEN_DIM),
    )
    if value is None
]
if _unset_hyperparameters:
    raise ValueError(
        "Set these hyperparameters before training: {}".format(", ".join(_unset_hyperparameters))
    )

# Instantiate the model
lstm = StockLSTM(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, num_layers=num_layers)
# Pass the epoch count and learning rate explicitly; without them trainModel
# silently uses its own defaults (20 epochs, lr=1e-2) and ignores the values above.
loss_log_lstm_spo, loss_log_regret_lstm_spo = trainModel(
    lstm,
    loss_func=spop,
    method_name=method_name,
    num_epochs=epochs,
    lr=learning_rate,
)
