In [43]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# import all data
df = pd.read_csv('data/upload_DJIA_table.csv', parse_dates=['Date'], index_col='Date')
df = df[['Close']]
df = df.sort_index()

train_data = df[:'2014']
test_data = df['2015':]

bert_embeddings = np.load('bert_embeddings.npy')
fingpt_sentiments = np.load('fingpt_sentiment.npy').astype('float32') 

bert_embeddings_train = bert_embeddings[:train_data.shape[0]].reshape(-1, 768)
bert_embeddings_test = bert_embeddings[train_data.shape[0]:].reshape(-1, 768)


def create_sequences(df, seq_length, bert_emb):
    xemb, x_pr, x_sent, ys = [], [], [], []
    # Iterate over data indices
    for i in range(len(df) - seq_length):
      	# Define inputs
        xemb.append(bert_emb[i:i+seq_length])
        x_pr.append(df.iloc[i:i+seq_length, 0].values.reshape(-1, 1))
        x_sent.append(fingpt_sentiments[i:i+seq_length])
        
        # xemb = np.concatenate((xemb, fingpt_sentiments[i:i+seq_length].reshape(-1, 1)), axis=1)
        # Define target
        y = df.iloc[i+seq_length, 0]
        # xs.append(x)
        ys.append(y)
    return np.array(xemb), np.array(x_pr).squeeze(), np.array(x_sent), np.array(ys)
    

X_train_emb, X_train_pr, X_train_sent, y_train = create_sequences(train_data, 60, bert_embeddings_train)
X_test_emb, X_test_pr, X_test_sent, y_test = create_sequences(test_data, 60, bert_embeddings_test)

In [44]:
X_train_sent.shape, X_test_sent.shape

((1551, 60), (318, 60))

In [45]:
print("Train shapes: ", X_train_emb.shape, X_train_pr.shape, X_train_sent.shape, y_train.shape)
print("Test shapes: ", X_test_emb.shape, X_test_pr.shape, X_test_sent.shape, y_test.shape)

# convert to torch dataset
dataset_train = TensorDataset(
    torch.from_numpy(X_train_emb).float(),
    torch.from_numpy(X_train_pr).float(),
    torch.from_numpy(X_train_sent).float(),
    torch.from_numpy(y_train).float()
)
dataset_test = TensorDataset(
    torch.from_numpy(X_test_emb).float(),
    torch.from_numpy(X_test_pr).float(),
    torch.from_numpy(X_test_sent).float(),
    torch.from_numpy(y_test).float()
)

dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)
dataloader_test = DataLoader(dataset_test, batch_size=32, shuffle=True)

Train shapes:  (1551, 60, 768) (1551, 60) (1551, 60) (1551,)
Test shapes:  (318, 60, 768) (318, 60) (318, 60) (318,)


In [50]:
import torch.nn as nn

class StockPredictor(nn.Module):
    def __init__(self, embedding_dim, price_dim, sent_dim, hidden_dim, num_layers):
        super(StockPredictor, self).__init__()
        self.lstm_bert = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.lstm_price = nn.LSTM(price_dim, hidden_dim, num_layers, batch_first=True)
        self.lstm_sent = nn.LSTM(sent_dim, hidden_dim, num_layers, batch_first=True)
        # self.fc = nn.Linear(hidden_dim * 2, 1)
        self.head = nn.Sequential(
            nn.Linear(hidden_dim*3, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
    
    def forward(self, bert_x, price_x, sent_x):
        lstm_out_bert, _ = self.lstm_bert(bert_x)
        lstm_out_price, _ = self.lstm_price(price_x)
        lstm_out_sent, _ = self.lstm_sent(sent_x)
        # Only consider the output of the last LSTM cell
        last_out_bert = lstm_out_bert[:, -1, :]
        last_out_price = lstm_out_price[:, -1, :]
        last_out_sent = lstm_out_sent[:, -1, :]
        
        # Concatenate the outputs of both LSTMs
        combined_out = torch.cat((last_out_bert, last_out_price, last_out_sent), dim=1)
        output = self.head(combined_out)
        return output

# Model parameters
embedding_dim = 768  # Size of BERT embeddings
price_dim = 1        # Each stock price is a single number
sent_dim = 1        # Each sentiment is a single number

hidden_dim = 128
num_layers = 2

model = StockPredictor(embedding_dim, price_dim, sent_dim, hidden_dim, num_layers)


In [51]:
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim


# Training parameters
num_epochs = 10
learning_rate = 0.001

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
model.train()
for epoch in range(num_epochs):
    for batch_bert, batch_price, batch_sent, batch_y in dataloader_train:
        optimizer.zero_grad()
        outputs = model(batch_bert, batch_price.unsqueeze(-1), batch_sent.unsqueeze(-1))
        loss = criterion(outputs.squeeze(), batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}, MAE: {loss.item()**0.5}")


Epoch 1, Loss: 144917312.0, MAE: 12038.16065684455
Epoch 2, Loss: 162123424.0, MAE: 12732.76969084103
Epoch 3, Loss: 116569328.0, MAE: 10796.727652395424
Epoch 4, Loss: 33920120.0, MAE: 5824.0982134576
Epoch 5, Loss: 13177386.0, MAE: 3630.0669415315197
Epoch 6, Loss: 7766516.5, MAE: 2786.8470535714728
Epoch 7, Loss: 8222755.5, MAE: 2867.534742596853
Epoch 8, Loss: 7412333.5, MAE: 2722.5601003467305
Epoch 9, Loss: 4933209.0, MAE: 2221.082844020006
Epoch 10, Loss: 4819522.0, MAE: 2195.3409757939653


In [53]:
import torchmetrics


mse = torchmetrics.MeanSquaredError()
model.eval()
with torch.no_grad():
    for batch_bert, batch_price, batch_sent, batch_y in dataloader_test:
        outputs = model(batch_bert, batch_price.unsqueeze(-1), batch_sent.unsqueeze(-1))
        mse(outputs.squeeze(), batch_y)

print("Test MSE: ", mse.compute())
print("Test MAE: ", mse.compute()**0.5)   

Test MSE:  tensor(23709278.)
Test MAE:  tensor(4869.2173)
