In [2]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords
import nltk

## Preprocessing


In [3]:
# import all data
df = pd.read_csv('data/upload_DJIA_table.csv', parse_dates=['Date'], index_col='Date')
df = df[['Close']]
df = df.sort_index()

train_data = df[:'2014']
test_data = df['2015':]

def create_sequences(df, seq_length):
    xs, ys = [], []
    # Iterate over data indices
    for i in range(len(df) - seq_length):
      	# Define inputs
        x = df.iloc[i:i+seq_length, 0]
        # Define target
        y = df.iloc[i+seq_length, 0]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)


X_train, y_train = create_sequences(train_data, 60)
X_test, y_test = create_sequences(test_data, 60)

print("Train shapes: ", X_train.shape, y_train.shape)
print("Test shapes: ", X_test.shape, y_test.shape)

# convert to torch dataset
dataset_train = TensorDataset(
    torch.from_numpy(X_train).float(),
    torch.from_numpy(y_train).float()
)
dataset_test = TensorDataset(
    torch.from_numpy(X_test).float(),
    torch.from_numpy(y_test).float()
)

dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)
dataloader_test = DataLoader(dataset_test, batch_size=32, shuffle=True)


Train shapes:  (1551, 60) (1551,)
Test shapes:  (318, 60) (318,)


## Pure NLP 

Use embeddings for predicting price
Vectorize data using:
1. Word2Vec
2. Tf-Idf
3. Fasttext


In [4]:
combined_news_djia = pd.read_csv('data/Combined_News_DJIA.csv')
combined_news_djia['Top1'] = combined_news_djia['Top1'].apply(lambda x: x[2:-1] if x[0]=='b' else x)
combined_news_djia['Top2'] = combined_news_djia['Top2'].apply(lambda x: x[2:-1] if x[0]=='b' else x)
prices = pd.read_csv('data/upload_DJIA_table.csv')
prices = prices[['Date', 'Close']]
data = pd.DataFrame(columns=['Date', 'Text'])

data['Text'] = combined_news_djia['Top1'] + " " + combined_news_djia['Top2']
data['Date'] = combined_news_djia['Date']
data['Close'] = prices.sort_values(by='Date').reset_index()['Close']
data = data.set_index('Date')

nltk.download('stopwords')
tokenizer = WordPunctTokenizer()
stop_words = set(stopwords.words('english'))

def process_headline(x):
    return " ".join([w.lower() for w in tokenizer.tokenize(x) if not w.lower() in stop_words])

data['Text'] = data['Text'].apply(process_headline)
data

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/baga_nuhkadiev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0_level_0,Text,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2008-08-08,georgia ' downs two russian warplanes ' countr...,11734.320312
2008-08-11,"wont america nato help us ? wont help us , hel...",11782.349609
2008-08-12,remember adorable 9 - year - old sang opening ...,11642.469727
2008-08-13,u . . refuses israel weapons attack iran : rep...,11532.959961
2008-08-14,experts admit legalise drugs war south osetia ...,11615.929688
...,...,...
2016-06-27,barclays rbs shares suspended trading tanking ...,17140.240234
2016-06-28,"2 , 500 scientists australia : want save great...",17409.720703
2016-06-29,explosion airport istanbul yemeni former presi...,17694.679688
2016-06-30,jamaica proposes marijuana dispensers tourists...,17929.990234


Tf-Idf return vectors size 11020. That's too much

Let's try getting word2vec embeddings from fasttext

In [64]:
import fasttext.util

# Download FastText word vectors
fasttext.util.download_model('en', if_exists='ignore')  # Download English language embeddings
ft = fasttext.load_model('cc.en.300.bin')  # Load the downloaded model

def get_embeddings(data):
    combo = []
    for row in data.values:
        news_embedding = np.mean([ft.get_word_vector(word) for word in row[0].split()], axis=0)
        combo.append(news_embedding)
    return np.array(combo), data.values[:, 1]

train_data = data[:'2014']
test_data = data['2015':]
train_emb = get_embeddings(train_data)
test_emb = get_embeddings(train_data)

# prices = data.values[:, 1].reshape(-1, 1)
# d = np.hstack((combo, prices))
# d[0, -1]

X_train, y_train = get_embeddings(train_data)
X_test, y_test = get_embeddings(test_data)

# convert to torch dataset
dataset_train = TensorDataset(
    torch.from_numpy(X_train).float(),
    torch.from_numpy(y_train.astype(float)).float()
)
dataset_test = TensorDataset(
    torch.from_numpy(X_test).float(),
    torch.from_numpy(y_test.astype(float)).float()
)

dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)
dataloader_test = DataLoader(dataset_test, batch_size=32, shuffle=True)




In [66]:
predictor = nn.Sequential(
    nn.Linear(300, 150),
    nn.ReLU(),
    nn.Linear(150, 100),
    nn.ReLU(),
    nn.Linear(100, 50),
    nn.ReLU(),
    nn.Linear(50, 1)
)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(predictor.parameters(), lr=0.001)
epochs=10

for epoch in range(epochs):
    for features, labels in dataloader_train:
        outputs = predictor(features)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}, MAE: {loss.item()**0.5}")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1, Loss: 137767888.0, MAE: 11737.45662398801
Epoch 2, Loss: 151986544.0, MAE: 12328.282280999247
Epoch 3, Loss: 99173752.0, MAE: 9958.601909906833
Epoch 4, Loss: 36034648.0, MAE: 6002.886638942968
Epoch 5, Loss: 10408812.0, MAE: 3226.2690526364972
Epoch 6, Loss: 9505107.0, MAE: 3083.035354970812
Epoch 7, Loss: 12741957.0, MAE: 3569.587791328293
Epoch 8, Loss: 8894268.0, MAE: 2982.3259379216083
Epoch 9, Loss: 8255190.0, MAE: 2873.1846442580054
Epoch 10, Loss: 8483179.0, MAE: 2912.5897411067012


In [70]:
import torchmetrics


mse = torchmetrics.MeanSquaredError()
predictor.eval()
with torch.no_grad():
    for features, labels in dataloader_test:
        outputs = predictor(features).squeeze()
        mse(outputs, labels)

print("Test MSE: ", mse.compute())
print("Test MAE: ", mse.compute()**0.5)   

Test MSE:  tensor(43151148.)
Test MAE:  tensor(6568.9531)


## Modelling with BERT embeddings

Here we build multi-input model that consists of

RNN for stock prices in window of n

?

In [10]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/Users/baga_nuhkadiev/.cache/huggingface'



In [11]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    # Get the embeddings from the last hidden state
    embeddings = outputs.last_hidden_state
    # Pool the embeddings (use mean pooling for simplicity)
    pooled_embeddings = torch.mean(embeddings, dim=1)
    return pooled_embeddings.detach().numpy()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [21]:
headlines = data['Text'].values
embeddings = [get_bert_embeddings(headline) for headline in headlines]


In [26]:
bert_embeddings = np.array(embeddings)
bert_embeddings.shape

(1989, 1, 768)

In [32]:
# with open('bert_embeddings.npy', 'wb') as f:
#     np.save(f, bert_embeddings)


In [33]:
data['Close']

Date
2008-08-08    11734.320312
2008-08-11    11782.349609
2008-08-12    11642.469727
2008-08-13    11532.959961
2008-08-14    11615.929688
                  ...     
2016-06-27    17140.240234
2016-06-28    17409.720703
2016-06-29    17694.679688
2016-06-30    17929.990234
2016-07-01    17949.369141
Name: Close, Length: 1989, dtype: float64

In [None]:
# Create sequences of stock prices and BERT embeddings
def create_sequences(data, seq_length):
    sequences = []
    for i in range(len(data) - seq_length):
        seq = data[i:i + seq_length]
        sequences.append(seq)
    return np.array(sequences)


# Parameters
SEQ_LENGTH = 5

# Extract stock prices and BERT embeddings
stock_prices = df['Stock Price'].values
bert_embeddings = np.stack(df['BERT Embedding'].values)