In [1]:
import regex as re
import random
import numpy as np
from tqdm import tqdm
import torch.nn as nn
from torch import optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

## Language prediction RNN

In [2]:
max_len = 25
num_pairs = 1000

In [3]:
def preprocess(sent):
    sent = re.sub(r"([,.!?«»])", r"", sent)
    sent = re.sub(r"(['])", r" ", sent)
    return sent

In [4]:
def read_pairs(path, max_len):

    text = open(path, encoding='utf-8').read()
    text = text.strip().split('\n')
    pairs = []

    for pair in text:
        sent = pair.split('\t')
        preprocessed_first_sent = preprocess(sent[0].lower()).strip()
        preprocessed_second_sent = preprocess(sent[1].lower()).strip()
        if (len(preprocessed_first_sent.split()) < max_len and len(preprocessed_second_sent.split()) < max_len):
            pairs.append((preprocessed_first_sent, preprocessed_second_sent))
          
    return pairs

In [5]:
pairs_indo = read_pairs('Data/ind.txt', max_len)
print(len(pairs_indo))

pairs_dutch = read_pairs('Data/nld.txt', max_len)
print(len(pairs_dutch))

13007
76212


In [6]:
vocab = set()

for sent_eng, sent_indo in pairs_indo:
    for word in sent_eng.split(" "):
        vocab.add(word)
    for word in sent_indo.split(" "):
        vocab.add(word)
for sent_eng, sent_dutch in pairs_dutch:
    for word in sent_dutch.split(" "):
        vocab.add(word)

In [7]:
len(vocab)

24959

In [8]:
def bag_of_words(vocab):   
    word_id, id_word = {}, {}
    
    word_id['<EOS>'] = 0
    id_word[0] = '<EOS>'   

    for i, word in enumerate(vocab):
        word_id[word] = i + 1
        id_word[i + 1] = word

    return word_id, id_word

In [9]:
word_id, id_word = bag_of_words(vocab)

In [10]:
data_eng, data_indo, data_dutch = [], [], []

In [11]:
for sent_eng, sent_indo in pairs_indo[0:num_pairs]:
    data_eng.append(([word_id[word] for word in sent_eng.split(" ")], 0))
    data_indo.append(([word_id[word] for word in sent_indo.split(" ")], 1))

for _, sent_dutch in pairs_dutch[0:num_pairs]:
    data_dutch.append(([word_id[word] for word in sent_dutch.split(" ")], 2))

In [12]:
import torch

lr = 0.001
epochs = 10
bs = 64
num_classes = 3

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda')


In [13]:
class RNN(nn.Module):
    def __init__(self, input_size, emb_size, hidden_size, num_classes):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=emb_size) 
        self.gru = nn.GRU(emb_size, hidden_size, batch_first=True) 
        self.output = nn.Linear(hidden_size, num_classes) 

    def forward(self, x):
        emb = self.embedding(x)
        _, hid = self.gru(emb)
        out = self.output(hid)
        
        return out

In [14]:
rnn = RNN(len(vocab)+1, 100, 50, num_classes).to(device)

In [15]:
print(rnn)

RNN(
  (embedding): Embedding(24960, 100)
  (gru): GRU(100, 50, batch_first=True)
  (output): Linear(in_features=50, out_features=3, bias=True)
)


In [16]:
data_all = data_eng + data_indo + data_dutch
data_len = len(data_all)
input_ids = np.zeros((data_len, max_len), dtype=np.int32)                           # make uniform inputs
target_ids = np.zeros((data_len, num_classes), dtype=np.float64)                    # make one hot

for index, (sent, label) in enumerate(data_all):
    input_ids[index, :len(sent)] = sent
    target_ids[index, label] = 1

train_data = TensorDataset(torch.LongTensor(input_ids).to(device), torch.FloatTensor(target_ids).to(device))
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

In [17]:
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=lr)

losses = []

for epoch in range(epochs):
    total_loss = 0
    for x, y in tqdm(train_dataloader):
        
        optimizer.zero_grad()

        out = rnn(x)
        
        current_loss = loss_func(out, y.reshape(out.size()))
        current_loss.backward()

        optimizer.step()

        total_loss += current_loss.item()

    losses.append(current_loss.item())
    print('epoch: {}, loss {}'.format(epoch, total_loss/len(train_dataloader)))

100%|██████████| 47/47 [00:00<00:00, 119.60it/s]


epoch: 0, loss 87.10934091121592


100%|██████████| 47/47 [00:00<00:00, 405.16it/s]


epoch: 1, loss 80.28364952574385


100%|██████████| 47/47 [00:00<00:00, 376.00it/s]


epoch: 2, loss 74.53903344336976


100%|██████████| 47/47 [00:00<00:00, 382.11it/s]


epoch: 3, loss 71.61094340872258


100%|██████████| 47/47 [00:00<00:00, 415.92it/s]


epoch: 4, loss 70.2987706610497


100%|██████████| 47/47 [00:00<00:00, 405.17it/s]


epoch: 5, loss 68.82320412169112


100%|██████████| 47/47 [00:00<00:00, 415.92it/s]


epoch: 6, loss 68.24786782771983


100%|██████████| 47/47 [00:00<00:00, 394.97it/s]


epoch: 7, loss 67.70297168163543


100%|██████████| 47/47 [00:00<00:00, 353.38it/s]


epoch: 8, loss 67.3416013514742


100%|██████████| 47/47 [00:00<00:00, 379.03it/s]

epoch: 9, loss 67.14562468833111





In [18]:
torch.save(rnn.state_dict(), "rnn.ckpt")

In [19]:
rnn = RNN(len(vocab)+1, 100, 50, 3)
rnn = rnn.to(device)
rnn.load_state_dict(torch.load("rnn.ckpt"))
rnn.eval()

RNN(
  (embedding): Embedding(24960, 100)
  (gru): GRU(100, 50, batch_first=True)
  (output): Linear(in_features=50, out_features=3, bias=True)
)

In [20]:
random.shuffle(data_all)

In [21]:
for sent, label in data_all[:10]:
    print(" ".join([id_word[word] for word in sent]))

    x = torch.tensor(sent, dtype=torch.long, device=device)
    y_pred = rnn(x)

    if y_pred.argmax().item() == 0:
        print('English\n')

    if y_pred.argmax().item() == 1:
        print('Indonesian\n')

    if y_pred.argmax().item() == 2:
        print('Dutch\n')

it s weird
Indonesian

ik heb gelijk
Dutch

i feel numb
Indonesian

vouw het
Dutch

dank u
Dutch

ontspan u
Dutch

lihat ini
English

are you sleepy
English

cium saya
Dutch

sekarang pukul 830
Indonesian



## Stock sentiment prediction RNN

In [22]:
from datasets import load_dataset

dataset = load_dataset("zeroshot/twitter-financial-news-sentiment")

In [23]:
dataset['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 9543
})

In [24]:
def preprocess_stock_dataset(data):
    texts = []
    labels = []
    for text in data:
        txt = text['text']
        txt = re.sub(r'https\S+', '', txt)                      # delete links
        txt = re.sub(r'(\$[A-Z]+)+', '', txt)                   # delete tickers
        txt = re.sub(r'\(.*\)\s', '', txt)                      # delete unnecessary symbols
        txt = re.sub(r"([-:,.!?«»])", r"", txt)                 # delete unnecessary symbols
        txt = txt.strip()
        if (len(txt.split(" ")) < max_len):
            texts.append(txt)
            labels.append(text['label'])
    return texts, labels

In [25]:
x_train, y_train = preprocess_stock_dataset(dataset['train'])
x_test, y_test = preprocess_stock_dataset(dataset['validation'])

In [26]:
len(x_train)

9526

In [27]:
vocab = set()

for sent in x_train:
    for word in sent.split(" "):
        vocab.add(word)
for sent in x_test:
    for word in sent.split(" "):
        vocab.add(word)

In [28]:
len(vocab)

25081

In [29]:
word_id, id_word = bag_of_words(vocab)

In [30]:
train_data, test_data = [], []
for x, y in zip(x_train, y_train):
    train_data.append(([word_id[word] for word in x.split(" ")], y))

for x, y in zip(x_test, y_test):
    test_data.append(([word_id[word] for word in x.split(" ")], y))

In [31]:
import torch

lr = 0.001
epochs = 15
bs = 64
num_classes = 3

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda')

In [32]:
class RNN(nn.Module):
    def __init__(self, input_size, emb_size, hidden_size, num_classes):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=emb_size) 
        self.gru = nn.GRU(emb_size, hidden_size, batch_first=True) 
        self.output = nn.Linear(hidden_size, num_classes) 

    def forward(self, x):
        emb = self.embedding(x)
        _, hidden_state = self.gru(emb)
        out = self.output(hidden_state)
        
        return out

In [33]:
rnn = RNN(len(vocab)+1, 1000, 500, num_classes).to(device)

In [34]:
print(rnn)

RNN(
  (embedding): Embedding(25082, 1000)
  (gru): GRU(1000, 500, batch_first=True)
  (output): Linear(in_features=500, out_features=3, bias=True)
)


In [35]:
input_ids = np.zeros((len(train_data), max_len), dtype=np.int32)
target_ids = np.zeros((len(train_data), num_classes), dtype=np.float64)

for index, (sent, label) in enumerate(train_data):
    input_ids[index, :len(sent)] = sent
    target_ids[index, label] = 1                                                                                    # creating one hot

data_train = TensorDataset(torch.LongTensor(input_ids).to(device), torch.FloatTensor(target_ids).to(device))
train_sampler = RandomSampler(data_train)
train_dataloader = DataLoader(data_train, sampler=train_sampler, batch_size=bs)

In [36]:
input_ids

array([[11680, 14418, 20523, ...,     0,     0,     0],
       [ 4511,  7038,  7368, ...,     0,     0,     0],
       [14271, 12662,  5691, ...,     0,     0,     0],
       ...,
       [  952, 16918,  3962, ...,     0,     0,     0],
       [18464, 13750,  1670, ...,     0,     0,     0],
       [10303, 18104, 20705, ...,     0,     0,     0]])

In [37]:
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=lr)

losses = []

for epoch in range(epochs):
    total_loss = 0
    for x, y in tqdm(train_dataloader):
        
        optimizer.zero_grad()

        out = rnn(x)
        
        current_loss = loss_func(out, y.reshape(out.size()))
        current_loss.backward()

        optimizer.step()

        total_loss += current_loss.item()

    losses.append(current_loss.item())
    print('epoch: {}, loss {}'.format(epoch, total_loss/len(train_dataloader)))

100%|██████████| 149/149 [00:01<00:00, 108.66it/s]


epoch: 0, loss 84.67715244165203


100%|██████████| 149/149 [00:01<00:00, 148.12it/s]


epoch: 1, loss 77.39226744478981


100%|██████████| 149/149 [00:00<00:00, 149.20it/s]


epoch: 2, loss 72.69254646045249


100%|██████████| 149/149 [00:00<00:00, 149.43it/s]


epoch: 3, loss 71.08163787534573


100%|██████████| 149/149 [00:00<00:00, 149.00it/s]


epoch: 4, loss 70.50410041553062


100%|██████████| 149/149 [00:01<00:00, 148.40it/s]


epoch: 5, loss 70.43340629219209


100%|██████████| 149/149 [00:01<00:00, 148.85it/s]


epoch: 6, loss 70.29190680484643


100%|██████████| 149/149 [00:01<00:00, 143.32it/s]


epoch: 7, loss 70.165168915819


100%|██████████| 149/149 [00:01<00:00, 148.53it/s]


epoch: 8, loss 70.12904007162824


100%|██████████| 149/149 [00:00<00:00, 150.48it/s]


epoch: 9, loss 70.31301262874732


100%|██████████| 149/149 [00:00<00:00, 152.59it/s]


epoch: 10, loss 70.47438978668828


100%|██████████| 149/149 [00:00<00:00, 150.96it/s]


epoch: 11, loss 70.23481141160798


100%|██████████| 149/149 [00:00<00:00, 153.15it/s]


epoch: 12, loss 70.23566569897953


100%|██████████| 149/149 [00:00<00:00, 152.43it/s]


epoch: 13, loss 70.21507273424392


100%|██████████| 149/149 [00:00<00:00, 152.64it/s]

epoch: 14, loss 70.18170757421711





In [38]:
torch.save(rnn.state_dict(), "stock_rnn.ckpt")

In [39]:
rnn = RNN(len(vocab)+1, 1000, 500, 3)
rnn = rnn.to(device)
rnn.load_state_dict(torch.load("stock_rnn.ckpt"))
rnn.eval()

RNN(
  (embedding): Embedding(25082, 1000)
  (gru): GRU(1000, 500, batch_first=True)
  (output): Linear(in_features=500, out_features=3, bias=True)
)

In [40]:
random.shuffle(test_data)

In [41]:
for sent, label in test_data[:10]:
    print(" ".join([id_word[word] for word in sent]))

    x = torch.tensor(sent, dtype=torch.long, device=device)
    y_pred = rnn(x)

    if y_pred.argmax().item() == 0:
        print('Predicted: Bearish\n')
        
    if y_pred.argmax().item() == 1:
        print('Predicted: Bullish\n')

    if y_pred.argmax().item() == 2:
        print('Predicted: Neutral\n')

    if label == 0:
        print('Real: Bearish\n')
        
    if label == 1:
        print('Real: Bullish\n')

    if label == 2:
        print('Real: Neutral\n')

CACI gains a bull on growth acceleration
Predicted: Bullish

Real: Bullish

Joe Biden says Lindsey Graham his longtime friend and Senate colleague will regret pursuing an investigation of t…
Predicted: Neutral

Real: Neutral

Mall Operator Simon Property Buying a Major Competitor
Predicted: Bearish

Real: Neutral

“Periods aren't just a women's issue it's a human issue" Meet @nadyaokamoto an activist whose fight for gender…
Predicted: Neutral

Real: Neutral

Job Growth Surges In January Beating Wall Street Expectations
Predicted: Bearish

Real: Bullish

Nickel Monthly News For The Month Of January 2020  #markets #economy #stocks
Predicted: Neutral

Real: Neutral

Venture capitalists are embracing direct listings and curbing fees for bankers
Predicted: Neutral

Real: Neutral

PhillipsVan Heusen Q3 2020 Earnings Preview
Predicted: Neutral

Real: Neutral

A South African grieving family took a man's corpse to an insurance company to prove he's dead  Here's why their…
Predicted: Neutral

R