## Download dataset and import libraries

In [None]:
# Download the dataset archive from Google Drive by file ID (the recorded output shows it saved as dataset.zip).
!gdown 1uYXI4O3oWBA6QC8ZJ-r6yaTTfkdAnl_Q

Downloading...
From: https://drive.google.com/uc?id=1uYXI4O3oWBA6QC8ZJ-r6yaTTfkdAnl_Q
To: /content/dataset.zip
  0% 0.00/230k [00:00<?, ?B/s]100% 230k/230k [00:00<00:00, 24.5MB/s]


In [None]:
!unzip './dataset.zip'

Archive:  /content/dataset.zip
   creating: dataset/
  inflating: dataset/all-data.csv    


In [None]:
!pip install Unidecode==0.04.1

Collecting Unidecode==0.04.1
  Downloading Unidecode-0.04.1.tar.gz (167 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/167.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.0/167.0 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: Unidecode
  Building wheel for Unidecode (setup.py) ... [?25l[?25hdone
  Created wheel for Unidecode: filename=Unidecode-0.4.1-py3-none-any.whl size=211976 sha256=32fc71a62ceee17c33f9c8fbc5c523f1b5ffcb8940ee8fd7ae906903458a4079
  Stored in directory: /root/.cache/pip/wheels/99/77/8c/1d8cef148e84ca19d365e7d1bad016fb0dc17e8eddddc53fbe
Successfully built Unidecode
Installing collected packages: Unidecode
Successfully installed Unidecode-0.4.1


In [None]:
import torch
import torch.nn as nn

# Seed PyTorch's global RNG; the same value is reused as random_state for the
# train/val/test splits later in the notebook.
seed = 1
torch.manual_seed(seed)

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from unidecode import unidecode

# Fetch the NLTK stop-word corpus read by stopwords.words('english') during preprocessing.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Read dataset

In [None]:
# Path of the extracted CSV and the column names assigned to it. Because the
# names are passed explicitly, the first line of the file is read as data.
dataset_path = './dataset/all-data.csv'
headers = ['sentiment', 'content']
df = pd.read_csv(dataset_path, names=headers, encoding='ISO-8859-1')

In [None]:
# Give every distinct sentiment label an integer id, numbered in the order the
# labels are returned by unique(), then replace the column with those ids.
classes = {}
for class_name in df['sentiment'].unique().tolist():
  classes[class_name] = len(classes)

df['sentiment'] = df['sentiment'].apply(lambda label: classes[label])

## Preprocessing dataset

In [None]:
english_stop_words = stopwords.words('english')
# Set copy of the stop-word list so each membership test is O(1) instead of a list scan.
_stop_word_set = frozenset(english_stop_words)
stemmer = PorterStemmer()

def text_normalize(text):
  """Normalize one raw sentence into a space-separated string of stemmed tokens.

  Steps, in order: lowercase, transliterate to ASCII with unidecode, strip
  surrounding whitespace, delete every character that is neither a word
  character nor whitespace, drop English stop words, Porter-stem what is left.

  Args:
    text: the raw sentence (str).

  Returns:
    The normalized sentence with tokens joined by single spaces.
  """
  text = unidecode(text.lower()).strip()
  text = re.sub(r'[^\w\s]', '', text)

  # Split on any whitespace (not only ' ') so stop words adjacent to tabs or
  # newlines are filtered too.
  # NOTE(review): punctuation is removed before this filter, so stop-word
  # entries that contain an apostrophe can never match - confirm this is intended.
  kept_words = [word for word in text.split() if word not in _stop_word_set]

  return ' '.join(stemmer.stem(word) for word in kept_words)

df['content'] = df['content'].apply(text_normalize)

## Build vocabulary

In [None]:
# Collect the distinct tokens of the corpus in first-seen order. dict.fromkeys
# de-duplicates in O(1) per token while preserving insertion order, replacing
# the quadratic `token not in list` scan.
# NOTE(review): the vocabulary is built from the whole dataframe, i.e. it also
# contains tokens that only occur in the validation/test portions.
vocab = list(dict.fromkeys(
    token
    for sentence in df['content'].tolist()
    for token in sentence.split()
))

# Special tokens go last: 'UNK' for out-of-vocabulary words, 'PAD' for padding.
vocab.append('UNK')
vocab.append('PAD')
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
vocab_size = len(vocab)

In [None]:
def transform(text, word_to_idx, max_seq_len):
  """Encode a whitespace-tokenized sentence as exactly `max_seq_len` token ids.

  Args:
    text: sentence whose tokens are separated by whitespace.
    word_to_idx: mapping from token to integer id; must contain 'UNK' if any
      token is unknown and 'PAD' if the sentence is shorter than `max_seq_len`.
    max_seq_len: length of the returned list.

  Returns:
    A list of ids: unknown tokens map to the 'UNK' id, short sentences are
    right-padded with the 'PAD' id, long sentences are truncated.

  Raises:
    KeyError: if 'UNK' or 'PAD' is needed but missing from `word_to_idx`.
  """
  # Explicit membership test instead of a bare `except:` so that unrelated
  # errors (and KeyboardInterrupt) are no longer silently swallowed.
  tokens = [
      word_to_idx[w] if w in word_to_idx else word_to_idx['UNK']
      for w in text.split()
  ]

  if len(tokens) < max_seq_len:
    tokens += [word_to_idx['PAD']] * (max_seq_len - len(tokens))
  elif len(tokens) > max_seq_len:
    tokens = tokens[:max_seq_len]

  return tokens

## Split into train, validation, and test sets

In [None]:
val_size = 0.2
test_size = 0.125
is_shuffle = True
texts = df['content'].tolist()
labels = df['sentiment'].tolist()

# Options common to both splits.
split_options = dict(shuffle=is_shuffle, random_state=seed)

# Step 1: hold out the validation set from the full data.
X_train, X_val, y_train, y_val = train_test_split(
    texts, labels, test_size=val_size, **split_options
)

# Step 2: carve the test set out of what remains; test_size is a fraction of
# that remainder (0.125 of the remaining 80% is 10% of the full data).
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=test_size, **split_options
)

## Build PyTorch datasets

In [None]:
class FinancialNews(Dataset):
  """Dataset pairing each text with its label, encoding the text on access.

  Args:
    texts: indexable collection of texts.
    labels: indexable collection of labels, aligned with `texts`.
    word_to_idx: mapping handed to `transform` as its second argument.
    max_seq_len: value handed to `transform` as its third argument.
    transform: optional callable `(text, word_to_idx, max_seq_len)`; when
      truthy, its result replaces the text before conversion to a tensor.
  """

  def __init__(self, texts, labels, word_to_idx, max_seq_len, transform=None):
    self.texts, self.labels = texts, labels
    self.word_to_idx, self.max_seq_len = word_to_idx, max_seq_len
    self.transform = transform

  def __len__(self):
    """Number of samples, taken from `texts`."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Return `(tensor, label)` for sample `idx`; the label is returned as stored."""
    sample, target = self.texts[idx], self.labels[idx]
    encoded = (
        self.transform(sample, self.word_to_idx, self.max_seq_len)
        if self.transform
        else sample
    )
    return torch.tensor(encoded), target

## Declare dataloader

In [None]:
# Sequence length every sample is padded/truncated to, and the batch sizes.
max_seq_len = 32
train_batch_size = 128
test_batch_size = 128

# One dataset per split; all three share the vocabulary, length and encoder.
train_dataset, val_dataset, test_dataset = (
    FinancialNews(
        split_texts,
        split_labels,
        word_to_idx=word_to_idx,
        max_seq_len=max_seq_len,
        transform=transform
    )
    for split_texts, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=test_batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False)

## Build model

In [None]:
class SentimentClassifier(nn.Module):
  """Embedding -> multi-layer RNN -> LayerNorm -> Dropout -> 2-layer MLP classifier.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each token embedding (RNN input size).
    hidden_size: RNN hidden size.
    n_layers: number of stacked RNN layers.
    n_classes: size of the output logits.
    dropout_prob: dropout probability applied after the layer norm.
    padding_idx: optional id of the padding token. When given, the embedding
      treats it as padding and the classifier reads the RNN output at the last
      non-padding position of each sequence (right padding is assumed) instead
      of at the final timestep. With the default `None` the final timestep is
      used.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_size, n_layers, n_classes, dropout_prob, padding_idx=None):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
    self.rnn = nn.RNN(embedding_dim, hidden_size, n_layers, batch_first=True)
    self.norm = nn.LayerNorm(hidden_size)
    self.dropout = nn.Dropout(dropout_prob)
    self.fc1 = nn.Linear(hidden_size, 16)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(16, n_classes)

  def forward(self, x):
    """Map a batch of token ids, indexed as (batch, time), to logits of size n_classes."""
    token_ids = x
    x = self.embedding(token_ids)
    x, _ = self.rnn(x)

    pad_idx = self.embedding.padding_idx
    if pad_idx is None:
      # No padding information: summarize each sequence by its final timestep.
      x = x[:, -1, :]
    else:
      # Pick the output at the last real token so trailing PAD steps do not
      # dilute the summary; clamp keeps all-padding rows at position 0.
      lengths = (token_ids != pad_idx).sum(dim=1).clamp(min=1)
      batch_index = torch.arange(x.size(0), device=x.device)
      x = x[batch_index, lengths - 1]

    x = self.norm(x)
    x = self.dropout(x)
    x = self.fc1(x)
    x = self.relu(x)
    x = self.fc2(x)

    return x

In [None]:
# Hyperparameters of the classifier; the number of classes follows the label mapping.
n_classes = len(classes)
embedding_dim = 64
hidden_size = 64
n_layers = 2
dropout_prob = 0.2

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SentimentClassifier(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    n_layers=n_layers,
    n_classes=n_classes,
    dropout_prob=dropout_prob
)
model = model.to(device)

## Setting loss and optimizer

In [None]:
# Training schedule: number of passes over the training set and Adam step size.
epochs = 50
lr = 1e-4

# Cross-entropy over the class logits, optimized with Adam over all model parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

## Training model

In [None]:
def fit(model, train_dataloader, val_dataloader, criterion, optimizer, device, epochs):
  """Train `model` for `epochs` epochs, validating after each one.

  Args:
    model: module called as `model(inputs)`.
    train_dataloader: iterable of `(inputs, labels)` tensor batches for training.
    val_dataloader: loader passed to `evaluate` after every epoch.
    criterion: loss called as `criterion(outputs, labels)`. Assumed to return
      the per-batch mean (nn.CrossEntropyLoss default) so that weighting by
      batch size gives the per-sample epoch mean.
    optimizer: optimizer stepped once per training batch.
    device: device the batches are moved to.
    epochs: number of epochs to run.

  Returns:
    Tuple `(train_losses, val_losses)`, each a list with one float per epoch.

  Raises:
    ZeroDivisionError: if `train_dataloader` yields no samples.
  """
  train_losses = []
  val_losses = []

  for epoch in range(epochs):
    # evaluate() leaves the model in eval mode, so re-enable training mode each epoch.
    model.train()
    loss_sum = 0.0
    n_samples = 0
    for inputs, labels in train_dataloader:
      inputs, labels = inputs.to(device), labels.to(device)

      optimizer.zero_grad()
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

      # Weight each batch by its size so a smaller final batch does not skew
      # the epoch average.
      batch_size = labels.size(0)
      loss_sum += loss.item() * batch_size
      n_samples += batch_size

    train_loss = loss_sum / n_samples
    train_losses.append(train_loss)

    # Only the validation loss is tracked here; the accuracy is discarded.
    val_loss, _ = evaluate(model, val_dataloader, criterion, device)
    val_losses.append(val_loss)

    print(f'EPOCH {epoch + 1}:\tTrain loss: {train_loss:.4f}\tVal loss: {val_loss:.4f}')

  return train_losses, val_losses

def evaluate(model, val_dataloader, criterion, device):
  """Compute the mean loss and the accuracy of `model` over a dataloader.

  Args:
    model: module called as `model(inputs)`; predictions are the arg-max of
      its output over dim 1.
    val_dataloader: iterable of `(inputs, labels)` tensor batches.
    criterion: loss called as `criterion(outputs, labels)`. Assumed to return
      the per-batch mean (nn.CrossEntropyLoss default) so that weighting by
      batch size gives the per-sample mean.
    device: device the batches are moved to.

  Returns:
    Tuple `(loss, acc)`: the per-sample mean loss and the fraction of samples
    whose predicted class equals the label.

  Raises:
    ZeroDivisionError: if the dataloader yields no samples.
  """
  model.eval()
  correct = 0
  total = 0
  loss_sum = 0.0

  with torch.no_grad():
    for inputs, labels in val_dataloader:
      inputs, labels = inputs.to(device), labels.to(device)
      outputs = model(inputs)

      # Weight each batch by its size so a smaller final batch does not skew
      # the average.
      batch_size = labels.size(0)
      loss_sum += criterion(outputs, labels).item() * batch_size

      predicted = outputs.argmax(dim=1)
      total += batch_size
      correct += (predicted == labels).sum().item()

  loss = loss_sum / total
  acc = correct / total

  return loss, acc

In [None]:
# Run the training loop and keep the per-epoch loss curves.
train_losses, val_losses = fit(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    epochs=epochs
)

EPOCH 1:	Train loss: 0.9385	Val loss: 0.9371
EPOCH 2:	Train loss: 0.9324	Val loss: 0.9350
EPOCH 3:	Train loss: 0.9297	Val loss: 0.9351
EPOCH 4:	Train loss: 0.9312	Val loss: 0.9349
EPOCH 5:	Train loss: 0.9284	Val loss: 0.9349
EPOCH 6:	Train loss: 0.9280	Val loss: 0.9349
EPOCH 7:	Train loss: 0.9282	Val loss: 0.9348
EPOCH 8:	Train loss: 0.9290	Val loss: 0.9348
EPOCH 9:	Train loss: 0.9296	Val loss: 0.9349
EPOCH 10:	Train loss: 0.9300	Val loss: 0.9349
EPOCH 11:	Train loss: 0.9305	Val loss: 0.9348
EPOCH 12:	Train loss: 0.9302	Val loss: 0.9349
EPOCH 13:	Train loss: 0.9325	Val loss: 0.9348
EPOCH 14:	Train loss: 0.9286	Val loss: 0.9348
EPOCH 15:	Train loss: 0.9272	Val loss: 0.9349
EPOCH 16:	Train loss: 0.9282	Val loss: 0.9348
EPOCH 17:	Train loss: 0.9251	Val loss: 0.9347
EPOCH 18:	Train loss: 0.9277	Val loss: 0.9348
EPOCH 19:	Train loss: 0.9277	Val loss: 0.9347
EPOCH 20:	Train loss: 0.9251	Val loss: 0.9346
EPOCH 21:	Train loss: 0.9288	Val loss: 0.9347
EPOCH 22:	Train loss: 0.9287	Val loss: 0.93

In [None]:
# Score the trained model on the validation and the held-out test loaders.
val_loss, val_acc = evaluate(
    model=model, val_dataloader=val_dataloader, criterion=criterion, device=device
)
test_loss, test_acc = evaluate(
    model=model, val_dataloader=test_dataloader, criterion=criterion, device=device
)

print("Evaluation on val/test dataset")
print(f'Val acc: {val_acc:.4f}\tVal loss: {val_loss:.4f}')
print(f'Test acc: {test_acc:.4f}\tTest loss: {test_loss:.4f}')

Evaluation on val/test dataset
Val acc: 0.5948	Val loss: 0.9091
Test acc: 0.6186	Test loss: 0.8765
