In [1]:
import pandas as pd
import numpy as np

In [2]:
DATA_ROOT = f"./datasets/all-data.csv"

In [3]:
df = pd.read_csv(DATA_ROOT, names=['sentiment', 'content'], encoding ='ISO-8859-1')

df.head()

Unnamed: 0,sentiment,content
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [4]:
df.count()

sentiment    4846
content      4846
dtype: int64

# Sentiment Process

In [5]:
classes = {
    class_name: idx for idx, class_name in enumerate(df['sentiment'].unique().tolist())
}

sentiments = df['sentiment'].apply(lambda x: classes[x])
sentiments[:5]

0    0
1    0
2    1
3    2
4    2
Name: sentiment, dtype: int64

# Content Process

In [6]:
import re
import unidecode
import nltk
from nltk.corpus  import stopwords
from nltk.stem.porter import PorterStemmer

In [7]:
nltk.download ('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nguye_zvo4miw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
english_stop_words = stopwords.words('english')
stemmer = PorterStemmer()

# Lowering, Punctuation removal, stopwords removal, stemming.
def text_normalize(text: str):
    text = text.lower() # Lowring
    text = unidecode.unidecode(text) #  
    text = text.strip() # 
    text = re.sub(r'[^\w\s]', '', text) # 
    text = ' '.join([word for word in text.split() if word not in english_stop_words]) # stopwords removal
    text = ' '.join([stemmer.stem(word) for word in text.split()]) # stemming

    return text

texts_normalize = df['content'].apply(lambda x: text_normalize(x))
texts_normalize[:5]

0    accord gran compani plan move product russia a...
1    technopoli plan develop stage area less 100000...
2    intern electron industri compani elcoteq laid ...
3    new product plant compani would increas capac ...
4    accord compani updat strategi year 20092012 ba...
Name: content, dtype: object

In [9]:
df['content'] = texts_normalize

In [10]:
df.head()

Unnamed: 0,sentiment,content
0,neutral,accord gran compani plan move product russia a...
1,neutral,technopoli plan develop stage area less 100000...
2,negative,intern electron industri compani elcoteq laid ...
3,positive,new product plant compani would increas capac ...
4,positive,accord compani updat strategi year 20092012 ba...


# Build vocab package

In [11]:
vocab = []
for sentence in df['content'].tolist():
    tokens = sentence.split()

    for token in tokens:
        if token not in vocab:
            vocab.append(token)

vocab.append('UNK') # UKN for unknown vocab
vocab.append('PAD') # PAD for space
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
vocab_size = len(vocab)

In [12]:
word_to_idx

{'accord': 0,
 'gran': 1,
 'compani': 2,
 'plan': 3,
 'move': 4,
 'product': 5,
 'russia': 6,
 'although': 7,
 'grow': 8,
 'technopoli': 9,
 'develop': 10,
 'stage': 11,
 'area': 12,
 'less': 13,
 '100000': 14,
 'squar': 15,
 'meter': 16,
 'order': 17,
 'host': 18,
 'work': 19,
 'comput': 20,
 'technolog': 21,
 'telecommun': 22,
 'statement': 23,
 'said': 24,
 'intern': 25,
 'electron': 26,
 'industri': 27,
 'elcoteq': 28,
 'laid': 29,
 'ten': 30,
 'employe': 31,
 'tallinn': 32,
 'facil': 33,
 'contrari': 34,
 'earlier': 35,
 'layoff': 36,
 'contract': 37,
 'rank': 38,
 'offic': 39,
 'worker': 40,
 'daili': 41,
 'postime': 42,
 'report': 43,
 'new': 44,
 'plant': 45,
 'would': 46,
 'increas': 47,
 'capac': 48,
 'meet': 49,
 'expect': 50,
 'demand': 51,
 'improv': 52,
 'use': 53,
 'raw': 54,
 'materi': 55,
 'therefor': 56,
 'profit': 57,
 'updat': 58,
 'strategi': 59,
 'year': 60,
 '20092012': 61,
 'baswar': 62,
 'target': 63,
 'longterm': 64,
 'net': 65,
 'sale': 66,
 'growth': 67,
 'r

Transform words in vocab (str) to numbers array

In [13]:
def transform(text: str, word_to_idx, max_seq_len):
    '''
    text: string,
    word_to_idx: vocabulary dictation,
    max_seq_len: a constant number,
    '''

    
    tokens = []
    for w in text.split():
        try:
            w_idx = word_to_idx[w]
        except:
            w_idx = word_to_idx['UNK']
        tokens.append(w_idx)

    if len(tokens) < max_seq_len:
        tokens += [word_to_idx['PAD']] * (max_seq_len - len(tokens))

    elif len(tokens) > max_seq_len:
        tokens = tokens[:max_seq_len]

    return tokens

In [14]:
text_tokens = df['content'].apply(lambda x: transform(x, word_to_idx=word_to_idx, max_seq_len=32))

text_tokens[:5]

0    [0, 1, 2, 3, 4, 5, 6, 7, 2, 8, 8907, 8907, 890...
1    [9, 3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 2, ...
2    [25, 26, 27, 2, 28, 29, 30, 31, 32, 33, 34, 35...
3    [44, 5, 45, 2, 46, 47, 48, 49, 50, 47, 51, 46,...
4    [0, 2, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,...
Name: content, dtype: object

# Recurrent Neural Network (RNN)
<!-- <img src="imgs/RNN-unrolled.png" width="500"/> -->
![RNN](imgs/RNN-unrolled.png)
RNN usually apply for sequential data:

- Sequential data: time series, natural language, speech or stock prices.
- Unlike traditional feedforward neural networks, which treat inputs independently, RNNs are designed to "remember" previous inputs by maintaining a hidden state that acts as a memory. This makes them ideal for tasks where past information influences future predictions, such as predicting the next word in a sentence or the next value in a time series.
## Challenges:
- Vanishing/Exploding Gradients
- Sequential Processing

In [15]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
val_size = 0.2
test_size = 0.125
is_shuffle = True
texts = text_tokens.tolist()#df['content'].tolist()
labels = sentiments.tolist()#df['sentiment'].tolist()

X_train, X_val, y_train, y_val = train_test_split(
    texts, labels,
    test_size=val_size,
    random_state=42,
    shuffle=is_shuffle
)

X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size=test_size,
    random_state=42,
    shuffle=is_shuffle    
)

In [17]:
len(X_train), len(X_test), len(X_val)

(3391, 485, 970)

In [18]:
class FinancialNews(Dataset):
    def __init__(
            self,
            X, y,
            # word_to_idx,
            # max_seq_len,
            # transform=None
    ):
        self.texts = X
        self.labels = y
        # self.word_to_idx = word_to_idx
        # self.max_seq_len = max_seq_len
        # self.transform = transform

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        text = torch.tensor(text)
        # label = torch.tensor(label)

        return text, label

In [19]:
train_dataset = FinancialNews(
    X_train,
    y_train
)

val_dataset = FinancialNews(
    X_val,
    y_val
)

test_dataset = FinancialNews(
    X_test,
    y_test
)

In [20]:
train_batch_size = 128
test_batch_size = 8

train_loader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True
)

val_loader = DataLoader(
    train_dataset,
    batch_size=test_batch_size,
    shuffle=False
)

test_loader = DataLoader(
    train_dataset,
    batch_size=test_batch_size,
    shuffle=False
)

In [21]:
class SentimentClassifer(nn.Module):
    def __init__(
            self,
            vocab_size,
            embedding_dim,
            hidden_size, n_layers, n_classes,
            dropout_prob
    ):
        super(SentimentClassifer, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, n_layers, batch_first=True)
        self.norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc1 = nn.Linear(hidden_size, 16)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(16, n_classes)

    def forward(self, x):
        x = self.embedding(x)
        x, hn = self.rnn(x)
        x = x[:, -1, :]
        x = self.norm(x)
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)

        return x

In [22]:
n_classes = 3
embedding_dim = 64
hidden_size = 64
n_layers = 2
dropout_prob = 0.2

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SentimentClassifer(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    n_classes=n_classes,
    n_layers=n_layers,
    dropout_prob=dropout_prob
).to(device)

In [23]:
lr = 1e-4
epochs = 50

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=lr,
)

In [24]:
def evaluate(model, dataloader, criterion, device):
    model.eval()
    correct = 0
    total = 0
    losses = []

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            losses.append(loss.item())

            _,predicted = torch.max(outputs.data, 1)
            total += labels.size(0)

            correct += (predicted == labels).sum().item()

    loss = sum(losses) / len(losses)
    acc = correct/total

    return loss, acc

In [25]:
def fit(
        model,
        train_loader,
        val_loader,
        criterion,
        optimizer,
        device,
        epochs
):
    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        batch_train_losses = []
        model.train()

        for idx, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            loss = criterion(outputs, labels)
            loss.backward()

            optimizer.step()
            batch_train_losses.append(loss.item())

        train_loss = sum(batch_train_losses) / len(batch_train_losses)
        train_losses.append(train_loss)

        val_loss, val_acc = evaluate(
            model,
            val_loader,
            criterion,
            device
        )

        val_losses.append(val_loss)

        print (f'EPOCH { epoch + 1}:\tTrain loss : { train_loss:.4f}\tVal loss : {val_loss:.4f}')

    return train_losses, val_losses

In [26]:
train_losses, val_losses = fit(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    device,
    epochs
)

EPOCH 1:	Train loss : 0.9400	Val loss : 0.9271
EPOCH 2:	Train loss : 0.9292	Val loss : 0.9270
EPOCH 3:	Train loss : 0.9295	Val loss : 0.9269
EPOCH 4:	Train loss : 0.9311	Val loss : 0.9269
EPOCH 5:	Train loss : 0.9290	Val loss : 0.9266
EPOCH 6:	Train loss : 0.9280	Val loss : 0.9265
EPOCH 7:	Train loss : 0.9284	Val loss : 0.9265
EPOCH 8:	Train loss : 0.9295	Val loss : 0.9264
EPOCH 9:	Train loss : 0.9285	Val loss : 0.9263
EPOCH 10:	Train loss : 0.9276	Val loss : 0.9261
EPOCH 11:	Train loss : 0.9283	Val loss : 0.9260
EPOCH 12:	Train loss : 0.9278	Val loss : 0.9259
EPOCH 13:	Train loss : 0.9272	Val loss : 0.9259
EPOCH 14:	Train loss : 0.9277	Val loss : 0.9257
EPOCH 15:	Train loss : 0.9249	Val loss : 0.9256
EPOCH 16:	Train loss : 0.9264	Val loss : 0.9256
EPOCH 17:	Train loss : 0.9255	Val loss : 0.9253
EPOCH 18:	Train loss : 0.9272	Val loss : 0.9252
EPOCH 19:	Train loss : 0.9243	Val loss : 0.9251
EPOCH 20:	Train loss : 0.9277	Val loss : 0.9250
EPOCH 21:	Train loss : 0.9274	Val loss : 0.9248
E

In [27]:
val_loss, val_acc = evaluate(
    model,
    val_loader,
    criterion,
    device
)

test_loss, test_acc = evaluate(
    model,
    test_loader,
    criterion,
    device
)


print('Evaluation on val/test dataset')
print('Val accuracy: ', val_acc)
print('Test accuracy: ', test_acc)

Evaluation on val/test dataset
Val accuracy:  0.6054261279858448
Test accuracy:  0.6054261279858448
