# Part 3. Enhancement
The RNN model used in Part 2 is a basic model to perform the task of sentiment classification. In
this section, you will design strategies to improve upon the previous model you have built. You are
required to implement the following adjustments:

1. Instead of keeping the word embeddings fixed, now update the word embeddings (the same
way as model parameters) during the training process.
2. As discussed in Question 1(c), apply your solution in mitigating the influence of OOV words
and train your model again.
3. Keeping the above two adjustments, replace your simple RNN model in Part 2 with a biLSTM model and a biGRU model, incorporating recurrent computations in both directions and
stacking multiple layers if possible.
4. Keeping the above two adjustments, replace your simple RNN model in Part 2 with a Convolutional Neural Network (CNN) to produce sentence representations and perform sentiment
classification.
5. Further improve your model. You are free to use any strategy other than the above-mentioned solutions. Changing hyper-parameters or stacking more layers does not count as
a meaningful improvement.


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from datasets import load_dataset
import numpy as np
import json
import nltk
from gensim.models import FastText
from common_utils import EmbeddingMatrix

In [3]:
# Vocabulary mapping (token -> integer id), read from the JSON file on disk.
with open('result/word2idx.json', encoding='utf-8') as f:
    word2idx = json.loads(f.read())

# Key in word2idx whose id is substituted for tokens missing from the vocabulary.
UNK_TOKEN = "<UNK>"
# Pretrained embedding weights; the CNN copies these into its embedding layer.
embedding_matrix = np.load('result/embedding_matrix.npy')

In [4]:
# Width of one embedding vector; expected to match the width of embedding_matrix.
EMBEDDING_DIM = 100
# Number of embedding rows: ids run from 0 up to the largest id in word2idx.
VOCAB_SIZE = 1 + max(word2idx.values())
# Examples per mini-batch in the DataLoaders.
BATCH_SIZE = 64

In [5]:
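# FastText alternative (disabled): rebuilds embedding_matrix from a FastText model.
# Requires a trained `fasttext_model` to be defined before uncommenting.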
# embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

# for word, idx in word2idx.items():
#     if word in fasttext_model.wv:
#         embedding_matrix[idx] = fasttext_model.wv[word]
#     else:
#         # Initialize missing embeddings with random values
#         embedding_matrix[idx] = np.random.normal(scale=0.6, size=(EMBEDDING_DIM,))

In [16]:
# Reference: https://d2l.ai/chapter_natural-language-processing-applications/sentiment-analysis-cnn.html
class CNN(nn.Module):
    """Multi-kernel 1-D CNN sentence classifier over trainable word embeddings.

    Pipeline: embedding lookup -> one ``Conv1d`` per kernel size (each acting
    as an n-gram detector) -> ReLU -> max-over-time pooling -> concatenation
    -> dropout -> softmax feature gating ("attention") -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        num_classes: Number of output logits.
        num_filters: Output channels of every convolution.
        filter_sizes: Iterable of kernel widths (n-gram lengths); must be
            non-empty.
        dropout: Drop probability applied to the pooled features.
        pretrained_embeddings: Optional array/tensor copied into the embedding
            table. When omitted, the module-level ``embedding_matrix`` is
            used, which keeps existing ``CNN(...)`` calls working.

    Raises:
        ValueError: If ``filter_sizes`` is empty.
    """

    def __init__(self, vocab_size, embedding_dim, num_classes, num_filters,
                 filter_sizes, dropout, pretrained_embeddings=None):
        super().__init__()

        filter_sizes = list(filter_sizes)
        if not filter_sizes:
            raise ValueError("filter_sizes must contain at least one kernel size")
        # Shortest sequence every convolution can consume; used by forward().
        self.max_filter_size = max(filter_sizes)

        if pretrained_embeddings is None:
            # Backward-compatible default: the matrix loaded at notebook level.
            pretrained_embeddings = embedding_matrix

        # Build the table first and overwrite it afterwards so the random
        # initialisation of the later layers consumes the RNG in the same
        # order as before.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        with torch.no_grad():
            self.embedding.weight.copy_(torch.as_tensor(pretrained_embeddings))

        # Part 3 Task 1: let back-propagation update the embeddings.
        self.embedding.weight.requires_grad = True

        # One convolution per n-gram width.
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=fs)
            for fs in filter_sizes
        ])

        # Regularisation: randomly zero pooled features during training.
        self.dropout = nn.Dropout(dropout)

        num_features = len(filter_sizes) * num_filters
        # Produces one score per pooled feature; softmax of these scores
        # re-weights the features in forward().
        self.attention_layer = nn.Linear(num_features, num_features)

        # Final projection to class logits.
        self.fc = nn.Linear(num_features, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch_size, num_classes).

        Args:
            x: Integer token ids of shape (batch_size, sequence_length).
        """
        # (batch_size, sequence_length, embedding_dim)
        x = self.embedding(x)

        # Conv1d expects channels first: (batch_size, embedding_dim, sequence_length).
        x = x.permute(0, 2, 1)

        # A sequence shorter than the widest kernel would make Conv1d raise;
        # right-pad it with zero vectors up to that width.
        shortfall = self.max_filter_size - x.size(2)
        if shortfall > 0:
            x = F.pad(x, (0, shortfall))

        # ReLU-activated feature maps, one per kernel width.
        conv_results = [F.relu(conv(x)) for conv in self.convs]

        # Max-over-time pooling: keep each filter's strongest response across
        # the whole sequence, giving (batch_size, num_filters) per kernel width.
        pool_results = [
            F.max_pool1d(conv_result, conv_result.shape[2]).squeeze(2)
            for conv_result in conv_results
        ]

        # (batch_size, len(filter_sizes) * num_filters)
        x = torch.cat(pool_results, 1)

        x = self.dropout(x)

        # Softmax across the feature dimension yields per-feature weights
        # that scale the pooled features element-wise.
        attention_weights = torch.softmax(self.attention_layer(x), dim=1)
        x = x * attention_weights

        return self.fc(x)


# --- Hyperparameters for the CNN classifier and its training run ---
NUM_CLASSES = 2                   # binary classification
NUM_FILTERS = 100                 # output channels per convolution
FILTER_SIZES = list(range(2, 7))  # kernel widths 2..6, i.e. 2-gram to 6-gram detectors
DROPOUT = 0.5
LEARNING_RATE = 0.001
EPOCHS = 5

model = CNN(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    num_classes=NUM_CLASSES,
    num_filters=NUM_FILTERS,
    filter_sizes=FILTER_SIZES,
    dropout=DROPOUT,
)


In [17]:
# These are to prepare dataset for mini-batch training

# Load the Rotten Tomatoes dataset and split each partition into parallel
# lists of raw texts and integer labels.
dataset = load_dataset("rotten_tomatoes")
train_split, test_split = dataset['train'], dataset['test']

train_texts = [row['text'] for row in train_split]
train_labels = [row['label'] for row in train_split]

test_texts = [row['text'] for row in test_split]
test_labels = [row['label'] for row in test_split]

def tokenize(texts, word2idx, max_len=512):
    """Convert raw texts into 1-D tensors of vocabulary ids.

    Each text is lower-cased, split with ``nltk.word_tokenize``, truncated to
    ``max_len`` tokens, and mapped through ``word2idx``; tokens missing from
    the vocabulary receive the id stored under ``UNK_TOKEN``.

    Args:
        texts: Iterable of strings.
        word2idx: Mapping from token to integer id; must contain ``UNK_TOKEN``.
        max_len: Maximum number of tokens kept per text (``None`` keeps all).

    Returns:
        A list with one ``torch.long`` tensor per input text.

    Raises:
        KeyError: If ``UNK_TOKEN`` is not a key of ``word2idx``.
    """
    # Look the fallback id up once instead of once per token.
    unk_id = word2idx[UNK_TOKEN]

    tokenized = []
    for text in texts:
        tokens = nltk.word_tokenize(text.lower())[:max_len]
        token_ids = [word2idx.get(word, unk_id) for word in tokens]
        # An explicit dtype keeps empty texts integer-typed; torch.tensor([])
        # would otherwise default to float and break the embedding lookup.
        tokenized.append(torch.tensor(token_ids, dtype=torch.long))
    return tokenized

# pad_sequence right-pads with its default value 0 up to the longest sequence
# of the split it is given, so train and test may end up with different widths.
# NOTE(review): confirm that id 0 is not a real word in word2idx.

# --- training split: shuffled every epoch ---
train_tokenized = pad_sequence(tokenize(train_texts, word2idx), batch_first=True)
train_labels = torch.tensor(train_labels)
train_data = data.TensorDataset(train_tokenized, train_labels)
train_loader = data.DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

# --- test split: kept in dataset order ---
test_tokenized = pad_sequence(tokenize(test_texts, word2idx), batch_first=True)
test_labels = torch.tensor(test_labels)
test_data = data.TensorDataset(test_tokenized, test_labels)
test_loader = data.DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [18]:
# Cross-entropy over the class logits produced by the model.
criterion = nn.CrossEntropyLoss()
# Idea: alternatives to try here are Adam / MiniAdam / SGD with momentum.
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
# Multiply the learning rate by 0.1 after every 3 scheduler steps
# (train() steps the scheduler once per epoch).
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=3, gamma=0.1)

def train(model, train_loader, criterion, optimizer, num_epochs, lr_scheduler=None):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    After every epoch the learning-rate scheduler (if any) is stepped and one
    line with the epoch's mean per-batch loss and training accuracy is printed.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer holding ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        lr_scheduler: Optional scheduler stepped once per epoch. When omitted,
            a module-level ``scheduler`` is used if one is defined (the
            previous implicit behaviour); otherwise no scheduling happens.

    Returns:
        None. Progress is reported via ``print``.
    """
    if lr_scheduler is None:
        # Backward compatibility: earlier callers relied on a global `scheduler`.
        lr_scheduler = globals().get("scheduler")

    for epoch in range(num_epochs):
        # Re-assert training mode each epoch so dropout is active.
        model.train()
        total_loss = 0.0
        correct_preds = 0
        total_samples = 0
        num_batches = 0

        for inputs, labels in train_loader:
            optimizer.zero_grad()
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1
            correct_preds += (outputs.argmax(dim=1) == labels).sum().item()
            total_samples += labels.size(0)

        if lr_scheduler is not None:
            lr_scheduler.step()

        # Counting batches ourselves avoids requiring len() on the loader, and
        # the guards avoid ZeroDivisionError when the loader yields nothing.
        epoch_loss = total_loss / num_batches if num_batches else float("nan")
        epoch_accuracy = correct_preds / total_samples if total_samples else float("nan")
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")


def evaluate(model, test_loader):
    """Print the classification accuracy of ``model`` over ``test_loader``.

    The model is switched to evaluation mode and gradients are disabled while
    the ``(inputs, labels)`` batches are scored. The predicted class of each
    example is the index of its largest logit. Nothing is returned; the result
    is printed as ``Test Accuracy: <value>``.
    """
    model.eval()
    hits, seen = 0, 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            predicted = model(inputs).argmax(dim=1)
            hits += int((predicted == labels).sum())
            seen += labels.shape[0]

    print(f"Test Accuracy: {hits / seen:.4f}")

# Fit the CNN on the training split, then report accuracy on the test split.
train(model, train_loader, criterion, optimizer, num_epochs=EPOCHS)
evaluate(model=model, test_loader=test_loader)


Epoch [1/5], Loss: 0.6789, Accuracy: 0.5489
Epoch [2/5], Loss: 0.5729, Accuracy: 0.6937
Epoch [3/5], Loss: 0.4539, Accuracy: 0.8023
Epoch [4/5], Loss: 0.3447, Accuracy: 0.8696
Epoch [5/5], Loss: 0.3255, Accuracy: 0.8698
Test Accuracy: 0.7636


In [15]:
from torchviz import make_dot
import torch

# Run one forward pass on a dummy batch (1 sequence of 100 random token ids)
# so torchviz can trace the autograd graph from the resulting logits.
sample_input = torch.randint(low=0, high=VOCAB_SIZE, size=(1, 100)).to('cpu')
output = model(sample_input)
dot = make_dot(output, params={name: param for name, param in model.named_parameters()})

# Render the graph as a PNG under result/.
dot.format = 'png'
dot.render('result/cnn_model_graph')


'result\\cnn_model_graph.png'

# Question 3. Enhancement
(a) Report the accuracy score on the test set when the word embeddings are updated (Part 3.1).
   
(b) Report the accuracy score on the test set when applying your method to deal with OOV words
in Part 3.2.
   
(c) Report the accuracy scores of biLSTM and biGRU on the test set (Part 3.3).
   
(d) Report the accuracy scores of CNN on the test set (Part 3.4).
   
(e) Describe your final improvement strategy in Part 3.5. Report the accuracy on the test set
using your improved model.
   
(f) Compare the results across different solutions above and describe your observations with possible discussions.
