In [1]:
import time
from datasets import load_dataset, Dataset

In [2]:
# Time the dataset load with a monotonic, high-resolution clock.
# time.time() follows the wall clock and can jump if the system clock is
# adjusted, which makes it unsuitable for measuring elapsed durations.
start_time = time.perf_counter()

dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

end_time = time.perf_counter()
print(f"Elapsed time to load dataset: {end_time - start_time:.4f} seconds")

Elapsed time to load dataset: 8.8539 seconds


In [3]:
# Display the loaded DatasetDict: its splits, their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [4]:
# Peek at the first five training examples (texts and their labels).
train_dataset[:5]

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .',
  'effective but too-tepid biopic',
  'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .',
  "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one ."],
 'label': [1, 1, 1, 1, 1]}

# Part 1. Preparing Word Embeddings   
As the first step of building your model, you need to prepare the word embeddings to form the
input layer of your model. You are required to choose only from Word2vec or Glove to initialize
your word embedding matrix. The word embedding matrix stores the pre-trained word vectors
(taken from Word2vec or Glove) where each row corresponds to a vector for a specific word in the
vocabulary formed from your task dataset.


In [5]:
import json
import os
from collections import defaultdict

import numpy as np
import regex as re

# Token reserved for unknown / out-of-vocabulary words (see Question 1(c)).
UNK_TOKEN = "<UNK>"
# Width of each word vector. GloVe vectors commonly come in 50/100/200/300
# dimensions; this must match the GloVe file that is loaded (glove.6B.100d).
EMBEDDING_DIM = 100
# Every artifact produced in Part 1 is written under this directory.
SAVE_DIR = './result/'
VOCAB_PATH = os.path.join(SAVE_DIR, 'vocab.json')
EMBEDDING_MATRIX_PATH = os.path.join(SAVE_DIR, 'embedding_matrix.npy')
WORD2IDX_PATH = os.path.join(SAVE_DIR, 'word2idx.json')
IDX2WORD_PATH = os.path.join(SAVE_DIR, 'idx2word.json')

# Create the output directory up front; exist_ok=True makes re-running the cell safe.
os.makedirs(SAVE_DIR, exist_ok=True)

## Preparing Vocab

In [6]:
# Pick one training sentence to compare the two tokenizers on.
train_string = train_dataset[4]["text"]
train_string

"emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one ."

In [7]:
# Regex tokenizer; the pattern is adapted from GPT2:
# https://github.com/huggingface/transformers/blob/4fb28703adc2b44ed66a44dd04740787010c5e11/src/transformers/models/gpt2/tokenization_gpt2.py#L167
# NOTE: \p{L} / \p{N} need the third-party `regex` module (imported as `re`).
pattern = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d|\p{L}+|\p{N}+|[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
# Deduplicate the matches so they can be compared as a set with the NLTK tokens.
tokens_re = set(pattern.findall(train_string))
tokens_re

{' ',
 "'s",
 "'t",
 ',',
 '.',
 'an',
 'and',
 'as',
 'doesn',
 'emerges',
 'feel',
 'honest',
 'issue',
 'it',
 'keenly',
 'like',
 'movie',
 'observed',
 'one',
 'rare',
 'so',
 'something',
 'that'}

In [8]:
import nltk
# word_tokenize relies on the 'punkt_tab' resource; download it if it is missing.
nltk.download('punkt_tab')
# Tokenize the same sentence with NLTK and keep only the distinct tokens.
tokens_nltk = set(nltk.word_tokenize(train_string))
tokens_nltk

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


{"'s",
 ',',
 '.',
 'an',
 'and',
 'as',
 'does',
 'emerges',
 'feel',
 'honest',
 'issue',
 'it',
 'keenly',
 'like',
 'movie',
 "n't",
 'observed',
 'one',
 'rare',
 'so',
 'something',
 'that'}

In [9]:
# Tokens produced by NLTK but not by the regex tokenizer.
tokens_nltk - tokens_re

{'does', "n't"}

In [10]:
# Tokens produced by the regex tokenizer but not by NLTK.
tokens_re - tokens_nltk

{' ', "'t", 'doesn'}

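On this sample sentence the NLTK tokenizer does the better job: it splits "doesn't" into `does` + `n't` and emits no whitespace tokens, whereas the regex pattern yields `doesn` + `'t` plus a stray `' '` token.
In light of that, we will use the NLTK tokenizer, which was introduced in the lectures,
to obtain an overall more comprehensive tokenization of the dataset.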

In [11]:
def tokenize(dataset: Dataset) -> set:
    """Build the vocabulary of a dataset with NLTK's word tokenizer.

    Every example's ``text`` field is tokenized with ``nltk.word_tokenize``.
    The distinct tokens are written to ``VOCAB_PATH`` as a JSON list and
    returned.

    :param dataset: The dataset to tokenize; each example needs a ``text`` field
    :type dataset: Dataset
    :return: The set of distinct tokens in the dataset
    :rtype: set
    """
    vocab = set()

    for example in dataset:
        vocab.update(nltk.word_tokenize(example['text']))

    print(f"Vocabulary Size: {len(vocab)}")

    # A set has no stable iteration order (string hashing is randomised per
    # process), so dump a sorted list to keep vocab.json identical across runs.
    with open(VOCAB_PATH, 'w', encoding='utf-8') as f:
        json.dump(sorted(vocab), f, ensure_ascii=False, indent=4)

    print(f"Vocabulary saved to {VOCAB_PATH}")
    return vocab

In [12]:
# Build the vocabulary from the training split only (tokenize also writes it to VOCAB_PATH).
vocab = tokenize(train_dataset)

Vocabulary Size: 18030
Vocabulary saved to ./result/vocab.json


## Initialize Word Embedding Matrix with Glove

In [13]:
def load_glove_vocab():
    """Collect the words listed in the GloVe 6B 100d dataset.

    :return: GloVe vocab
    :rtype: Set
    """
    print("Loading GloVe vocab...")
    # https://huggingface.co/datasets/SLU-CSCI4750/glove.6B.100d.txt
    glove_rows = load_dataset("SLU-CSCI4750/glove.6B.100d.txt")['train']

    # The first whitespace-separated field of each row's "text" is the word.
    words: set = {row["text"].split()[0] for row in glove_rows}
    print("GloVe vocab loaded.")
    return words

In [14]:
glove_vocab = load_glove_vocab()
print(f"Size of GloVe vocab: {len(glove_vocab)}")

# OOV words: tokens of the training vocabulary that have no GloVe entry.
oov_words = vocab.difference(glove_vocab)
print(f"Number of OOV Words: {len(oov_words)}")

Loading GloVe vocab...


Repo card metadata block was not found. Setting CardData to empty.


GloVe vocab loaded.
Size of GloVe vocab: 400000
Number of OOV Words: 1867


In [15]:
def load_glove_embeddings() -> dict:
    """Read the GloVe 6B 100d dataset into a word -> vector lookup.

    :return: GloVe embeddings
    :rtype: Dict
    """
    print("Loading GloVe embeddings...")
    glove_rows = load_dataset("SLU-CSCI4750/glove.6B.100d.txt")['train']

    # Each row's "text" is split on whitespace: field 0 is the word, the
    # remaining fields are parsed as its float32 vector components.
    split_rows = (row["text"].strip().split() for row in glove_rows)
    glove_dict = {
        fields[0]: np.array(fields[1:], dtype='float32')
        for fields in split_rows
    }

    print(f"Total GloVe words loaded: {len(glove_dict)}")
    return glove_dict

In [16]:
# Build the word <-> index mappings and the matching embedding matrix.
print("Building embedding matrix...")

glove_dict = load_glove_embeddings()

# Only words with a pretrained GloVe vector stay in the vocabulary. Filtering
# BEFORE indices are assigned keeps them contiguous (0..N-1), so every matrix
# row belongs to exactly one word. Popping OOV words after indexing would leave
# all-zero rows behind and a matrix taller than len(word2idx).
known_words = sorted(word for word in vocab if word in glove_dict)

# mapping of words to indices and vice versa
word2idx = {word: idx for idx, word in enumerate(known_words)}
idx2word = {idx: word for word, idx in word2idx.items()}

vocab_size = len(word2idx)
# float32 matches the dtype of the loaded GloVe vectors; NumPy's default
# float64 would double the memory and clash with float32 model weights later.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM), dtype=np.float32)

for word, idx in word2idx.items():
    embedding_matrix[idx] = glove_dict[word]

# Sanity check: one row per retained word.
assert embedding_matrix.shape == (len(word2idx), EMBEDDING_DIM)

print("Embedding matrix built successfully.")

np.save(EMBEDDING_MATRIX_PATH, embedding_matrix)
print(f"Embedding matrix saved as '{EMBEDDING_MATRIX_PATH}'.")

with open(WORD2IDX_PATH, 'w', encoding='utf-8') as f:
    json.dump(word2idx, f, ensure_ascii=False, indent=4)
print(f"Mapping 'word2idx' saved as '{WORD2IDX_PATH}'.")

# NOTE: JSON object keys are always strings, so the integer keys of idx2word
# come back as str when this file is loaded; convert them with int() on load.
with open(IDX2WORD_PATH, 'w', encoding='utf-8') as f:
    json.dump(idx2word, f, ensure_ascii=False, indent=4)
print(f"Mapping 'idx2word' saved as '{IDX2WORD_PATH}'.")


Building embedding matrix...
Loading GloVe embeddings...


Repo card metadata block was not found. Setting CardData to empty.


Total GloVe words loaded: 400000
Embedding matrix built successfully.
Embedding matrix saved as './result/embedding_matrix.npy'.
Mapping 'word2idx' saved as './result/word2idx.json'.
Mapping 'idx2word' saved as './result/idx2word.json'.


# Question 1. Word Embedding


(a) What is the size of the vocabulary formed from your training data?   

Answer:
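`18030` — the "Vocabulary Size" printed by `tokenize(train_dataset)` above. (This answer previously stated `16570`, which does not match the recorded output; re-run the notebook to confirm.)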

(b) We use OOV (out-of-vocabulary) to refer to those words appeared in the training data but not in the Word2vec (or Glove) dictionary. How many OOV words exist in your training data?

Answer:
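`1867` — the "Number of OOV Words" printed above. (This answer previously stated `638`, which does not match the recorded output; re-run the notebook to confirm.)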

(c) The existence of the OOV words is one of the well-known limitations of Word2vec (or Glove).
Without using any transformer-based language models (e.g., BERT, GPT, T5), what do you
think is the best strategy to mitigate such limitation? Implement your solution in your source
code. Show the corresponding code snippet. 

Answer:

(1) Use an `<UNK>` token whose embedding is randomly initialised, and map every OOV word to the `<UNK>` token.

We explore the code snippet below

```python
# Resolve the <UNK> vector BEFORE the loop. extended_vocab has no guaranteed
# iteration order, so an OOV word visited before <UNK> would otherwise copy
# whatever the <UNK> row held at that moment instead of the random vector.
unk_vector = glove_dict.get(UNK_TOKEN)
if unk_vector is None:
    # use random vector for unknown words
    unk_vector = np.random.normal(scale=0.6, size=(EMBEDDING_DIM,))

for word in extended_vocab:
    # if word is in glove vocab, use glove vector; otherwise (OOV words and
    # the <UNK> token itself) fall back to the shared <UNK> vector
    embedding_matrix[word2idx[word]] = glove_dict.get(word, unk_vector)
```

This is a useful strategy because a single `<UNK>` embedding can stand in for
every unknown word. Any word that is not in our vocabulary is represented by the
`<UNK>` token, and any vocabulary word that is missing from the pretrained
embeddings is represented by the embedding of the `<UNK>` token as well.

(2) There are many kinds of static embeddings. An extension of word2vec, fasttext (Bojanowski et al., 2017), addresses a problem with word2vec as we have presented it so far: it has no good way to deal with unknown words—words that appear in a test corpus but were unseen in the training corpus.

A related problem is word sparsity, such as in languages with rich morphology, where some of the many forms for each noun and verb may only occur rarely. Fasttext deals with these problems by using subword models, representing each word as itself plus a bag of constituent n-grams, with special boundary symbols < and > added to each word.

## Part 2. Model Training & Evaluation - RNN   
Now with the pretrained word embeddings acquired from Part 1 and the dataset acquired from
Part 0, you need to train a deep learning model for sentiment classification using the training set,
conforming to these requirements:


• Use the pretrained word embeddings from Part 1 as inputs; do not update them during training
(they are “frozen”).   

• Design a simple recurrent neural network (RNN), taking the input word embeddings, and
predicting a sentiment label for each sentence. To do that, you need to consider how to
aggregate the word representations to represent a sentence.   

• Use the validation set to gauge the performance of the model for each epoch during training.
You are required to use accuracy as the performance metric during validation and evaluation. 
   
• Use the mini-batch strategy during training. You may choose any preferred optimizer (e.g.,
SGD, Adagrad, Adam, RMSprop). Be careful when you choose your initial learning rate and
mini-batch size. (You should use the validation set to determine the optimal configuration.)
Train the model until the accuracy score on the validation set is not increasing for a few
epochs.
   
• Evaluate your trained model on the test dataset, observing the accuracy score.

In [None]:
# convert sentences into proper input for RNN -> sequence modelling 
# decide on RNN model? -> Vanilla Rnn / LSTM 
# dataloaders 
# train + validate -> decide on best hyperparams: optimizer = torch.optim.Adam
# test 

In [5]:
# Note: labels are binary (1 = positive, 0 = negative), so a single Sigmoid output is enough.
train_dataset[0]

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
 'label': 1}

In [13]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import numpy as np
# create train, val, test dataloaders 

In [12]:
# Reload the embedding matrix that Part 1 saved to EMBEDDING_MATRIX_PATH.
embedding_matrix = np.load(EMBEDDING_MATRIX_PATH)

In [None]:
# define hyperparams
# torch.nn has no `optimizer` submodule: optimizers live in torch.optim and the
# class is named `Adam`. Only the class is stored here; it presumably gets
# instantiated later with the model's parameters and a learning rate.
optimizer = torch.optim.Adam

In [None]:
class vanillaRNN(torch.nn.Module):
  """Simple RNN sentiment classifier on top of frozen pretrained embeddings.

  The sentence representation is the final hidden state of the top RNN layer,
  which is passed through a linear layer and a sigmoid.

  :param input_dim: Vocabulary size. Kept for interface compatibility; the
      embedding table takes its size from the pretrained weights
  :param embedding_dim: Width of each word vector; must match the pretrained weights
  :param hidden_dim: Size of the RNN hidden state
  :param num_layers: Number of stacked RNN layers
  :param output_dim: Number of output units (1 for a single sigmoid probability)
  :param pretrained_embeddings: Optional 2-D array/tensor of shape
      (vocab, embedding_dim). Defaults to the notebook-level ``embedding_matrix``
  :raises ValueError: If the pretrained weights are not 2-D or their width
      differs from ``embedding_dim``
  """
  def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, output_dim,
               pretrained_embeddings=None):
    super().__init__()
    if pretrained_embeddings is None:
      pretrained_embeddings = embedding_matrix
    # The NumPy matrix may be float64 while nn.RNN/nn.Linear weights are
    # float32; cast once here so the forward pass does not hit a dtype mismatch.
    weights = torch.as_tensor(pretrained_embeddings, dtype=torch.float32)
    if weights.dim() != 2 or weights.shape[1] != embedding_dim:
      raise ValueError(
        f"pretrained embeddings must have shape (vocab, {embedding_dim}), "
        f"got {tuple(weights.shape)}"
      )
    # self.embedding = nn.Embedding(input_dim, embedding_dim)
    self.embedding = nn.Embedding.from_pretrained(embeddings=weights, freeze=True)
    # nn.RNN's positional arguments are (input_size, hidden_size, num_layers):
    # the RNN consumes embedding vectors, so input_size is embedding_dim.
    self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers, batch_first=True)
    # NOTE: batch_first = True, means need to input with the size of batch as the first dim
    # NOTE: x will take the shape of (batch size, seq size, input size.)
    self.fc = nn.Linear(hidden_dim, output_dim)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    """Return sigmoid scores of shape (batch, output_dim) for token-index batch ``x``.

    ``x`` is expected to hold integer token indices of shape (batch, seq).
    NOTE(review): if batches are padded, the final hidden state also covers
    the padding steps; consider packing the sequences.
    """
    embedded = self.embedding(x)        # (batch, seq, embedding_dim)
    _, hidden = self.rnn(embedded)      # hidden: (num_layers, batch, hidden_dim)
    # hidden[-1] is the top layer's final hidden state, used as the sentence vector.
    return self.sigmoid(self.fc(hidden[-1]))

In [None]:
# we can use an LSTM
class LSTM(nn.Module):
  """Placeholder for an LSTM-based model; nothing is implemented yet."""
  pass 

In [None]:
# rotten tomatoes dataset (train) 
# parameters --> word embeddings 

In [None]:
# RNN model many (word seq) to one (sentiment)
# vanilla RNN 

# LSTM (Gated RNN)

# how to aggregate word representations to represent a sentence (concatenate / averaging)

# sentiment classification => 

### Question 2. RNN
(a) Report the final configuration of your best model, namely the number of training epochs,
learning rate, optimizer, batch size.   

(b) Report the accuracy score on the test set, as well as the accuracy score on the validation
set for each epoch during training.   

(c) RNNs produce a hidden vector for each word, instead of the entire sentence. Which methods
have you tried in deriving the final sentence representation to perform sentiment classification?
Describe all the strategies you have implemented, together with their accuracy scores on the
test set.

## Part 3. Enhancement
The RNN model used in Part 2 is a basic model to perform the task of sentiment classification. In
this section, you will design strategies to improve upon the previous model you have built. You are
required to implement the following adjustments:

1. Instead of keeping the word embeddings fixed, now update the word embeddings (the same
way as model parameters) during the training process.
2. As discussed in Question 1(c), apply your solution in mitigating the influence of OOV words
and train your model again.
3. Keeping the above two adjustments, replace your simple RNN model in Part 2 with a biLSTM model and a biGRU model, incorporating recurrent computations in both directions and
stacking multiple layers if possible.
4. Keeping the above two adjustments, replace your simple RNN model in Part 2 with a Convolutional Neural Network (CNN) to produce sentence representations and perform sentiment
classification.
5. Further improve your model. You are free to use any strategy other than the above mentioned solutions. Changing hyper-parameters or stacking more layers is not counted towards
a meaningful improvement.
