In [11]:
#!pip install datasets
#!pip install gensim


## Part 0 Dataset Preparation

In [12]:
from datasets import load_dataset

dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']


## Part 1 Preparing Word Embeddings

In [3]:
import gensim.downloader as api

# List all available pre-trained models
available_models = api.info()['models'].keys()
print(available_models)


dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])


### Loading 'word2vec-google-news-300' model

In [15]:
# Load Google's pre-trained Word2Vec model (300-dimensional vectors)
word2vec_model = api.load('word2vec-google-news-300')


In [90]:
# Check the length of the dataset
print(len(train_dataset))

# View the first sample in the dataset
first_sample = train_dataset[0]
for i, key in enumerate(first_sample):
    print(f"Element {i}: {key}")

for key, value in first_sample.items():
    print(f"'{key}' : {value}")


8530
Element 0: text
Element 1: label
'text' : the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
'label' : 1


### (a) What is the size of the vocabulary formed from your training data?

In [91]:
from collections import Counter

vocab_counter = Counter()

for sentence in train_dataset['text']:  
    vocab_counter.update(sentence.split())  # Split sentences into words

# Extract vocabulary
vocab = list(vocab_counter.keys())
#print(vocab)
print("Size of training data vocabulary: " + str(len(vocab)))

Size of training data vocabulary: 18951


In [92]:
from collections import Counter
import re

vocab_counter = Counter()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = text.split()  
    return tokens

for sentence in train_dataset['text']:  
    tokens = preprocess_text(sentence)
    vocab_counter.update(tokens)  

# Extract vocabulary
vocab = list(vocab_counter.keys())
#print(vocab)
print("Size of training data vocabulary: " + str(len(vocab)))

Size of training data vocabulary: 18223


### (b) How many OOV words exist in your training data?

In [93]:
oov_words = []

for word in vocab:
    if word not in word2vec_model:
        oov_words.append(word)  # Use pre-trained Word2Vec vector
        
print("Number of OOV words in training data: " + str(len(oov_words)))
print("Sample of 10 OOV words: " + str(oov_words[:10]))

Number of OOV words in training data: 3334
Sample of 10 OOV words: ['to', '21st', 'centurys', 'and', 'a', 'jeanclaud', 'damme', 'segal', 'of', 'cowriterdirector']


### (c) Strategy to mitigate limitation of OOV words

Each OOV word can be represented as a bag of character N-grams. Embeddings are then generated based on these N-grams. By representing words as combinations of n-grams, FastText can generalize better across similar words. For example, if the model has seen "apple," it can infer meaningful representations for related words like "apples," "applet," and even misspellings or variations.FastText’s n-gram approach can capture semantic similarities between words that share similar character patterns. For instance, "cat" and "cats" will share common n-grams, leading to embeddings that are close to each other in the vector space. For instance:
The word "unhappiness" can be broken down into n-grams like "un," "happi," "ness," etc.
This enables FastText to understand that "unhappy" and "happiness" share a common root, even if those specific words were not seen during training.


remove some and REPHRASE EVERYTHING


In [21]:
from gensim.models import FastText

# Load the pre-trained FastText model (English) from Gensim's API
fasttext_model = api.load('fasttext-wiki-news-subwords-300')

In [94]:
import numpy as np
import re
from collections import Counter

# Load your models and define vocab

vocab = ['<UNK>'] + vocab  # Adding a padding token and a unk token

embedding_dim = 300
embedding_matrix = np.zeros((len(vocab), embedding_dim))  # Initialize embedding matrix

# Create a clean_word-to-index dictionary for your vocabulary
word_to_idx = { word : idx for idx, word in enumerate(vocab)}

oov_random_embeds = []

# Fill the embedding matrix
for word, idx in word_to_idx.items():
    if word in word2vec_model:
        embedding_matrix[idx] = word2vec_model[word]  # Use Word2Vec vector
    elif word in fasttext_model:
        embedding_matrix[idx] = fasttext_model[word]  # Use FastText vector
    else:
        embedding_matrix[idx] = np.random.normal(size=(embedding_dim,))  # Random vector for OOV words
        oov_random_embeds.append(word)

        
# Check for OOV words
print("Number of OOV words in training data: " + str(len(oov_random_embeds)))
print("Some OOV words:", oov_random_embeds[:10])  # Print the first 10 OOV words

Number of OOV words in training data: 2678
Some OOV words: ['<UNK>', 'jeanclaud', 'damme', 'cowriterdirector', 'tolkiens', 'middleearth', 'tootepid', 'wisegirls', 'familyoriented', 'fantasyadventure']


## Part 2 Model Training & Evaluation - RNN

In [14]:
#print("Shape of word2vec vectors : " + str(word2vec_model['great'].shape))

Shape of word2vec vectors : (300,)


#### Helper Functions

In [113]:
def encode(sentence):
    encoded_list = []
    tokens = preprocess_text(sentence)
    for token in tokens: 
        if token in word_to_idx:
            encoded_list.append(word_to_idx[token])
        else:
            encoded_list.append(1)# index of <UNK>

    # padding : .append(0)
    return encoded_list
    
# DataLoader
def get_batch(split):
    # Generate a small batch of data of inputs x and targets y
    data = train_dataset if split == 'train' else validation_dataset
    ix = torch.randint(len(data), (batch_size,))
    
    x_numerical = []
    for i in ix:
        sentence = data[i.item()]['text']
        indices = encode(sentence)  
        x_numerical.append(torch.tensor(indices))  
    
    y = torch.tensor([data[i.item()]['label'] for i in ix], dtype=torch.long) 
    x_numerical = [x.to(device) for x in x_numerical] 
    y = y.to(device)
    
    return x_numerical, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        accuracies = torch.zeros(eval_iters)
        for k in range(eval_iters):
            sequences, labels = get_batch(split)
            padded_sequences = pad_sequence(sequences, batch_first=True)  
            lengths = torch.tensor([seq.size(0) for seq in sequences])
            
            # Get logits and loss from the model
            logits, loss, _ = model(padded_sequences, lengths, labels)
            losses[k] = loss.item()
            
            # Calculate accuracy
            predictions = torch.argmax(logits, dim=1)  # Get predicted class indices
            correct = (predictions == labels).float()  # Compare predictions with labels
            accuracies[k] = correct.sum() / labels.size(0)  # Calculate accuracy for the batch

        out[split] = {
            'loss': losses.mean(),
            'accuracy': accuracies.mean()  # Mean accuracy across iterations
        }
    
    model.train()  # Switch back to training mode
    return out


### Using Custom RNN : SLOW AF AND LOSS DOES NOT CONVERGE!!!!!!!!!!!

In [109]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleRNNCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(SimpleRNNCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Weight matrices
        self.Wx = nn.Parameter(torch.Tensor(input_size, hidden_size))  # Input to hidden
        self.Wh = nn.Parameter(torch.Tensor(hidden_size, hidden_size))  # Hidden to hidden
        self.b = nn.Parameter(torch.Tensor(hidden_size))  # Bias

        # LayerNorm for hidden state
        self.layer_norm = nn.LayerNorm(hidden_size)

        # Initialize weights
        self.reset_parameters()

    def reset_parameters(self):
        for param in self.parameters():
            nn.init.kaiming_uniform_(self.Wx)
            nn.init.kaiming_uniform_(self.Wh)
            nn.init.zeros_(self.b)
        # LayerNorm has two parameters: weight (gamma) and bias (beta)
        self.layer_norm.weight.data.fill_(1)  # Initialize layer norm weight to 1
        self.layer_norm.bias.data.fill_(0)    # Initialize layer norm bias to 0

    def forward(self, x, h):
        # x: input at the current time step (batch_size, input_size)
        # h: hidden state from the previous time step (batch_size, hidden_size)
        h_next = torch.tanh(torch.mm(x.view(1, -1), self.Wx) + torch.mm(h.view(1, -1), self.Wh) + self.b)
        h_next = self.layer_norm(h_next)
        return h_next.view(-1)  # Return to shape (hidden_size)


In [112]:
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn_cell = SimpleRNNCell(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        self.layer_norm = nn.LayerNorm(hidden_size)
        
        pre_trained_embeddings = torch.tensor(embedding_matrix)
        self.token_embedding_table = nn.Embedding(vocab_size, feature_size)
        self.token_embedding_table.weight.data.copy_(pre_trained_embeddings)
        self.token_embedding_table.weight.requires_grad = False # Cuz we dont want to update word embeds
        
        

    def forward(self, x, lengths, labels=None):
        batch_size = x.size(0)
        max_length = x.size(1)

        x = self.token_embedding_table(x)

        h = torch.zeros(batch_size, self.hidden_size).to(x.device) # Starting Fresh for Every Batch of Sequences

        for t in range(max_length):
            for i in range(batch_size):
                if t < lengths[i]:
                    h_next = self.rnn_cell(x[i, t, :], h[i])
                    h[i] = h_next.detach() # Detach to prevent in-place modifications
                    
        final_h = h
        final_h = self.layer_norm(final_h)
        logits = self.fc(final_h)

        if labels is None:
            loss = None
        else:
            loss = F.cross_entropy(logits, labels)
            predictions = torch.argmax(logits, dim=1) # Get predicted class by taking argmax over logits
            correct = (predictions == labels).sum().item()  
            accuracy = correct / batch_size  
            
        return logits, loss, accuracy


### Using Torch's RNN

In [121]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TorchRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(TorchRNN, self).__init__()
        self.hidden_size = hidden_size
        
        # Embedding layer
        self.token_embedding_table = nn.Embedding(vocab_size, feature_size)
        pre_trained_embeddings = torch.tensor(embedding_matrix)
        self.token_embedding_table.weight.data.copy_(pre_trained_embeddings)
        self.token_embedding_table.weight.requires_grad = False  # Freeze embeddings
        
        # RNN layer
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.layer_norm = nn.LayerNorm(hidden_size)

    def forward(self, x, lengths, labels=None):
        batch_size = x.size(0)
        max_length = x.size(1)

        # Retrieve embeddings
        x = self.token_embedding_table(x)

        # Initialize hidden state
        h_0 = torch.zeros(1, batch_size, self.hidden_size).to(x.device)  # Initial hidden state

        # Pack the padded sequence
        packed_x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)

        # Forward pass through RNN
        packed_output, h_n = self.rnn(packed_x, h_0)

        # Unpack the output and get the last hidden state
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        final_h = h_n[-1]  # Last hidden state of the last layer

        # Layer normalization
        final_h = self.layer_norm(final_h)

        # Fully connected layer
        logits = self.fc(final_h)

        if labels is None:
            loss = None
            accuracy = None
        else:
            # Calculate loss and accuracy
            loss = F.cross_entropy(logits, labels)
            predictions = torch.argmax(logits, dim=1)  # Get predicted class by taking argmax over logits
            correct = (predictions == labels).sum().item()  
            accuracy = correct / batch_size  

        return logits, loss, accuracy



### Training & Evaluation

In [126]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

# Data characteristics
feature_size = 300
num_classes = 2
vocab_size = len(vocab)

# Hyperparameters
input_size = feature_size
hidden_size = feature_size * 2  # Size of the hidden layer
output_size = num_classes  # Number of output classes
batch_size = 32
learning_rate = 0.00001
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
epochs = 25
max_iters = round(len(train_dataset)/batch_size * epochs)
eval_iters = 50 # Number of mini-batches we want to evaluate
eval_interval = 500 # How many iterations before we evaluate

model = TorchRNN(input_size, hidden_size, output_size)
model.to(device)

# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

torch.autograd.set_detect_anomaly(True)
for iter in range(max_iters):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']['loss']:.4f}, train accuracy {losses['train']['accuracy']:.4f}, "
              f"val loss {losses['val']['loss']:.4f}, val accuracy {losses['val']['accuracy']:.4f}")
    # sample a batch of data
    sequences, labels = get_batch('train')
    padded_sequences = pad_sequence(sequences, batch_first=True)  # shape: (batch_size, max_length, features)
    lengths = torch.tensor([seq.size(0) for seq in sequences])  # lengths of each original sequence

    # evaluate the loss
    with torch.autocast(device_type=device,dtype=torch.bfloat16):
        logits, loss, accuracy = model(padded_sequences, lengths, labels)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

Using device: cuda
6.009002 M parameters
step 0: train loss 0.7550, train accuracy 0.4875, val loss 0.7692, val accuracy 0.4769
step 500: train loss 0.5745, train accuracy 0.7050, val loss 0.6433, val accuracy 0.6269
step 1000: train loss 0.5135, train accuracy 0.7538, val loss 0.5589, val accuracy 0.7212
step 1500: train loss 0.4660, train accuracy 0.7862, val loss 0.5636, val accuracy 0.7244
step 2000: train loss 0.4639, train accuracy 0.7937, val loss 0.5553, val accuracy 0.7188
step 2500: train loss 0.4629, train accuracy 0.7969, val loss 0.5368, val accuracy 0.7475
step 3000: train loss 0.4433, train accuracy 0.7975, val loss 0.5751, val accuracy 0.7113
step 3500: train loss 0.4258, train accuracy 0.7987, val loss 0.5695, val accuracy 0.7081
step 4000: train loss 0.4067, train accuracy 0.8131, val loss 0.5831, val accuracy 0.7125
step 4500: train loss 0.4218, train accuracy 0.8125, val loss 0.5686, val accuracy 0.7312
step 5000: train loss 0.4156, train accuracy 0.8175, val loss 0

In [127]:
# possible improvements:
# hyperparams lr etc..
# add layer norm
# use gelu
# residual layer?? 
# drop thingy
# how the words are aggregated to sentence as a whole.. but would require using custom from scratch model