In [1]:
import json
# We omit warnings to keep the output clean
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from datasets import load_dataset
import matplotlib.pyplot as plt 
import nltk

from common_utils import load_glove_embeddings, set_seed, EmbeddingMatrix

In [2]:
# set seed 
set_seed()

# initialize parameters
BATCH_SIZE = 32
INPUT_SIZE = 100 # word embedding size 
HIDDEN_SIZE = 128 # just as a starter to see 
NUM_EPOCHS = 100 
EMBEDDING_DIM=100
GRADIENT_CLIP=5

In [3]:
# load dataset from huggingface first 
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

with open('result/word2idx.json', "r") as file:
    word2idx = json.load(file)

In [4]:
# initialize word embeddings
word_embeddings = EmbeddingMatrix.load()
word_embeddings.add_padding()

print("The index of <PAD> is: ", word_embeddings.pad_idx)

print(word_embeddings.to_tensor[word_embeddings.pad_idx])

The index of <PAD> is:  18030
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.], dtype=torch.float64)


In [5]:
# create train, validate and test datasets and dataloaders
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class EmbeddingsDataset(Dataset):
    def __init__(self, X, y, word_embeddings:EmbeddingMatrix =word_embeddings):
        self.word_embeddings = word_embeddings
        self.X = X # train_dataset['text']
        self.y = y # train_dataset['label']
        self.len = len(self.X)

    def __getitem__(self, index):
        # tokenize the sentence
        tokens = self.tokenize_sentence(self.X[index])
        return tokens, self.y[index] 

    def __len__(self):
        return self.len 

    def tokenize_sentence(self, x): 
        '''
    returns a list containing the embeddings of each token 
    '''
        tokens = nltk.word_tokenize(x)
        # word tokens to index, skip if token is not in the word embeddings
        tokens = [self.word_embeddings.get_idx(token) for token in tokens if self.word_embeddings.get_idx(token) is not None]
        return tokens


def pad_collate(batch, pad_value):
    (xx, yy) = zip(*batch)
    # convert xx to a tensor
    xx = [torch.tensor(x, dtype=torch.int64) for x in xx]
    xx_pad = pad_sequence(xx, batch_first=True, padding_value=pad_value)
    return xx_pad, torch.tensor(yy, dtype=torch.long)

In [6]:
train_dataset_ed = EmbeddingsDataset(
    train_dataset["text"], train_dataset["label"]
)
validation_dataset_ed = EmbeddingsDataset(
    validation_dataset["text"], validation_dataset["label"]
)
test_dataset_ed = EmbeddingsDataset(test_dataset["text"], test_dataset["label"])

pad_value = word_embeddings.pad_idx
# implement minibatch training
train_dataloader = DataLoader(
    train_dataset_ed,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=lambda x: pad_collate(x, pad_value),
)
validation_dataloader = DataLoader(
    validation_dataset_ed,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=lambda x: pad_collate(x, pad_value),
)
test_dataloader = DataLoader(
    test_dataset_ed,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=lambda x: pad_collate(x, pad_value),
)

In [7]:
# using nn.Embeddings 
class VanillaRNNWithEmbedding(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes,  embedding_matrix_torch:torch.tensor, padding_idx: int):
        super(VanillaRNNWithEmbedding, self).__init__()
        # Embedding layer
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix_torch, freeze=True, padding_idx=padding_idx)
        self.num_layers = num_layers 
        self.hidden_size = hidden_size 
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True) # this is the num rows of the input matrix 
        self.fc = nn.Linear(hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

 
    def forward(self, x):
        # Pass input through embedding layer
        x = self.embedding(x).float()
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, dtype=torch.float).to(x.device)
        # Pass the embeddings through the RNN layer
        out, _ = self.rnn(x, h0)
        # Max pooling
        res, _ = torch.max(out, dim=1)
        # Only take the last output for each sequence
        #res = hidden[-1]
        # Pass through the fully connected layer
        res = self.fc(res)
        # Apply sigmoid activation (for binary classification)
        res = self.sigmoid(res)
        
        return res


In [8]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.


In [9]:
def train_loop_bce(train_dataloader, model, loss_fn, optimizer):
    if train_on_gpu:
        model.cuda()
    model.train()
    num_batches = len(train_dataloader)
    size = len(train_dataloader.dataset)
    train_loss, train_correct = 0, 0
    for batch_no, (X_batch, y_batch) in enumerate(train_dataloader):
        if train_on_gpu:
            X_batch = X_batch.cuda()
            y_batch = y_batch.cuda()
        
        # Forward pass
        pred = model(X_batch)
        
        pred = pred.squeeze(1)
        pred_binary = (pred >= 0.5).long()
        loss = loss_fn(pred, y_batch.float())
        train_loss += loss.item() 
        train_correct += (pred_binary==y_batch.long()).sum().item() 
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()

        # TODO add main branch
        # torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)
        
        optimizer.step()
    
    train_loss /= num_batches 
    train_correct /= size 

    return train_loss, train_correct 
   

def test_loop_bce(validate_dataloader, model, loss_fn):
    if train_on_gpu:
        model.cuda()
    model.eval()
    num_batches = len(validate_dataloader)
    size = len(validate_dataloader.dataset)
    test_loss, test_correct = 0, 0

    with torch.no_grad():
        for X_batch, y_batch in validate_dataloader:
            if train_on_gpu:
                X_batch = X_batch.cuda()
                y_batch = y_batch.cuda()

            pred = model(X_batch)
            pred = pred.squeeze(1)
            pred_binary = (pred >= 0.5).long()
            test_loss += loss_fn(pred, y_batch.float()).item()
            test_correct += (pred_binary == y_batch.long()).sum().item()

    test_loss /= num_batches
    test_correct /= size
    return test_loss, test_correct

In [10]:
def train(model, optim, criterion, training_dataloader, validation_dataloader, epoch = NUM_EPOCHS):
  validation_acc = [] 
  train_acc = []
  train_losses, validate_losses = [], []
  for i in range(epoch):
    train_loss, train_correct = train_loop_bce(training_dataloader, model, criterion, optim) 
    validate_loss, validate_correct = test_loop_bce(validation_dataloader, model, criterion)
    validation_acc.append(validate_correct)
    train_acc.append(train_correct)
    train_losses.append(train_loss)
    validate_losses.append(validate_loss)

    print(f"Epoch {i+1}, Train Loss: {train_loss:.4f}, Validate Loss: {validate_loss:.4f}")
    #if i%10 == 0:
    print(f"Epoch:{i+1} \t Train Acc:{train_correct} \t Validation Acc:{validate_correct}")
  return train_acc, validation_acc, train_losses, validate_losses

In [None]:
vanilla_rnn = VanillaRNNWithEmbedding(input_size=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, num_layers=2, num_classes=1, embedding_matrix_torch=word_embeddings.to_tensor, padding_idx=word_embeddings.pad_idx)
optim = torch.optim.Adam(vanilla_rnn.parameters(), lr=0.0001)
criterion = nn.BCELoss()

train_acc_vanilla_rnn, validation_acc_vanilla_rnn, train_loss_vanilla_rnn, validation_loss_vanilla_rnn = train(vanilla_rnn, optim, criterion, train_dataloader, validation_dataloader, epoch=50)

Epoch 1, Train Loss: 0.6718, Validate Loss: 0.6189
Epoch:1 	 Train Acc:0.6039859320046893 	 Validation Acc:0.6660412757973734
Epoch 2, Train Loss: 0.5866, Validate Loss: 0.5654
Epoch:2 	 Train Acc:0.6936694021101993 	 Validation Acc:0.7101313320825516
Epoch 3, Train Loss: 0.5514, Validate Loss: 0.5520
Epoch:3 	 Train Acc:0.720398593200469 	 Validation Acc:0.7166979362101313
Epoch 4, Train Loss: 0.5347, Validate Loss: 0.5341
Epoch:4 	 Train Acc:0.7316529894490035 	 Validation Acc:0.7373358348968105
Epoch 5, Train Loss: 0.5155, Validate Loss: 0.5705
Epoch:5 	 Train Acc:0.7468933177022274 	 Validation Acc:0.700750469043152
Epoch 6, Train Loss: 0.5029, Validate Loss: 0.5124
Epoch:6 	 Train Acc:0.7533411488862837 	 Validation Acc:0.7626641651031895
Epoch 7, Train Loss: 0.4928, Validate Loss: 0.5073
Epoch:7 	 Train Acc:0.7610785463071512 	 Validation Acc:0.7560975609756098
Epoch 8, Train Loss: 0.4819, Validate Loss: 0.5129
Epoch:8 	 Train Acc:0.7685814771395076 	 Validation Acc:0.74859287054