In [1]:
import json
# We omit warnings to keep the output clean
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from datasets import load_dataset
import matplotlib.pyplot as plt 
import nltk

from common_utils import load_glove_embeddings, set_seed, EmbeddingMatrix

In [2]:
# set seed 
set_seed()

# initialize parameters
BATCH_SIZE = 32
INPUT_SIZE = 100 # word embedding size 
HIDDEN_SIZE = 128 # just as a starter to see 
NUM_EPOCHS = 100 
EMBEDDING_DIM=100
GRADIENT_CLIP=5

In [3]:
# load dataset from huggingface first 
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

with open('result/word2idx.json', "r") as file:
    word2idx = json.load(file)

In [4]:
# initialize word embeddings
word_embeddings = load_glove_embeddings()

embeddings = [word_embeddings[word] for word in word_embeddings.keys()]

embedding_matrix_np = np.array(embeddings)
embedding_matrix_np = np.vstack((embedding_matrix_np, np.zeros((1, 100)))) # add a row of zeros for padding

print(embedding_matrix_np.shape)


Loading GloVe embeddings...


Repo card metadata block was not found. Setting CardData to empty.


Total GloVe words loaded: 400000
(400001, 100)


In [5]:
from torch.utils.data import Dataset, DataLoader 

# TODO: change the num_tokens 
class EmbeddingsDataset2(Dataset):
  def __init__(self, X, y, num_tokens_per_sentence=8, word_embeddings=word_embeddings):
    self.num_tokens_per_sentence = num_tokens_per_sentence
    self.word_embeddings = word_embeddings
    self.X = X # train_dataset['text']
    self.y = y # train_dataset['label']
    self.len = len(self.X)

  def __getitem__(self, index):
    # tokenize the sentence 
    tokens = self.tokenize_sentence(self.X[index])
    # convert each token to embeddings 
    sentence_tensor = self.convert_sentence_into_indices(tokens)
    label = torch.tensor(self.y[index], dtype=torch.long)
    return sentence_tensor, label 

  def __len__(self):
    return self.len 

  def tokenize_sentence(self, x): 
    '''
    returns a list containing the embeddings of each token 
    '''
    tokens = nltk.word_tokenize(x.lower())
    return tokens 
  
  def convert_sentence_into_indices(self, tokens):
    indices = []
    num_tokens_used = 0 
    for token in tokens:
      if num_tokens_used == self.num_tokens_per_sentence:
        break # we have enough of tokens from the sentence 
      if token in word2idx:
        indices.append(word2idx[token])
        num_tokens_used += 1 
    # # if not enough tokens in the sentence, use index of ?? 
    if len(indices) < self.num_tokens_per_sentence:
      padding = [(embedding_matrix_np.shape[0] - 1 ) for _ in range(self.num_tokens_per_sentence - len(indices))]
      indices.extend(padding)
    #print(indices)
    indices = torch.tensor(indices, dtype=torch.long)
    return indices
  

In [6]:
train_dataset_ed = EmbeddingsDataset2(train_dataset['text'], train_dataset['label'])
validation_dataset_ed = EmbeddingsDataset2(validation_dataset['text'], validation_dataset['label'])
test_dataset_ed = EmbeddingsDataset2(test_dataset['text'], test_dataset['label'])

# implement minibatch training 
train_dataloader = DataLoader(train_dataset_ed, batch_size=BATCH_SIZE, shuffle=True)
validation_dataloader = DataLoader(validation_dataset_ed, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset_ed, batch_size=BATCH_SIZE, shuffle=True)

In [12]:
# using nn.Embeddings 
import torch
import torch.nn as nn



class VanillaRNNWithEmbedding(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, embedding_matrix_torch=torch.tensor(embedding_matrix_np, dtype=torch.float)):
        super(VanillaRNNWithEmbedding, self).__init__()
        # Embedding layer
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix_torch, freeze=True, padding_idx=embedding_matrix_torch.shape[0]-1)
        self.num_layers = num_layers 
        self.hidden_size = hidden_size 
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True) # this is the num rows of the input matrix 
        self.fc = nn.Linear(hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

 
    def forward(self, x):
        # Pass input through embedding layer
        x = self.embedding(x)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, dtype=torch.float).to(x.device)
        # Pass the embeddings through the RNN layer
        out, hidden = self.rnn(x, h0)
        # Max pooling
        #out, _ = torch.max(out, dim=1)
        # Only take the last output for each sequence
        res = hidden[-1]
        # Pass through the fully connected layer
        res = self.fc(res)
        # Apply sigmoid activation (for binary classification)
        res = self.sigmoid(res)
        
        return res


In [8]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.


In [9]:
def train_loop_bce(train_dataloader, model, loss_fn, optimizer):
    if train_on_gpu:
        model.cuda()
    model.train()
    num_batches = len(train_dataloader)
    size = len(train_dataloader.dataset)
    train_loss, train_correct = 0, 0
    for batch_no, (X_batch, y_batch) in enumerate(train_dataloader):
        if train_on_gpu:
            X_batch = X_batch.cuda()
            y_batch = y_batch.cuda()
        
        # Forward pass
        pred = model(X_batch)
        
        pred = pred.squeeze(1)
        pred_binary = (pred >= 0.5).long()
        loss = loss_fn(pred, y_batch.float())
        train_loss += loss.item() 
        train_correct += (pred_binary==y_batch.long()).sum().item() 
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()

        # TODO add main branch
        # torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)
        
        optimizer.step()
    
    train_loss /= num_batches 
    train_correct /= size 

    return train_loss, train_correct 
   

def test_loop_bce(validate_dataloader, model, loss_fn):
    if train_on_gpu:
        model.cuda()
    model.eval()
    num_batches = len(validate_dataloader)
    size = len(validate_dataloader.dataset)
    test_loss, test_correct = 0, 0

    with torch.no_grad():
        for X_batch, y_batch in validate_dataloader:
            if train_on_gpu:
                X_batch = X_batch.cuda()
                y_batch = y_batch.cuda()

            pred = model(X_batch)
            pred = pred.squeeze(1)
            pred_binary = (pred >= 0.5).long()
            test_loss += loss_fn(pred, y_batch.float()).item()
            test_correct += (pred_binary == y_batch.long()).sum().item()

    test_loss /= num_batches
    test_correct /= size
    return test_loss, test_correct

In [10]:
def train(model, optim, criterion, training_dataloader, validation_dataloader, epoch = NUM_EPOCHS):
  validation_acc = [] 
  train_acc = []
  train_losses, validate_losses = [], []
  for i in range(epoch):
    train_loss, train_correct = train_loop_bce(training_dataloader, model, criterion, optim) 
    validate_loss, validate_correct = test_loop_bce(validation_dataloader, model, criterion)
    validation_acc.append(validate_correct)
    train_acc.append(train_correct)
    train_losses.append(train_loss)
    validate_losses.append(validate_loss)

    print(f"Epoch {i+1}, Train Loss: {train_loss:.4f}, Validate Loss: {validate_loss:.4f}")
    #if i%10 == 0:
    print(f"Epoch:{i+1} \t Train Acc:{train_correct} \t Validation Acc:{validate_correct}")
  return train_acc, validation_acc, train_losses, validate_losses

In [14]:
vanilla_rnn = VanillaRNNWithEmbedding(input_size=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, num_layers=2, num_classes=1)
optim = torch.optim.Adam(vanilla_rnn.parameters(), lr=0.0001)
criterion = nn.BCELoss()

train_acc_vanilla_rnn, validation_acc_vanilla_rnn, train_loss_vanilla_rnn, validation_loss_vanilla_rnn = train(vanilla_rnn, optim, criterion, train_dataloader, validation_dataloader, epoch=50)

Epoch 1, Train Loss: 0.6929, Validate Loss: 0.6921
Epoch:1 	 Train Acc:0.5145369284876905 	 Validation Acc:0.5075046904315197
Epoch 2, Train Loss: 0.6862, Validate Loss: 0.6875
Epoch:2 	 Train Acc:0.5487690504103165 	 Validation Acc:0.5412757973733584
Epoch 3, Train Loss: 0.6783, Validate Loss: 0.6827
Epoch:3 	 Train Acc:0.5728018757327081 	 Validation Acc:0.5534709193245778
Epoch 4, Train Loss: 0.6718, Validate Loss: 0.6847
Epoch:4 	 Train Acc:0.5807737397420868 	 Validation Acc:0.5487804878048781
Epoch 5, Train Loss: 0.6668, Validate Loss: 0.6810
Epoch:5 	 Train Acc:0.5917936694021102 	 Validation Acc:0.5684803001876173
Epoch 6, Train Loss: 0.6611, Validate Loss: 0.6864
Epoch:6 	 Train Acc:0.603282532239156 	 Validation Acc:0.5469043151969981
Epoch 7, Train Loss: 0.6584, Validate Loss: 0.6915
Epoch:7 	 Train Acc:0.6067995310668229 	 Validation Acc:0.5600375234521576
Epoch 8, Train Loss: 0.6552, Validate Loss: 0.6879
Epoch:8 	 Train Acc:0.6160609613130129 	 Validation Acc:0.5590994371

KeyboardInterrupt: 