# Preparing the movie review data

First I download the dataset from huggingFace community

In [1]:
from datasets import load_dataset
import torch
import torch.nn as nn
dataset = load_dataset("imdb")

  from .autonotebook import tqdm as notebook_tqdm


Then I turn the trainset into train and validation set

In [2]:
from torch.utils.data.dataset import random_split
torch.manual_seed(1)
train_dataset, valid_dataset = random_split(dataset["train"], [20000, 5000])
test_dataset = dataset["test"]

Now I use a costum tokenizer to choose unique words. In the counter I have a dictionary which connects each word with its frequency.

In [3]:
import re
from collections import Counter, OrderedDict
def tokenizer(text):
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

token_counts = Counter()

for line in train_dataset:
    text = line["text"]
    tokens = tokenizer(text)
    token_counts.update(tokens)

print('Vocab_size:' , len(token_counts))

Vocab_size: 69023


Now I encode each unique token into integers. index 1 is reserved for those tokens which are present in test data and are not present in training data. So we assign all of them to 1. Index 0 is explained later. 

In [4]:
# 1. Sort tokens by frequency
sorted_tokens = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)

# 2. Assign IDs
vocab = {"<pad>": 0, "<unk>": 1}
for idx, (token, _) in enumerate(sorted_tokens, start=2):
    vocab[token] = idx

print("Vocab size:", len(vocab))
print(list(vocab.items())[:20])  # first 20 tokens

Vocab size: 69025
[('<pad>', 0), ('<unk>', 1), ('the', 2), ('and', 3), ('a', 4), ('of', 5), ('to', 6), ('is', 7), ('it', 8), ('in', 9), ('i', 10), ('this', 11), ('that', 12), ('s', 13), ('was', 14), ('as', 15), ('for', 16), ('with', 17), ('movie', 18), ('but', 19)]


In [5]:
text_pipeline = lambda x: [vocab.get(token, vocab["<unk>"]) for token in tokenizer(x)]
label_pipeline = lambda x: int(x)

In [6]:
train_dataset[0]

{'text': 'An extra is called upon to play a general in a movie about the Russian Revolution. However, he is not any ordinary extra. He is Serguis Alexander, former commanding general of the Russia armies who is now being forced to relive the same scene, which he suffered professional and personal tragedy in, to satisfy the director who was once a revolutionist in Russia and was humiliated by Alexander. It can now be the time for this broken man to finally "win" his penultimate battle. This is one powerful movie with meticulous direction by Von Sternberg, providing the greatest irony in Alexander\'s character in every way he can. Jannings deserved his Oscar for the role with a very moving performance playing the general at his peak and at his deepest valley. Powell lends a sinister support as the revenge minded director and Brent is perfect in her role with her face and movements showing so much expression as Jannings\' love. All around brilliance. Rating, 10.',
 'label': 1}

In [7]:
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for example in batch:
        _label = example["label"]
        _text = example["text"]

        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)

        text_list.append(processed_text)
        lengths.append(processed_text.size(0))

    lengths = torch.tensor(lengths)
    label_list = torch.tensor(label_list)
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)

    return padded_text_list, label_list, lengths


from torch.utils.data import DataLoader

train_dataLoader = DataLoader(train_dataset,
                              batch_size=4,
                              shuffle=False,
                              collate_fn=collate_batch)


In [8]:
text_batch, label_batch, length_batch = next(iter(train_dataLoader))

# Label_batch contains labels of each sentence whether it is positive(1) or negative(0) obviously for 4 sentences
# text_batch contains contains the sentence turned into a tensor of numbers, each number representing a word in the dictionary
# length is the length of the review before padding


In [None]:
BATCH_SIZE = 32

train_dl = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=BATCH_SIZE,shuffle=False,collate_fn=collate_batch)

## Embedding layers for sentence Embedding

Now that we created DataLoaders, we are ready for modelling. however there is still one issue. We should turn the tokens which are integers now into a vector of unique numbers to avoid the curse of dimensionality. For this case we use an embedding matrix to first reduce the size of the each input vector, and then turn these pure integers into a vector of real numbers which are between [-1,1]. For example if we have 100,000 tokens we can represent them with a vector of dimension=100.    

# Building an RNN model for Sentiment analysis

In [10]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_size, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        # num_embeddings is the vocabulary size(Or the number of token + 2)||embedding dimension is the dimension of output
        self.embedding = nn.Embedding(num_embeddings=vocab_size,embedding_dim=embed_size,padding_idx=0)
        self.rnn = nn.LSTM(embed_size,rnn_hidden_size, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size,fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size,1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(
            out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden , cell) = self.rnn(out)
        out = hidden[-1, : , :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)

        return out

In [13]:
# Device_agnostic code
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [14]:
VOCAB_SIZE = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64
model = RNN(vocab_size=VOCAB_SIZE, embed_size=embed_dim,rnn_hidden_size=rnn_hidden_size, fc_hidden_size=fc_hidden_size).to(device)

model

RNN(
  (embedding): Embedding(69025, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

## Train and Evaluate functions

In [30]:
def train(dataloader: torch.utils.data.DataLoader,
          optimizer: torch.optim,
          loss_fn: nn.Module,
          model: nn.Module,
          device = device):
    
    model.train()
    total_acc, total_loss = 0, 0 
    for text_batch, label_batch, lengths in dataloader:
        text_batch = text_batch.to(device)
        label_batch = label_batch.to(device)
        lengths = lengths.to(device)
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:,0]
        loss = loss_fn(pred, label_batch.float())
        loss.backward()
        optimizer.step()
        total_acc += (
            (pred >= 0.5).float() == label_batch
        ).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)


In [31]:
def evaluate(dataloader: torch.utils.data.DataLoader,
             loss_fn: nn.Module,
             model: nn.Module,
             device = device):
    model.eval()
    total_acc, total_loss = 0 , 0 
    with torch.inference_mode():
        for text_batch, label_batch, lengths in dataloader:
            text_batch = text_batch.to(device)
            label_batch = label_batch.to(device)
            lengths = lengths.to(device)
            pred = model(text_batch, lengths)[:,0]
            loss = loss_fn(pred, label_batch.float())
            total_acc += (
                (pred >= 0.5).float()  == label_batch
            ).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [32]:
# optimizer and loss function for the model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=2
)
loss_fn = nn.BCELoss()

In [33]:
from tqdm import tqdm
num_epochs = 10
for epoch in tqdm(range(num_epochs)):
    acc_train, loss_train = train(train_dl,
                                  optimizer=optimizer,
                                  loss_fn=loss_fn,
                                  model=model, device=device
                                  )
    acc_valid, loss_valid = evaluate(valid_dl,
                                     loss_fn=loss_fn,
                                     model=model,
                                     device=device)
    
    scheduler.step(loss_valid)

    print(f'Epoch: {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')


 10%|█         | 1/10 [01:14<11:07, 74.19s/it]

Epoch: 0 accuracy: 0.7586 val_accuracy: 0.7752


 20%|██        | 2/10 [02:30<10:02, 75.33s/it]

Epoch: 1 accuracy: 0.8199 val_accuracy: 0.8154


 30%|███       | 3/10 [03:45<08:48, 75.45s/it]

Epoch: 2 accuracy: 0.8442 val_accuracy: 0.8004


 40%|████      | 4/10 [05:02<07:34, 75.77s/it]

Epoch: 3 accuracy: 0.8769 val_accuracy: 0.8172


 50%|█████     | 5/10 [06:15<06:13, 74.72s/it]

Epoch: 4 accuracy: 0.9053 val_accuracy: 0.8336


 60%|██████    | 6/10 [07:33<05:03, 75.85s/it]

Epoch: 5 accuracy: 0.9064 val_accuracy: 0.8286


 70%|███████   | 7/10 [08:51<03:49, 76.54s/it]

Epoch: 6 accuracy: 0.9347 val_accuracy: 0.8448


 80%|████████  | 8/10 [10:08<02:33, 76.74s/it]

Epoch: 7 accuracy: 0.9472 val_accuracy: 0.8472


 90%|█████████ | 9/10 [11:25<01:16, 76.89s/it]

Epoch: 8 accuracy: 0.9544 val_accuracy: 0.8444


100%|██████████| 10/10 [12:41<00:00, 76.18s/it]

Epoch: 9 accuracy: 0.9653 val_accuracy: 0.8588





In [35]:
acc_test, _ = evaluate(test_dl,
                       loss_fn=loss_fn,
                       model=model)
print(f'test_accuracy: {acc_test:.4f}')

test_accuracy: 0.8515
