## RNN + LSTM

[Code](https://github.com/priyammaz/PyTorch-Adventures/blob/main/PyTorch%20for%20NLP/Recurrent%20Neural%20Networks/IMDB%20Classification/Sequence%20Classification.ipynb)

In [3]:
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import Counter
import re

import nltk

# One-time setup: uncomment the next line to download the NLTK stopword corpus.
# nltk.download('stopwords')
from nltk.corpus import stopwords

# Materialise the English stopword list as a set for fast membership tests.
# NOTE: this rebinds `stopwords`, shadowing the imported corpus object from here on.
stopwords = set(stopwords.words("english"))

## Vocabulary (Word-to-Index Mapping)

In [4]:
# Build the vocabulary (word <-> index maps) from the IMDB training split.
path_to_data = "./aclImdb/train"

path_to_pos_folder = os.path.join(path_to_data, "pos")
path_to_neg_folder = os.path.join(path_to_data, "neg")

# os.listdir returns entries in arbitrary order; sort so the file list (and
# therefore dataset indexing) is reproducible across runs and machines.
path_to_pos_txt = [os.path.join(path_to_pos_folder, file) for file in sorted(os.listdir(path_to_pos_folder))]
path_to_neg_txt = [os.path.join(path_to_neg_folder, file) for file in sorted(os.listdir(path_to_neg_folder))]

training_files = path_to_pos_txt + path_to_neg_txt

all_text = []  # every kept token from every review, in file order
len_words = []  # number of kept tokens per review

for file in tqdm(training_files):
    with open(file, "r", encoding="utf-8") as f:
        # Read the whole file, exactly as IMDBDataset.__getitem__ does, so the
        # vocabulary and the dataset see the same tokens. This also avoids the
        # IndexError that `readlines()[0]` raises on an empty file.
        text = f.read().lower()

    text = re.sub(r"[^\w\s]", "", text)  # Remove all punctuation
    text = text.split(" ")  # Split by space
    text = [word for word in text if word not in stopwords]  # Remove stopwords

    len_words.append(len(text))
    all_text.extend(text)

# Keep only words that occur more than `min_word_count` times in the corpus.
min_word_count = 500
unique_counts = dict(Counter(all_text))
words = sorted(key for key, value in unique_counts.items() if value > min_word_count)

# Special tokens go last: out-of-vocabulary words and batch padding.
words.append("<unk>")
words.append("<pad>")

word2index = {word: i for i, word in enumerate(words)}
index2word = dict(enumerate(words))


100%|██████████| 25000/25000 [00:04<00:00, 5443.88it/s]


## IMDB Dataset

In [5]:
class IMDBDataset(Dataset):
    """Map-style dataset over IMDB review text files.

    Each item is a ``(sample, label)`` pair where ``sample`` is a 1-D long
    tensor of token indices (at most ``max_seq_length`` of them) and ``label``
    is ``0`` for a negative review and ``1`` otherwise.

    Args:
        training_files: Sequence of paths to review text files. A file is
            treated as negative when one of its path components is ``neg``.
        word2index: Mapping from word to integer index; must contain the
            ``"<unk>"`` key, used for out-of-vocabulary words.
        max_seq_length: Maximum number of tokens returned per review. Longer
            reviews are cropped to a random contiguous window of this length.
    """

    def __init__(self, training_files, word2index, max_seq_length=200):
        self.training_files = training_files
        self.tokenizer = word2index
        self.max_len = max_seq_length

    def __len__(self):
        return len(self.training_files)

    @staticmethod
    def _label_from_path(path_to_text):
        """Return 0 if ``neg`` is a component of the path, else 1."""
        # Compare whole path components rather than substrings, so an
        # unrelated "neg" elsewhere in the path (e.g. /home/negan/...) does
        # not flip the label.
        components = os.path.normpath(path_to_text).split(os.sep)
        return 0 if "neg" in components else 1

    def _random_crop(self, sample):
        """Return a random contiguous window of at most ``max_len`` tokens."""
        overflow = len(sample) - self.max_len
        if overflow <= 0:
            return sample
        # np.random.randint excludes its upper bound; valid start offsets are
        # 0..overflow inclusive, hence the +1 (otherwise the final window is
        # never sampled).
        start_idx = np.random.randint(0, overflow + 1)
        return sample[start_idx : start_idx + self.max_len]

    def __getitem__(self, idx):
        path_to_text = self.training_files[idx]
        with open(path_to_text, "r", encoding="utf-8") as f:
            text = f.read()

        # Same normalisation as used when the vocabulary was built:
        # lowercase, strip punctuation, split on spaces, drop stopwords.
        text = text.lower()
        text = re.sub(r"[^\w\s]", "", text)
        text = text.split(" ")
        text = [word for word in text if word not in stopwords]

        unk_index = self.tokenizer["<unk>"]
        tokenized = [self.tokenizer.get(word, unk_index) for word in text]
        # Explicit dtype: an empty token list would otherwise become a float
        # tensor, which is not a valid index tensor for nn.Embedding.
        sample = torch.tensor(tokenized, dtype=torch.long)
        sample = self._random_crop(sample)

        return sample, self._label_from_path(path_to_text)


# Training-split dataset, using the default maximum sequence length.
dataset = IMDBDataset(training_files=training_files, word2index=word2index)


def data_collator(batch):
    """Collate ``(sample, label)`` pairs into one padded batch.

    Args:
        batch: Iterable of ``(token_tensor, label)`` pairs of varying length.

    Returns:
        A tuple ``(texts, label)``: the token tensors right-padded with the
        ``"<pad>"`` index to a common length (batch dimension first), and a
        tensor holding the labels in the same order.
    """
    pairs = list(batch)
    sequences = [tokens for tokens, _ in pairs]
    targets = torch.tensor([target for _, target in pairs])
    padded = nn.utils.rnn.pad_sequence(
        sequences,
        batch_first=True,
        padding_value=word2index["<pad>"],
    )
    return padded, targets


dataloader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator,
)

# Sanity check: pull a single collated batch and display it.
batch = next(iter(dataloader))
print(batch)


(tensor([[831, 794, 424,  ..., 991, 991, 991],
        [990, 990, 990,  ..., 520, 990, 990],
        [990, 740, 762,  ..., 991, 991, 991],
        ...,
        [990, 990, 517,  ..., 991, 991, 991],
        [679,   1,  79,  ..., 991, 991, 991],
        [884, 990, 295,  ..., 991, 991, 991]]), tensor([0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 1, 1, 0, 1]))


## Embedding

In [6]:
# Toy lookup table: 5 rows (one per word index), each a 3-dimensional vector.
emb = nn.Embedding(num_embeddings=5, embedding_dim=3)

print("Embedding Weights")
print(emb.weight)

# A sentence is a 1-D tensor of word indices; the lookup returns one row per index.
print("Embedding for Single Sentence")
sentence = torch.tensor([1, 3])
single_embedded = emb(sentence)
print(single_embedded)
print(single_embedded.shape)

# A batch is a 2-D tensor of indices; the lookup adds a trailing embedding axis.
print("Embedding for Batch Sentence")
batch_sentences = torch.stack([sentence, sentence, sentence])
batch_embedded = emb(batch_sentences)
print(batch_embedded)
print(batch_embedded.shape)

Embedding Weights
Parameter containing:
tensor([[ 1.2869, -0.3814, -0.0954],
        [ 0.5240, -0.6788, -1.7196],
        [ 1.9333,  0.9930, -0.8217],
        [ 1.2559,  0.4559,  0.5881],
        [-2.4945, -0.1725,  0.1925]], requires_grad=True)
Embedding for Single Sentence
tensor([[ 0.5240, -0.6788, -1.7196],
        [ 1.2559,  0.4559,  0.5881]], grad_fn=<EmbeddingBackward0>)
torch.Size([2, 3])
Embedding for Batch Sentence
tensor([[[ 0.5240, -0.6788, -1.7196],
         [ 1.2559,  0.4559,  0.5881]],

        [[ 0.5240, -0.6788, -1.7196],
         [ 1.2559,  0.4559,  0.5881]],

        [[ 0.5240, -0.6788, -1.7196],
         [ 1.2559,  0.4559,  0.5881]]], grad_fn=<EmbeddingBackward0>)
torch.Size([3, 2, 3])


## LSTM

In [9]:
# Shapes for a toy LSTM forward pass.
batch_size, sequence_length = 5, 15
input_size, hidden_size, num_layers = 10, 20, 2

lstm = nn.LSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    batch_first=True,
)

# Random input laid out as (batch, sequence, features), plus zeroed initial
# hidden/cell states with one slice per layer.
rand = torch.rand(batch_size, sequence_length, input_size)
state_shape = (num_layers, batch_size, hidden_size)
h0 = torch.zeros(state_shape)
c0 = torch.zeros(state_shape)

output, (hidden, cell) = lstm(rand, (h0, c0))

for result in (output, hidden, cell):
    print(result.shape)

# The last layer's final hidden state for sample 0 matches the last time step
# of the output for sample 0.
hidden[-1, 0] == output[0, -1]


torch.Size([5, 15, 20])
torch.Size([2, 5, 20])
torch.Size([2, 5, 20])


tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True])

## LSTM Net

In [None]:
class LSTMNet(nn.Module):
    """Embedding -> LSTM -> dropout -> linear sequence classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each word embedding.
        hidden_size: Size of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits for a batch of token-index sequences.

        Args:
            x: 2-D integer tensor of token indices, batch dimension first.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalised logits.
        """
        # Unpacking also enforces that x is 2-D.
        batch_size, _ = x.size()
        embeddings = self.embedding(x)
        h0, c0 = self.init_hidden(batch_size)
        _, (hn, _) = self.lstm(embeddings, (h0, c0))
        # hn holds the final hidden state of every layer; take the top layer.
        # NOTE(review): the LSTM runs over every time step it is given, so for
        # right-padded batches this state is taken after the padding tokens.
        last_hidden = hn[-1]
        # Dropout for regularisation, then project to class logits.
        return self.fc(self.dropout(last_hidden))

    def init_hidden(self, batch_size):
        """Return zeroed ``(h0, c0)`` states for a batch of ``batch_size``.

        The states are allocated with the same device and dtype as the model's
        weights, so the module keeps working after ``.to(device)`` or
        ``.double()``; the LSTM requires its input and states to match.
        """
        reference = self.fc.weight
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        h0 = reference.new_zeros(state_shape)
        c0 = reference.new_zeros(state_shape)
        return (h0, c0)

In [11]:
def _list_review_files(split_dir):
    """Return the paths of every review file under split_dir/pos and split_dir/neg."""
    return [
        os.path.join(split_dir, sentiment, name)
        for sentiment in ("pos", "neg")
        for name in sorted(os.listdir(os.path.join(split_dir, sentiment)))
    ]


# IMDBDataset indexes a list of review file paths, so expand each split
# directory into its files instead of passing the directory string itself.
train_dataset = IMDBDataset(_list_review_files("./aclImdb/train"), word2index)
test_dataset = IMDBDataset(_list_review_files("./aclImdb/test"), word2index)

In [12]:
# Prefer Apple's MPS backend when present, then CUDA, and fall back to the CPU
# so the notebook also runs on machines without an accelerator.
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

model = LSTMNet(vocab_size=len(word2index), embedding_dim=128, hidden_size=256, num_layers=1, num_classes=2).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()
num_epochs = 15

# Validate on the held-out test split rather than on the training reviews, so
# the validation metrics measure generalisation instead of memorisation.
held_out_dir = "./aclImdb/test"
held_out_files = [
    os.path.join(held_out_dir, sentiment, name)
    for sentiment in ("pos", "neg")
    for name in sorted(os.listdir(os.path.join(held_out_dir, sentiment)))
]
val_dataset = IMDBDataset(held_out_files, word2index)

train_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=data_collator)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=data_collator)

In [13]:
def train(model, epochs, optimizer, loss_fn, train_loader, val_loader):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        model: Module mapping a batch of token indices to class logits. Batches
            are moved to the device its parameters live on.
        epochs: Number of full passes over ``train_loader``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.
        train_loader: Iterable of ``(text, label)`` batches used for updates.
        val_loader: Iterable of ``(text, label)`` batches used for evaluation.

    Returns:
        A tuple ``(log_training, model)``. ``log_training`` maps ``"epoch"``,
        ``"training_loss"``, ``"validation_loss"``, ``"training_acc"`` and
        ``"validation_acc"`` to lists with one entry per epoch (the epoch
        index is zero-based).
    """
    # Use the model's own device rather than a module-level global.
    device = next(model.parameters()).device

    log_training = {
        "epoch": [],
        "training_loss": [],
        "validation_loss": [],
        "training_acc": [],
        "validation_acc": [],
    }

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")

        model.train()
        training_loss_mean, training_acc_mean = _run_epoch(
            model, train_loader, loss_fn, device, "Training", optimizer
        )

        model.eval()
        with torch.no_grad():
            valid_loss_mean, valid_acc_mean = _run_epoch(
                model, val_loader, loss_fn, device, "Validation"
            )

        log_training["epoch"].append(epoch)
        log_training["training_loss"].append(training_loss_mean)
        log_training["training_acc"].append(training_acc_mean)
        log_training["validation_loss"].append(valid_loss_mean)
        log_training["validation_acc"].append(valid_acc_mean)

        print("Training Loss:", training_loss_mean)
        print("Training Acc:", training_acc_mean)
        print("Validation Loss:", valid_loss_mean)
        print("Validation Acc:", valid_acc_mean)

    return log_training, model


def _run_epoch(model, loader, loss_fn, device, desc, optimizer=None):
    """Run one pass over ``loader``; return per-sample mean (loss, accuracy).

    When ``optimizer`` is given, a gradient step is taken on every batch.
    """
    total_loss, total_correct, total_seen = 0.0, 0, 0

    for text, label in tqdm(loader, desc=desc):
        text, label = text.to(device), label.to(device)

        if optimizer is not None:
            # Clear gradients before the backward pass so nothing left over
            # from earlier work leaks into this update.
            optimizer.zero_grad()

        output = model(text)
        loss = loss_fn(output, label)

        if optimizer is not None:
            loss.backward()
            optimizer.step()

        # Weight each batch by its size so a smaller final batch does not skew
        # the epoch averages. Assumes loss_fn returns the batch mean, as the
        # default CrossEntropyLoss does.
        batch_count = label.size(0)
        total_loss += loss.item() * batch_count
        total_correct += (output.argmax(dim=-1) == label).sum().item()
        total_seen += batch_count

    if total_seen == 0:
        # Empty loader: report NaN, as an average over nothing is undefined.
        return float("nan"), float("nan")
    return total_loss / total_seen, total_correct / total_seen