## RNN + LSTM

[Code](https://github.com/priyammaz/PyTorch-Adventures/blob/main/PyTorch%20for%20NLP/Recurrent%20Neural%20Networks/IMDB%20Classification/Sequence%20Classification.ipynb)

In [12]:
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import Counter
import re

import nltk

# nltk.download('stopwords') #Download the NLTK Stopwords
from nltk.corpus import stopwords

# NOTE: this rebinds the name `stopwords` from the imported NLTK corpus reader
# to a plain set of English stopwords. The later cells use that set for
# membership tests (`word not in stopwords`).
stopwords = set(stopwords.words("english"))

## Word Embeddings

In [30]:
path_to_data = "./aclImdb/train"

# A word must occur MORE than this many times across the corpus to get its own
# vocabulary entry; everything else is represented by "<unk>".
MIN_WORD_COUNT = 500

path_to_pos_folder = os.path.join(path_to_data, "pos")
path_to_neg_folder = os.path.join(path_to_data, "neg")

# os.listdir returns entries in arbitrary order; sorting keeps `training_files`
# (and therefore dataset indices) reproducible from one run to the next.
path_to_pos_txt = [os.path.join(path_to_pos_folder, file) for file in sorted(os.listdir(path_to_pos_folder))]
path_to_neg_txt = [os.path.join(path_to_neg_folder, file) for file in sorted(os.listdir(path_to_neg_folder))]

training_files = path_to_pos_txt + path_to_neg_txt

all_text = []  # every kept token from every review, in reading order
len_words = []  # number of kept tokens per review

for file in tqdm(training_files):
    with open(file, "r", encoding="utf-8") as f:
        # Read the whole review rather than only its first line: this matches
        # what IMDBDataset.__getitem__ does and does not fail on an empty file.
        text = f.read().lower()

    text = re.sub(r"[^\w\s]", "", text)  # Remove All Punctuation
    text = text.split(" ")  # Split by Space (kept identical to the dataset's tokenisation)
    text = [word for word in text if word not in stopwords]  # Remove Stopwords

    len_words.append(len(text))
    all_text.extend(text)

unique_counts = dict(Counter(all_text))
words = sorted(word for word, count in unique_counts.items() if count > MIN_WORD_COUNT)

# Special tokens go last so they take the two highest indices.
words.append("<unk>")
words.append("<pad>")

word2index = {word: i for i, word in enumerate(words)}
index2word = dict(enumerate(words))


100%|██████████| 25000/25000 [00:02<00:00, 11588.47it/s]


## IMDB Dataset

In [37]:
class IMDBDataset(Dataset):
    """Map-style dataset that turns review text files into sequences of vocabulary indices.

    Each item is read from disk on demand, lower-cased, stripped of punctuation,
    split on single spaces, filtered for stopwords and mapped through
    ``word2index`` (words missing from the mapping become ``"<unk>"``). Reviews
    longer than ``max_seq_length`` tokens are cropped to a random contiguous
    window of exactly that length.

    Args:
        training_files: Sequence of paths to review text files. The label comes
            from the name of the directory that directly contains the file:
            ``neg`` -> 0, anything else -> 1.
        word2index: Mapping from word to integer index; must contain ``"<unk>"``.
        max_seq_length: Maximum number of tokens returned per review.
        stop_words: Optional collection of words to drop. When ``None`` (the
            default) the module-level ``stopwords`` set is used.
    """

    def __init__(self, training_files, word2index, max_seq_length=200, stop_words=None):
        self.training_files = training_files
        self.tokenizer = word2index
        self.max_len = max_seq_length
        # Fall back to the notebook-wide stopword set unless one is supplied.
        self.stop_words = stopwords if stop_words is None else stop_words

    def __len__(self):
        """Return the number of review files."""
        return len(self.training_files)

    def __getitem__(self, idx):
        """Return ``(sample, label)`` for the review at position ``idx``.

        ``sample`` is a 1-D ``torch.long`` tensor of at most ``max_seq_length``
        token indices; ``label`` is the int 0 (negative) or 1 (positive).
        """
        path_to_text = self.training_files[idx]
        with open(path_to_text, "r", encoding="utf-8") as f:
            text = f.read()

        # The file is already closed here; everything below works on the string.
        text = text.lower()
        text = re.sub(r"[^\w\s]", "", text)
        unk_index = self.tokenizer["<unk>"]
        tokenized = [
            self.tokenizer.get(word, unk_index)
            for word in text.split(" ")
            if word not in self.stop_words
        ]
        # Explicit integer dtype: these values are indices into an embedding table.
        sample = torch.tensor(tokenized, dtype=torch.long)

        if len(sample) > self.max_len:
            diff = len(sample) - self.max_len
            # randint's upper bound is exclusive, so use diff + 1 to make the
            # final window (start_idx == diff) reachable as well.
            start_idx = np.random.randint(0, diff + 1)
            sample = sample[start_idx : start_idx + self.max_len]

        # Decide the label from the containing folder only. A substring test on
        # the full path would mislabel positives whenever some ancestor
        # directory name happens to contain "neg".
        parent_folder = os.path.basename(os.path.dirname(path_to_text))
        label = 0 if parent_folder == "neg" else 1

        return sample, label


# Wrap the review file list and vocabulary in the dataset (default max sequence length).
dataset = IMDBDataset(training_files=training_files, word2index=word2index)


def data_collator(batch):
    """Collate ``(text, label)`` pairs into a single padded batch.

    Args:
        batch: Iterable of ``(text, label)`` pairs, where ``text`` is a tensor
            of token indices and ``label`` is its class.

    Returns:
        ``(texts, labels)``: the texts padded with ``word2index["<pad>"]`` up to
        the longest sequence in the batch (batch dimension first), and the
        labels gathered into one tensor.
    """
    pairs = list(batch)
    sequences = [tokens for tokens, _ in pairs]
    targets = torch.tensor([target for _, target in pairs])
    padded = nn.utils.rnn.pad_sequence(
        sequences,
        batch_first=True,
        padding_value=word2index["<pad>"],
    )
    return padded, targets


# Shuffled batches of 32 reviews, padded to a common length by data_collator.
dataloader = DataLoader(
    dataset,
    collate_fn=data_collator,
    batch_size=32,
    shuffle=True,
)

# Sanity check: print only the first batch, then stop iterating.
for batch in dataloader:
    print(batch)
    break


(tensor([[990, 990, 772,  ..., 991, 991, 991],
        [160, 745, 976,  ..., 990, 990, 583],
        [ 86, 990, 437,  ..., 991, 991, 991],
        ...,
        [723, 511, 990,  ..., 991, 991, 991],
        [990, 737,  35,  ..., 991, 991, 991],
        [ 84, 492, 408,  ..., 991, 991, 991]]), tensor([1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
        0, 1, 1, 1, 1, 1, 1, 0]))


## Embedding

In [43]:
# Toy lookup table: 5 vocabulary entries, each mapped to a 3-dimensional vector.
emb = nn.Embedding(num_embeddings=5, embedding_dim=3)

print("Embedding Weights")
print(emb.weight)

# A single sentence is a 1-D tensor of word indices -> one vector per word.
print("Embedding for Single Sentence")
sentence = torch.tensor([1, 3])
sentence_vectors = emb(sentence)
print(sentence_vectors)
print(sentence_vectors.shape)

# A batch of sentences is a 2-D tensor -> (batch, words, embedding_dim).
print("Embedding for Batch Sentence")
batch_sentences = torch.tensor([[1, 3]] * 3)
batch_vectors = emb(batch_sentences)
print(batch_vectors)
print(batch_vectors.shape)

Embedding Weights
Parameter containing:
tensor([[-0.5457, -0.4180,  0.4387],
        [-0.9984, -1.0818, -1.3813],
        [ 1.2098,  0.6644, -0.7961],
        [-0.2582, -1.6075,  2.0916],
        [-0.2454,  0.0422,  0.4284]], requires_grad=True)
Embedding for Single Sentence
tensor([[-0.9984, -1.0818, -1.3813],
        [-0.2582, -1.6075,  2.0916]], grad_fn=<EmbeddingBackward0>)
torch.Size([2, 3])
Embedding for Batch Sentence
tensor([[[-0.9984, -1.0818, -1.3813],
         [-0.2582, -1.6075,  2.0916]],

        [[-0.9984, -1.0818, -1.3813],
         [-0.2582, -1.6075,  2.0916]],

        [[-0.9984, -1.0818, -1.3813],
         [-0.2582, -1.6075,  2.0916]]], grad_fn=<EmbeddingBackward0>)
torch.Size([3, 2, 3])
