<a href="https://colab.research.google.com/github/Hamza-Ali0237/PyTorch-Projects/blob/main/Beginner/PyTorch-Text-Classification-RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install torchmetrics datasets



In [9]:
!pip install --upgrade datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [1]:
# Import Libraries
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from datasets import load_dataset

from torchmetrics import Accuracy, Precision, Recall, F1Score

from collections import Counter
import re

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Set GPU
# Use CUDA when the runtime exposes it; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [3]:
# Load Dataset
# Fetch the "imdb" dataset through the `datasets` library. The result is
# indexed by split name in later cells (e.g. dataset['train']); the run
# recorded below generated train (25,000), test (25,000) and
# unsupervised (50,000) examples.
dataset = load_dataset("imdb")

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
# Tokenization and Vocab Building

## Basic Word-Level Tokenizer
def tokenizer(text):
  """Split `text` into lowercase word tokens.

  The text is lowercased, then every maximal run of word characters
  (letters, digits, underscore) is returned in order of appearance.
  Punctuation and whitespace act only as separators and are dropped.
  """
  return re.findall(r'\b\w+\b', text.lower())

## Build Vocabulary From Training Data
# Count every token seen in the training split; only the set of distinct
# tokens (in first-seen order) is used below to assign IDs.
counter = Counter()
for sample in dataset['train']:
  counter.update(tokenizer(sample['text']))

## Create Vocab Dict With <pad> and <unk> Tokens
# IDs 0 and 1 are reserved for padding and out-of-vocabulary tokens,
# so real words are numbered starting from 2.
vocab = {'<pad>': 0, '<unk>': 1}
vocab.update({word: idx for idx, word in enumerate(counter, start = 2)})

# Function To Convert Text To List Of Token IDs
def encode(text):
  """Convert `text` into a list of integer token IDs.

  The text is tokenized with `tokenizer`; each token is looked up in the
  module-level `vocab`, and tokens missing from it map to the '<unk>' ID.
  """
  ids = []
  for tok in tokenizer(text):
    ids.append(vocab.get(tok, vocab['<unk>']))
  return ids

In [9]:
# Custom Dataset Class
class IMDBDataset(Dataset):
  """Map-style dataset yielding (token-ID tensor, label) pairs for one split.

  Args:
    dataset: Mapping from split name to an indexable collection of rows,
      where each row is a dict with 'text' and 'label' keys.
    split: Name of the split to expose (e.g. "train" or "test").
  """

  def __init__(self, dataset, split):
    self.data = dataset[split]

  def __len__(self):
    """Return the number of rows in the selected split."""
    return len(self.data)

  def __getitem__(self, index):
    """Return `(encoded, label)` for the row at `index`.

    `encoded` is a 1-D `torch.long` tensor of token IDs produced by
    `encode`; `label` is returned exactly as stored in the row.
    """
    # Fetch the row once: every `self.data[index]` access materialises the
    # whole record, so indexing separately for text and label doubles the work.
    row = self.data[index]
    encoded = torch.tensor(encode(row['text']), dtype = torch.long)
    return encoded, row['label']

In [8]:
# Collate Function For DataLoader
def collate_fn(batch):
  """Collate `(sequence, label)` pairs into padded batch tensors.

  Args:
    batch: Iterable of `(sequence, label)` pairs, where each sequence is a
      1-D tensor of token IDs.

  Returns:
    A tuple `(padded, labels, lengths)`: the sequences right-padded with 0
    to the longest one (batch-first), the labels as a tensor, and the
    original (unpadded) length of every sequence as a tensor.
  """
  sequences, targets = zip(*batch)
  # Record true lengths before padding so the padding can be ignored later.
  seq_lens = list(map(len, sequences))
  padded = pad_sequence(sequences, batch_first = True, padding_value = 0)
  return padded, torch.tensor(targets), torch.tensor(seq_lens)

In [12]:
# Create DataLoaders
# One batch size shared by both loaders, so it can be tuned in a single place.
BATCH_SIZE = 32

train_dataset = IMDBDataset(dataset, "train")
test_dataset = IMDBDataset(dataset, "test")

# Only the training loader shuffles; the test loader keeps a fixed order.
# `collate_fn` pads each batch of variable-length sequences.
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True, collate_fn = collate_fn)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle = False, collate_fn = collate_fn)

In [15]:
# RNN Model Class
class RNNModel(nn.Module):
  """Single-layer RNN binary classifier over padded batches of token IDs.

  Pipeline: embedding -> nn.RNN (batch-first) -> linear layer with one
  output unit -> sigmoid, so the model emits probabilities, not logits.

  Args:
    vocab_size: Number of rows in the embedding table.
    embed_dim: Size of each token embedding.
    hidden_dim: Size of the RNN hidden state.
  """

  def __init__(self, vocab_size, embed_dim, hidden_dim):
    super().__init__()

    # padding_idx = 0 keeps the embedding of ID 0 (padding) fixed at zeros.
    self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx = 0)
    self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first = True)
    self.fc = nn.Linear(hidden_dim, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, text, lengths):
    """Return one probability per sequence in the batch.

    Args:
      text: Integer tensor of token IDs, shape (batch, seq_len), padded with 0.
      lengths: Tensor holding the unpadded length of every sequence in
        `text`. It is moved to the CPU here, as packing requires.

    Returns:
      Tensor of shape (batch, 1) with values in [0, 1].
    """
    # The parameter used to be misspelled `lenght` while the body read
    # `lengths`, so the argument was ignored and the call either raised
    # NameError or silently used a stale notebook global.
    embedded = self.embedding(text)
    # Packing lets the RNN stop at each sequence's true end instead of
    # running over padding; enforce_sorted = False accepts any batch order.
    packed = pack_padded_sequence(embedded, lengths.cpu(), batch_first = True, enforce_sorted = False)
    _, hidden = self.rnn(packed)
    # hidden has a leading num_layers axis of size 1; drop it before the classifier.
    out = self.sigmoid(self.fc(hidden.squeeze(0)))
    return out

In [17]:
# Initialize Model, Loss, Optimizer
# Hyperparameters: the embedding table needs one row per vocabulary entry
# (including the <pad> and <unk> entries).
VOCAB_SIZE = len(vocab)
EMBED_DIM, HIDDEN_DIM = 64, 128

model = RNNModel(
    vocab_size = VOCAB_SIZE,
    embed_dim = EMBED_DIM,
    hidden_dim = HIDDEN_DIM,
)
model = model.to(device)

# BCELoss expects probabilities, matching the sigmoid at the model's output.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params = model.parameters(), lr = 1e-3)