In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from collections import Counter
import numpy as np

In [2]:
# 1. Load dataset and build vocab
dataset = load_dataset("imdb")

train_texts = dataset['train']['text']
train_labels = dataset['train']['label']

test_texts = dataset['test']['text']
test_labels = dataset['test']['label']

def build_vocab(texts, max_size=10000):
    words = []
    for text in texts:
        words.extend(text.lower().split())
    freq = Counter(words)
    vocab = {word: i+1 for i, (word, _) in enumerate(freq.most_common(max_size))}
    return vocab

vocab = build_vocab(train_texts)

In [3]:
# 2. Encode texts to fixed-length sequences
def encode(text, vocab, max_len=100):
    tokens = text.lower().split()
    idxs = [vocab.get(token, 0) for token in tokens]  # 0 for unknown words
    if len(idxs) < max_len:
        idxs += [0] * (max_len - len(idxs))
    else:
        idxs = idxs[:max_len]
    return idxs

In [4]:
# 3. Custom dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, vocab):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        x = torch.tensor(encode(self.texts[idx], self.vocab), dtype=torch.long)
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y

train_dataset = TextDataset(train_texts, train_labels, vocab)
test_dataset = TextDataset(test_texts, test_labels, vocab)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

In [5]:
# 4. Define model
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=50, hidden_dim=64, output_dim=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        out = self.dropout(out[:, -1, :])  # apply dropout on last hidden state
        out = self.fc(out)
        return out


model = RNNClassifier(vocab_size=len(vocab))

In [6]:
# 5. Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [7]:
# 6. Training loop
def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for x_batch, y_batch in loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    return total_loss / len(loader)

In [8]:
# 7. Evaluation function
def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for x_batch, y_batch in loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            total_loss += loss.item()
            
            preds = outputs.argmax(dim=1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)
    return total_loss / len(loader), correct / total

In [9]:
# 8. Run training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 50
for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, test_loader, criterion, device)
    print(f"Epoch {epoch+1}: Train loss={train_loss:.4f}, Val loss={val_loss:.4f}, Val acc={val_acc:.4f}")


Epoch 1: Train loss=0.6999, Val loss=0.6939, Val acc=0.5104
Epoch 2: Train loss=0.6918, Val loss=0.6928, Val acc=0.5088
Epoch 3: Train loss=0.6924, Val loss=0.6923, Val acc=0.5171
Epoch 4: Train loss=0.6894, Val loss=0.6934, Val acc=0.5291
Epoch 5: Train loss=0.6795, Val loss=0.6922, Val acc=0.5330
Epoch 6: Train loss=0.6635, Val loss=0.6770, Val acc=0.5831
Epoch 7: Train loss=0.6595, Val loss=0.6910, Val acc=0.5351
Epoch 8: Train loss=0.6658, Val loss=0.7007, Val acc=0.5666
Epoch 9: Train loss=0.6367, Val loss=0.6885, Val acc=0.5633
Epoch 10: Train loss=0.6374, Val loss=0.7011, Val acc=0.5349
Epoch 11: Train loss=0.6027, Val loss=0.7106, Val acc=0.5426
Epoch 12: Train loss=0.6000, Val loss=0.7009, Val acc=0.5878
Epoch 13: Train loss=0.5529, Val loss=0.6899, Val acc=0.6190
Epoch 14: Train loss=0.6025, Val loss=0.7231, Val acc=0.5209
Epoch 15: Train loss=0.5885, Val loss=0.6958, Val acc=0.6076
Epoch 16: Train loss=0.5840, Val loss=0.7303, Val acc=0.5466
Epoch 17: Train loss=0.5966, Val 

In [12]:
test_acc = evaluate(model, test_loader, criterion, device)
print(f"Test Accuracy: {round(test_acc[1] * 100, 2)}%")

Test Accuracy: 63.74%
