In [2]:
import os
import zipfile
import urllib.request

def download_glove(dest_path="glove.6B.100d.txt"):
    """Download the GloVe 6B archive and unpack the requested vector file.

    Does nothing except print a notice when ``dest_path`` already exists.
    Otherwise the archive is downloaded next to ``dest_path``, the member
    whose name equals the basename of ``dest_path`` is extracted into that
    same directory, and the archive is deleted.

    Args:
        dest_path: Where the extracted vector file is expected to end up.
            The basename selects which archive member is extracted; if the
            archive has no member with that name, every member is extracted
            into the directory of ``dest_path`` instead.

    Returns:
        None.
    """
    if os.path.exists(dest_path):
        print("🟢 GloVe already downloaded.")
        return

    print("⏬ Downloading GloVe embeddings...")
    url = "http://nlp.stanford.edu/data/glove.6B.zip"
    # Work in the directory of dest_path so the existence check above and the
    # extraction below agree on where the file lives.
    dest_dir = os.path.dirname(dest_path) or "."
    member = os.path.basename(dest_path)
    os.makedirs(dest_dir, exist_ok=True)
    zip_path = os.path.join(dest_dir, "glove.6B.zip")
    try:
        urllib.request.urlretrieve(url, zip_path)
        print("✅ Download complete. Extracting...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            if member in zip_ref.namelist():
                # Only unpack the file that was asked for.
                zip_ref.extract(member, dest_dir)
            else:
                zip_ref.extractall(dest_dir)
    finally:
        # Remove the archive even when the download or extraction failed, so
        # a partial file is never left behind.
        if os.path.exists(zip_path):
            os.remove(zip_path)
    print("📁 Extraction complete.")

# Fetch the embeddings with the default destination ("glove.6B.100d.txt");
# only a notice is printed when that file already exists.
download_glove()


⏬ Downloading GloVe embeddings...
✅ Download complete. Extracting...
📁 Extraction complete.


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from collections import Counter
import numpy as np
from tqdm import tqdm

# 1. Load IMDb dataset
# Fetch each split once, then pull the text and label columns out of it.
dataset = load_dataset("imdb")
train_split, test_split = dataset['train'], dataset['test']
train_texts, train_labels = train_split['text'], train_split['label']
test_texts, test_labels = test_split['text'], test_split['label']

# 2. Build vocab
def build_vocab(texts, max_size=100000):
    """Map the most frequent lower-cased, whitespace-split tokens to ids.

    Tokens are counted one text at a time, so the full token stream is never
    held in memory at once.

    Args:
        texts: Iterable of strings.
        max_size: Maximum number of tokens to keep (most frequent first).

    Returns:
        dict mapping token -> integer id. Ids start at 1 and follow
        descending frequency; id 0 is never assigned.
    """
    freq = Counter()
    for text in texts:
        freq.update(text.lower().split())
    return {
        word: rank
        for rank, (word, _) in enumerate(freq.most_common(max_size), start=1)
    }

# The vocabulary is built from the training texts only.
vocab = build_vocab(train_texts)
# Number of tokens kept. Ids start at 1, so a lookup table indexed by id needs
# vocab_size + 1 rows (load_glove allocates len(vocab) + 1).
vocab_size = len(vocab)

# 3. Encode text
def encode(text, vocab, max_len=300):
    """Convert ``text`` into a list of exactly ``max_len`` token ids.

    The text is lower-cased and split on whitespace. Tokens missing from
    ``vocab`` become 0, tokens beyond ``max_len`` are dropped, and shorter
    sequences are right-padded with 0.
    """
    kept_tokens = text.lower().split()[:max_len]
    ids = []
    for token in kept_tokens:
        ids.append(vocab.get(token, 0))
    padding = [0] * (max_len - len(ids))
    return ids + padding

# Encode every review with encode's default max_len (300), so each entry is a
# fixed-length list of ids.
train_inputs = [encode(t, vocab) for t in train_texts]
test_inputs = [encode(t, vocab) for t in test_texts]

# 4. Custom dataset
class IMDbDataset(Dataset):
    """Index-aligned pairing of encoded inputs with their labels."""

    def __init__(self, inputs, labels):
        # Both sequences are stored as given; tensors are built on access.
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        # The size is defined by the inputs sequence.
        n_examples = len(self.inputs)
        return n_examples

    def __getitem__(self, idx):
        # Return the example at ``idx`` as an (input tensor, label tensor) pair.
        token_ids = torch.tensor(self.inputs[idx])
        target = torch.tensor(self.labels[idx])
        return token_ids, target

# Pair the encoded reviews with their labels.
train_dataset = IMDbDataset(train_inputs, train_labels)
test_dataset = IMDbDataset(test_inputs, test_labels)

# Batches of 64; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

# 5. Load GloVe embeddings
def load_glove(path, vocab, embed_dim=100):
    """Build an embedding matrix for ``vocab`` from a GloVe-style text file.

    Each usable line of the file is ``<token> <v1> ... <v_embed_dim>``
    separated by whitespace. Rows for tokens found in the file are set to the
    stored vector; all other rows (including row 0) keep their random
    initialization drawn uniformly from [-0.05, 0.05).

    Lines that do not have exactly ``embed_dim`` + 1 fields (blank lines,
    headers, vectors of a different width) are skipped rather than raising,
    and the number skipped is reported.

    Args:
        path: Path to the UTF-8 encoded vector file.
        vocab: Mapping token -> row index; rows 0..len(vocab) are allocated.
        embed_dim: Width of each embedding vector.

    Returns:
        torch.float32 tensor of shape (len(vocab) + 1, embed_dim).
    """
    # Allocate directly in float32: that is the dtype returned, and it halves
    # the memory of the intermediate array.
    embeddings = np.random.uniform(
        -0.05, 0.05, (len(vocab) + 1, embed_dim)
    ).astype(np.float32)
    found = 0
    skipped = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) != embed_dim + 1:
                skipped += 1
                continue
            idx = vocab.get(parts[0])
            if idx is None:
                continue
            embeddings[idx] = np.asarray(parts[1:], dtype=np.float32)
            found += 1
    if skipped:
        print(f"Skipped {skipped} lines that did not have {embed_dim + 1} fields.")
    print(f"Loaded {found} vectors from GloVe.")
    return torch.tensor(embeddings, dtype=torch.float32)

glove_path = "glove.6B.100d.txt"  # Ensure this file is downloaded in your working dir
# embed_dim sets the width of the matrix that each parsed vector is written into.
embedding_matrix = load_glove(glove_path, vocab, embed_dim=100)

# 6. RNN model with GloVe
class RNNClassifier(nn.Module):
    """Embedding -> (multi-layer, optionally bidirectional) RNN -> mean over time -> linear.

    The embedding layer is initialized from ``embedding_matrix`` and is not
    frozen. Dropout between stacked RNN layers is only enabled when
    ``num_layers`` is greater than 1; a separate dropout is applied to the
    pooled features before the output layer.
    """

    def __init__(self, embedding_matrix, hidden_dim=128, output_dim=2, num_layers=2, dropout=0.3, bidirectional=True):
        super().__init__()
        # The matrix must be 2-D: (number of rows, embedding width).
        _, embed_dim = embedding_matrix.shape
        if num_layers > 1:
            rnn_dropout = dropout
        else:
            rnn_dropout = 0.0
        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.rnn = nn.RNN(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            dropout=rnn_dropout,
            bidirectional=bidirectional,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        rnn_out, _ = self.rnn(embedded)
        # Average the RNN outputs over dim 1 (every position of the input,
        # as given, contributes equally).
        pooled = rnn_out.mean(dim=1)
        return self.fc(self.dropout(pooled))

# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Build the classifier with its default hyperparameters and move it to `device`.
model = RNNClassifier(embedding_matrix).to(device)

# 7. Training setup
# Adam updates everything returned by model.parameters(); the embedding layer
# is created with freeze=False, so the pretrained vectors are fine-tuned too.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# 8. Training loop
def train_epoch(model, loader, optimizer, criterion):
    """Run one training pass over ``loader`` and return the mean per-batch loss.

    Batches are moved to the device of the model's first parameter, so the
    function does not depend on a notebook-level ``device`` variable.

    Args:
        model: Module mapping an input batch to outputs accepted by ``criterion``.
        loader: Sized iterable of ``(inputs, labels)`` tensor batches.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar loss.

    Returns:
        Sum of the batch losses divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: If ``loader`` has no batches.
    """
    model.train()
    # None only for a model without parameters; batches are then left in place.
    first_param = next(model.parameters(), None)
    total_loss = 0
    for inputs, labels in tqdm(loader):
        if first_param is not None:
            inputs = inputs.to(first_param.device)
            labels = labels.to(first_param.device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, criterion):
    """Score ``model`` on ``loader`` without gradient tracking.

    Batches are moved to the device of the model's first parameter, so the
    function does not depend on a notebook-level ``device`` variable. The
    model is left in eval mode.

    Args:
        model: Module returning per-class scores of shape (batch, classes).
        loader: Sized iterable of ``(inputs, labels)`` batches exposing a
            ``dataset`` attribute with a length.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar loss.

    Returns:
        ``(mean_batch_loss, accuracy)``: the summed batch losses divided by
        ``len(loader)``, and the number of correct argmax predictions divided
        by ``len(loader.dataset)``.

    Raises:
        ZeroDivisionError: If the loader or its dataset is empty.
    """
    model.eval()
    # None only for a model without parameters; batches are then left in place.
    first_param = next(model.parameters(), None)
    total_loss, correct = 0, 0
    with torch.no_grad():
        for inputs, labels in loader:
            if first_param is not None:
                inputs = inputs.to(first_param.device)
                labels = labels.to(first_param.device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
    accuracy = correct / len(loader.dataset)
    return total_loss / len(loader), accuracy

# 9. Run training
# Ten epochs; after each one the model is scored with evaluate().
# NOTE(review): the numbers printed as "Val" are computed on test_loader, i.e.
# the test split, not a separate validation split. Confirm this is intended
# before using them to pick an epoch or tune hyperparameters.
for epoch in range(10):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, val_acc = evaluate(model, test_loader, criterion)
    print(f"Epoch {epoch+1}: Train loss={train_loss:.4f}, Val loss={val_loss:.4f}, Val acc={val_acc:.4f}")


Loaded 40268 vectors from GloVe.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 391/391 [00:01<00:00, 204.63it/s]


Epoch 1: Train loss=0.4927, Val loss=0.3719, Val acc=0.8466


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 391/391 [00:01<00:00, 250.73it/s]


Epoch 2: Train loss=0.2850, Val loss=0.3365, Val acc=0.8604


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 391/391 [00:01<00:00, 250.53it/s]


Epoch 3: Train loss=0.2489, Val loss=0.4284, Val acc=0.8450


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 391/391 [00:01<00:00, 249.84it/s]


Epoch 4: Train loss=0.1676, Val loss=0.5317, Val acc=0.8182


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 391/391 [00:01<00:00, 251.04it/s]


Epoch 5: Train loss=0.0750, Val loss=0.5588, Val acc=0.8535


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 391/391 [00:01<00:00, 248.50it/s]


Epoch 6: Train loss=0.0320, Val loss=0.7285, Val acc=0.8481


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 391/391 [00:01<00:00, 249.54it/s]


Epoch 7: Train loss=0.0158, Val loss=0.8216, Val acc=0.8386


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 391/391 [00:01<00:00, 249.06it/s]


Epoch 8: Train loss=0.0163, Val loss=0.9176, Val acc=0.8316


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 391/391 [00:01<00:00, 249.86it/s]


Epoch 9: Train loss=0.0201, Val loss=1.0831, Val acc=0.8323


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 391/391 [00:01<00:00, 250.39it/s]


Epoch 10: Train loss=0.0081, Val loss=0.8339, Val acc=0.8462
