# 2️⃣ BiLSTM + GloVe on AG News

### 🧠 Model: BiLSTM + GloVe
**Dataset:** AG News  
**Classes:** 4  
**Technique:** This model uses GloVe pretrained word embeddings to represent input tokens, followed by a Bidirectional LSTM that captures forward and backward context.  
A MaxPooling layer condenses temporal features into a fixed-size vector before classification.


In [None]:
def load_kaggle_dataset(dataset_path, unzip=True):
    """
    Load a Kaggle dataset in Google Colab.

    Args:
        dataset_path (str): The Kaggle dataset path in the form 'username/dataset-name'
        unzip (bool): Whether to unzip the dataset after downloading (default: True)

    Returns:
        str: The path to the dataset folder
    """
    import os
    import zipfile

    # Make directory and move kaggle.json
    os.makedirs("/root/.kaggle", exist_ok=True)
    if not os.path.exists("/root/.kaggle/kaggle.json"):
        from google.colab import files
        print("Please upload your kaggle.json file")
        files.upload()
        os.rename("kaggle.json", "/root/.kaggle/kaggle.json")
    os.chmod("/root/.kaggle/kaggle.json", 600)

    # Install kaggle if needed
    # !pip install -q kaggle

    # Download dataset
    dataset_dir = dataset_path.split("/")[-1]
    !kaggle datasets download -d {dataset_path} -p data/

    # Unzip if needed
    if unzip:
        with zipfile.ZipFile(f"data/{dataset_dir}.zip", "r") as zip_ref:
            zip_ref.extractall("data/")
        print(f"✅ Dataset extracted to: data/{dataset_dir}")
        return f"data/{dataset_dir}"
    else:
        print(f"📦 Dataset zip saved to: data/{dataset_dir}.zip")
        return f"data/{dataset_dir}.zip"


In [None]:
# Kaggle dataset identifier in "<owner>/<dataset-name>" form for the AG News data.
dataset_path = "amananandrai/ag-news-classification-dataset"
# Download (and, by default, unzip) the dataset; keep the path string the helper returns.
dataset_folder = load_kaggle_dataset(dataset_path)

Please upload your kaggle.json file


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset
License(s): unknown
Downloading ag-news-classification-dataset.zip to data
  0% 0.00/11.4M [00:00<?, ?B/s]
100% 11.4M/11.4M [00:00<00:00, 1.03GB/s]
✅ Dataset extracted to: data/ag-news-classification-dataset


# BiLSTM + GloVe 

In [None]:
# !pip install nltk --quiet

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
# word_tokenize relies on the Punkt tokenizer data. Recent NLTK releases look
# it up under "punkt_tab" while older ones use "punkt", so fetch both to keep
# the notebook runnable on either.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# NOTE: this absolute path is the Kaggle-notebook input mount; adjust it if the
# data was fetched somewhere else (e.g. with load_kaggle_dataset).
raw_df = pd.read_csv("/kaggle/input/ag-news-classification-dataset/train.csv", header=None)
raw_df.columns = ['label', 'title', 'description']

# Because the file is read with header=None, a header line in the CSV (if this
# copy has one - TODO confirm) would be treated as an ordinary sample with a
# non-numeric label. Coerce labels to numbers and drop rows that are not valid
# class ids, and replace missing text fields with empty strings so that
# lower-casing/tokenising cannot hit a NaN.
df = (
    raw_df
    .assign(label=pd.to_numeric(raw_df['label'], errors='coerce'))
    .dropna(subset=['label'])
    .astype({'label': int})
    .fillna({'title': "", 'description': ""})
    .reset_index(drop=True)
)

# Title and description are concatenated into a single input document.
df['text'] = df['title'] + " " + df['description']
texts = df['text'].tolist()
labels = df['label'].tolist()

In [None]:
# Lower-case each document and split it into word tokens with NLTK.
tokenized_texts = []
for document in texts:
    tokenized_texts.append(word_tokenize(document.lower()))

In [None]:
# Build vocab: every distinct token gets an id starting at 2, in first-seen
# order; ids 0 and 1 are reserved for the padding and unknown-word markers.
from collections import Counter

word_counts = Counter()
for sent in tokenized_texts:
    word_counts.update(sent)

vocab = {token: token_id for token_id, token in enumerate(word_counts, start=2)}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1

In [None]:
def encode_text(text, vocab, max_len=100):
    """Map a token sequence to a list of vocabulary ids of length ``max_len``.

    Args:
        text: Iterable of tokens.
        vocab: Mapping from token to integer id; must contain "<UNK>" (used for
            tokens missing from the mapping) and "<PAD>" (used as filler).
        max_len: Target length of the returned list.

    Returns:
        list: Token ids, right-padded with the "<PAD>" id when the input is
        shorter than ``max_len`` and cut off at ``max_len`` otherwise.
    """
    ids = []
    for token in text:
        ids.append(vocab.get(token, vocab["<UNK>"]))

    missing = max_len - len(ids)
    if missing > 0:
        # Too short: append padding ids on the right.
        return ids + [vocab["<PAD>"]] * missing
    # Long enough: keep only the leading max_len ids.
    return ids[:max_len]

# Turn every tokenised document into a fixed-length list of vocabulary ids.
X = []
for tokens in tokenized_texts:
    X.append(encode_text(tokens, vocab))

# Map the raw labels onto consecutive integers 0..n_classes-1.
le = LabelEncoder()
y = le.fit_transform(labels)

# Hold out 10% of the samples for validation; the fixed seed keeps the split stable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.1,
)

In [None]:
class NewsDataset(Dataset):
    """Dataset pairing encoded documents with their class labels.

    Both inputs are copied into ``torch.long`` tensors when the dataset is
    created; indexing returns an ``(inputs, label)`` tuple taken from them.
    """

    def __init__(self, X, y):
        # Integer ids are needed both for embedding lookup and as loss targets.
        as_long = {"dtype": torch.long}
        self.X = torch.tensor(X, **as_long)
        self.y = torch.tensor(y, **as_long)

    def __len__(self):
        """Number of samples, i.e. the length of the input tensor."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(inputs, label)`` pair stored at ``idx``."""
        inputs = self.X[idx]
        label = self.y[idx]
        return inputs, label

# Wrap both splits as datasets.
train_ds, val_ds = NewsDataset(X_train, y_train), NewsDataset(X_val, y_val)

# Batches of 64; only the training data is reshuffled each epoch.
train_loader = DataLoader(dataset=train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=64, shuffle=False)

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2025-05-29 10:20:05--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-05-29 10:20:06--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-05-29 10:20:06--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2

In [None]:
def load_glove(file_path="glove.6B.100d.txt"):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-empty line is split on whitespace: the first field is the word
    and the remaining fields are parsed as the float32 vector components.

    Args:
        file_path: Path to the UTF-8 encoded GloVe text file.

    Returns:
        dict: Maps each word to a 1-D ``numpy.ndarray`` of dtype float32.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank or whitespace-only line (e.g. a trailing newline at the
                # end of the file): nothing to parse, and values[0] would fail.
                continue
            word = values[0]
            embeddings[word] = np.asarray(values[1:], dtype='float32')
    return embeddings

glove = load_glove()
embedding_dim = 100

# Words without a GloVe vector start from small Gaussian noise. A seeded
# generator (same seed as the train/validation split) makes this
# initialisation, and therefore the run, repeatable.
rng = np.random.default_rng(42)
embedding_weights = rng.normal(scale=0.6, size=(len(vocab), embedding_dim))

# Overwrite the rows of all words GloVe knows with their pretrained vectors.
for word, idx in vocab.items():
    if word in glove:
        embedding_weights[idx] = glove[word]

# Padding carries no information, so its row starts at zero instead of noise.
embedding_weights[vocab["<PAD>"]] = 0.0

# Separate name for the NumPy array keeps this cell safe to re-run.
embedding_matrix = torch.tensor(embedding_weights, dtype=torch.float32)

In [None]:
class BiLSTMClassifier(nn.Module):
    """Pretrained embeddings -> bidirectional LSTM -> max-pool over time -> linear classifier.

    Args:
        vocab_size: Not used by the layers (the embedding table is sized by
            ``embedding_matrix``); kept so existing calls keep working.
        embed_dim: Input size of the LSTM; should equal the width of
            ``embedding_matrix``.
        hidden_dim: Hidden size of each LSTM direction.
        num_classes: Number of output logits.
        embedding_matrix: 2-D float tensor used to initialise the embedding
            layer, which is fine-tuned during training (``freeze=False``).
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, embedding_matrix):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        # Forward and backward features are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim*2, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for token ids ``x`` of shape (batch, seq_len)."""
        embedded = self.embedding(x)
        out, _ = self.lstm(embedded)
        # Max-pool over the time dimension. Taking only out[:, -1, :] would use
        # the backward direction's output after it has processed a single
        # (last, usually padding) token, discarding most of its context.
        pooled = out.max(dim=1).values
        return self.fc(pooled)

# Reuse embedding_dim and the fitted label encoder instead of repeating the
# literals 100 and 4, so the model stays in sync with the data preparation.
model = BiLSTMClassifier(
    vocab_size=len(vocab),
    embed_dim=embedding_dim,
    hidden_dim=128,
    num_classes=len(le.classes_),
    embedding_matrix=embedding_matrix,
)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

# Cross-entropy over the class logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Train loop
for epoch in range(5):
    model.train()
    total_loss = 0
    for X_batch, y_batch in tqdm(train_loader):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")

100%|██████████| 1688/1688 [00:15<00:00, 105.51it/s]


Epoch 1 Loss: 0.8333


100%|██████████| 1688/1688 [00:15<00:00, 109.50it/s]


Epoch 2 Loss: 0.2534


100%|██████████| 1688/1688 [00:15<00:00, 108.43it/s]


Epoch 3 Loss: 0.1711


100%|██████████| 1688/1688 [00:15<00:00, 107.40it/s]


Epoch 4 Loss: 0.1197


100%|██████████| 1688/1688 [00:15<00:00, 106.20it/s]

Epoch 5 Loss: 0.0822





In [None]:
# Accuracy on the validation split, with gradients disabled.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        outputs = model(X_batch)
        # The predicted class is the index of the largest logit.
        preds = outputs.argmax(dim=1)

        correct += preds.eq(y_batch).sum().item()
        total += y_batch.shape[0]

print(f"Validation Accuracy: {correct / total * 100:.2f}%")


Validation Accuracy: 91.58%
