# BiLSTM for Hierarchical Text Classification

This notebook uses a **Bidirectional LSTM** model with **multi-output heads** to perform hierarchical classification. It's designed to predict all levels (e.g., category → subcategory) in a single pass.


In [3]:
!pip install -q transformers torch datasets accelerate scikit-learn

In [46]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
from datasets import load_dataset
import numpy as np
import random
from sklearn.preprocessing import LabelEncoder

## 🔹 Dataset: Amazon Product Hierarchy

Amazon product reviews are often categorized hierarchically: e.g.,

- Level 1: Electronics, Clothing, Home, etc.

- Level 2: Within Electronics — Phones, Laptops, Accessories, etc.

In [None]:
# Load Amazon Reviews dataset (subset)
dataset = load_dataset("amazon_polarity", split='train[:5%]')  # smaller subset for demo

# For demo, simulate hierarchy:
# Level 1: 'Electronics' vs 'Clothing' (just random assign)
# Level 2: 3 subcategories per Level 1 (random)
level1_classes = ['Electronics', 'Clothing']
level2_classes = {
    'Electronics': ['Phones', 'Laptops', 'Accessories'],
    'Clothing': ['Men', 'Women', 'Kids']
}

## 🔹 Preprocessing

- Tokenization
- Vocabulary building
- Padding / Truncation
- Label encoding for multiple levels

In [None]:
# Randomly assign hierarchical labels
def assign_hierarchy(example):
    lvl1 = random.choice(level1_classes)
    lvl2 = random.choice(level2_classes[lvl1])
    example['level1'] = lvl1
    example['level2'] = lvl2
    return example

dataset = dataset.map(assign_hierarchy)

In [None]:
# Encode labels
le_lvl1 = LabelEncoder()
le_lvl2 = LabelEncoder()

dataset = dataset.map(lambda x: {
    'level1_enc': le_lvl1.fit_transform([x['level1']])[0],
    'level2_enc': le_lvl2.fit_transform([x['level2']])[0]
})

In [None]:
# For tokenization, use simple tokenizer (or use BERT tokenizer)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

MAX_LEN = 128

class AmazonDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]['content']
        lvl1 = self.data[idx]['level1_enc']
        lvl2 = self.data[idx]['level2_enc']

        enc = self.tokenizer(text, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors='pt')

        input_ids = enc['input_ids'].squeeze(0)
        attention_mask = enc['attention_mask'].squeeze(0)

        return input_ids, attention_mask, torch.tensor(lvl1), torch.tensor(lvl2)


In [57]:
# Prepare dataset and dataloader
train_size = int(0.8 * len(dataset))
train_data = dataset.select(range(train_size))
val_data = dataset.select(range(train_size, len(dataset)))

train_dataset = AmazonDataset(train_data, tokenizer)
val_dataset = AmazonDataset(val_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

## 🔹 BiLSTM Model Architecture

- Embedding Layer (Glove or learned)
- BiLSTM Layer
- Multiple Dense heads with softmax (Level-wise outputs)
- Loss: Combined categorical cross-entropy across heads


In [58]:
# Define BiLSTM multi-output model
class BiLSTM_MultiOutput(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim_lvl1, output_dim_lvl2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.bilstm = nn.LSTM(embed_dim, hidden_dim, bidirectional=True, batch_first=True)

        # Two classification heads
        self.classifier_lvl1 = nn.Linear(hidden_dim * 2, output_dim_lvl1)
        self.classifier_lvl2 = nn.Linear(hidden_dim * 2, output_dim_lvl2)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids)
        packed_output, (hidden, cell) = self.bilstm(embedded)
        # Concatenate final forward and backward hidden states
        hidden_cat = torch.cat((hidden[-2], hidden[-1]), dim=1)

        out_lvl1 = self.classifier_lvl1(hidden_cat)
        out_lvl2 = self.classifier_lvl2(hidden_cat)

        return out_lvl1, out_lvl2


In [59]:
# Hyperparameters
VOCAB_SIZE = tokenizer.vocab_size
EMBED_DIM = 128
HIDDEN_DIM = 64
OUTPUT_DIM_LVL1 = len(le_lvl1.classes_)
OUTPUT_DIM_LVL2 = len(le_lvl2.classes_)

model = BiLSTM_MultiOutput(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, OUTPUT_DIM_LVL1, OUTPUT_DIM_LVL2)

# Loss & optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training loop (simplified)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [61]:
from tqdm import tqdm

EPOCHS = 1

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    loop = tqdm(train_loader, leave=True)
    for i, (input_ids, attention_mask, lvl1_labels, lvl2_labels) in enumerate(loop, 1):
        input_ids = input_ids.to(device)
        lvl1_labels = lvl1_labels.to(device)
        lvl2_labels = lvl2_labels.to(device)

        optimizer.zero_grad()

        out_lvl1, out_lvl2 = model(input_ids, attention_mask)

        loss1 = criterion(out_lvl1, lvl1_labels)
        loss2 = criterion(out_lvl2, lvl2_labels)
        loss = loss1 + loss2

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Use i instead of loop.n to avoid zero division
        loop.set_description(f"Epoch {epoch+1}/{EPOCHS}")
        loop.set_postfix(loss=total_loss/i)

    print(f"Epoch {epoch+1} completed. Average Loss: {total_loss/len(train_loader):.4f}")


Epoch 1/1: 100%|██████████| 4500/4500 [05:22<00:00, 13.96it/s, loss=0]


Epoch 1 completed. Average Loss: 0.0000
