In [1]:
# ================================================
# BiLSTM + fastText Bangla in PyTorch
# ================================================
import os, re, numpy as np, pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


In [2]:

# -----------------------
# Config
# -----------------------
TRAIN_CSV = "Dataset_60_20_20/train.csv"
VAL_CSV   = "Dataset_60_20_20/validation.csv"
TEST_CSV  = "Dataset_60_20_20/test.csv"

EMBED_FILE = "cc.bn.300.vec"  # fastText Bangla
EMBED_DIM  = 300
MAX_LEN    = 300
BATCH_SIZE = 128
EPOCHS     = 50
LR         = 1e-3
SEED       = 42
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"

torch.manual_seed(SEED)
np.random.seed(SEED)


In [3]:

# -----------------------
# Load data
# -----------------------
train_df = pd.read_csv(TRAIN_CSV)
val_df   = pd.read_csv(VAL_CSV)
test_df  = pd.read_csv(TEST_CSV)

for d in (train_df, val_df, test_df):
    d["Summary"] = d["Summary"].astype(str)
    d["Genre"]   = d["Genre"].astype(str)

X_train, y_train = train_df["Summary"], train_df["Genre"]
X_val,   y_val   = val_df["Summary"],   val_df["Genre"]
X_test,  y_test  = test_df["Summary"],  test_df["Genre"]


In [4]:

# -----------------------
# Label encoding
# -----------------------
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc   = le.transform(y_val)
y_test_enc  = le.transform(y_test)
num_classes = len(le.classes_)
print("Classes:", list(le.classes_))


Classes: ['Adventure', 'Biography and Autobiography', 'Classic Novel', 'Classic Story', 'Contemporary Novel', 'Contemporary Story', 'Cooking, Food and Nutrition', 'History and Tradition', 'Math', 'Mystery', 'Philosophy', 'Politics', 'Religious', 'Sciene Fiction', 'Shishu Kishor', 'Thriller']


In [5]:

# -----------------------
# Tokenizer
# -----------------------
def tokenize(text):
    return re.findall(r'[\u0980-\u09FFA-Za-z0-9]+', str(text))

# Build vocab from training+val
word2idx = {"<PAD>":0, "<UNK>":1}
for txt in pd.concat([X_train, X_val]):
    for tok in tokenize(txt):
        if tok not in word2idx:
            word2idx[tok] = len(word2idx)
idx2word = {i:w for w,i in word2idx.items()}
vocab_size = len(word2idx)
print("Vocab size:", vocab_size)

def encode(text, max_len=MAX_LEN):
    tokens = tokenize(text)
    ids = [word2idx.get(t, 1) for t in tokens]
    return torch.tensor(ids[:max_len], dtype=torch.long)


Vocab size: 188874


In [6]:

# -----------------------
# Dataset class
# -----------------------
class TextDataset(Dataset):
    def __init__(self, texts, labels=None):
        self.texts = texts
        self.labels = labels
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        x = encode(self.texts.iloc[idx])
        if self.labels is not None:
            y = torch.tensor(self.labels[idx], dtype=torch.long)
            return x, y
        else:
            return x

def collate_batch(batch):
    xs, ys = zip(*batch)
    xs_pad = pad_sequence(xs, batch_first=True, padding_value=0)
    ys = torch.stack(ys)
    return xs_pad, ys

train_ds = TextDataset(X_train, y_train_enc)
val_ds   = TextDataset(X_val, y_val_enc)
test_ds  = TextDataset(X_test, y_test_enc)

train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_dl   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
test_dl  = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)


In [7]:

# -----------------------
# Load fastText embeddings
# -----------------------
def load_embeddings(path, embed_dim):
    embeddings_index = {}
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if len(parts) < embed_dim + 1:  # skip header line
                continue
            word = parts[0]
            try:
                vec = np.asarray(parts[1:1+embed_dim], dtype='float32')
                embeddings_index[word] = vec
            except:
                continue
    return embeddings_index

print("Loading embeddings...")
embeddings_index = load_embeddings(EMBED_FILE, EMBED_DIM)
embedding_matrix = np.random.normal(0, 0.05, (vocab_size, EMBED_DIM)).astype('float32')
hits = 0
for word, idx in word2idx.items():
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[idx] = vec
        hits += 1
print(f"Embedding coverage: {hits}/{vocab_size} = {hits/vocab_size:.2%}")


Loading embeddings...
Embedding coverage: 130530/188874 = 69.11%


In [8]:

# -----------------------
# Model
# -----------------------
class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        if embeddings is not None:
            self.embedding.weight.data.copy_(torch.tensor(embeddings))
            self.embedding.weight.requires_grad = False  # freeze; set True to fine-tune
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(hidden_dim*2, 256)
        self.bn = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        emb = self.embedding(x)
        _, (h, _) = self.lstm(emb)  # h shape: (2, B, H)
        h_cat = torch.cat((h[0], h[1]), dim=1)  # (B, 2H)
        out = self.dropout(h_cat)
        out = self.fc1(out)
        out = self.bn(out)
        out = torch.relu(out)
        out = self.dropout(out)
        return self.fc2(out)

model = BiLSTMClassifier(vocab_size, EMBED_DIM, 128, num_classes, embedding_matrix).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)


In [9]:

# -----------------------
# Training loop
# -----------------------
best_val_acc = 0
patience, wait = 3, 0

for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0
    for xb, yb in tqdm(train_dl, desc=f"Epoch {epoch} [Train]"):
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch} Train Loss: {total_loss/len(train_dl):.4f}")

    # Validation
    model.eval()
    all_preds, all_true = [], []
    with torch.no_grad():
        for xb, yb in val_dl:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            out = model(xb)
            preds = out.argmax(1)
            all_preds.extend(preds.cpu().numpy())
            all_true.extend(yb.cpu().numpy())
    val_acc = accuracy_score(all_true, all_preds)
    print(f"Epoch {epoch} Val Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        wait = 0
        torch.save(model.state_dict(), "best_bilstm_fasttext.pt")
        print("Saved best model.")
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping.")
            break


Epoch 1 [Train]: 100%|██████████| 122/122 [00:04<00:00, 27.90it/s]


Epoch 1 Train Loss: 2.0402
Epoch 1 Val Acc: 0.4363
Saved best model.


Epoch 2 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.02it/s]


Epoch 2 Train Loss: 1.6875
Epoch 2 Val Acc: 0.4423
Saved best model.


Epoch 3 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.56it/s]


Epoch 3 Train Loss: 1.5754
Epoch 3 Val Acc: 0.4995
Saved best model.


Epoch 4 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.74it/s]


Epoch 4 Train Loss: 1.4836
Epoch 4 Val Acc: 0.4882


Epoch 5 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.16it/s]


Epoch 5 Train Loss: 1.4411
Epoch 5 Val Acc: 0.5001
Saved best model.


Epoch 6 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.84it/s]


Epoch 6 Train Loss: 1.3900
Epoch 6 Val Acc: 0.5184
Saved best model.


Epoch 7 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.31it/s]


Epoch 7 Train Loss: 1.3345
Epoch 7 Val Acc: 0.5059


Epoch 8 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.50it/s]


Epoch 8 Train Loss: 1.3022
Epoch 8 Val Acc: 0.5350
Saved best model.


Epoch 9 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.98it/s]


Epoch 9 Train Loss: 1.2715
Epoch 9 Val Acc: 0.5323


Epoch 10 [Train]: 100%|██████████| 122/122 [00:04<00:00, 28.50it/s]


Epoch 10 Train Loss: 1.2227
Epoch 10 Val Acc: 0.5486
Saved best model.


Epoch 11 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.81it/s]


Epoch 11 Train Loss: 1.2236
Epoch 11 Val Acc: 0.5365


Epoch 12 [Train]: 100%|██████████| 122/122 [00:04<00:00, 28.71it/s]


Epoch 12 Train Loss: 1.1648
Epoch 12 Val Acc: 0.5490
Saved best model.


Epoch 13 [Train]: 100%|██████████| 122/122 [00:04<00:00, 28.48it/s]


Epoch 13 Train Loss: 1.1423
Epoch 13 Val Acc: 0.5589
Saved best model.


Epoch 14 [Train]: 100%|██████████| 122/122 [00:04<00:00, 28.06it/s]


Epoch 14 Train Loss: 1.1148
Epoch 14 Val Acc: 0.5602
Saved best model.


Epoch 15 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.25it/s]


Epoch 15 Train Loss: 1.1015
Epoch 15 Val Acc: 0.5238


Epoch 16 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.09it/s]


Epoch 16 Train Loss: 1.0706
Epoch 16 Val Acc: 0.5554


Epoch 17 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.45it/s]


Epoch 17 Train Loss: 1.0494
Epoch 17 Val Acc: 0.5623
Saved best model.


Epoch 18 [Train]: 100%|██████████| 122/122 [00:04<00:00, 27.05it/s]


Epoch 18 Train Loss: 1.0263
Epoch 18 Val Acc: 0.5579


Epoch 19 [Train]: 100%|██████████| 122/122 [00:04<00:00, 28.81it/s]


Epoch 19 Train Loss: 1.0043
Epoch 19 Val Acc: 0.5610


Epoch 20 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.28it/s]


Epoch 20 Train Loss: 0.9849
Epoch 20 Val Acc: 0.5773
Saved best model.


Epoch 21 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.25it/s]


Epoch 21 Train Loss: 0.9543
Epoch 21 Val Acc: 0.5772


Epoch 22 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.57it/s]


Epoch 22 Train Loss: 0.9342
Epoch 22 Val Acc: 0.5646


Epoch 23 [Train]: 100%|██████████| 122/122 [00:04<00:00, 29.11it/s]


Epoch 23 Train Loss: 0.9378
Epoch 23 Val Acc: 0.5762
Early stopping.


In [10]:

# -----------------------
# Test Evaluation
# -----------------------
model.load_state_dict(torch.load("best_bilstm_fasttext.pt"))
model.eval()
all_preds, all_true = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        out = model(xb)
        preds = out.argmax(1)
        all_preds.extend(preds.cpu().numpy())
        all_true.extend(yb.cpu().numpy())

acc = accuracy_score(all_true, all_preds)
f1m = f1_score(all_true, all_preds, average="macro")
f1w = f1_score(all_true, all_preds, average="weighted")
print("\n==== TEST RESULTS ====")
print("Accuracy   :", f"{acc:.4f}")
print("Macro F1   :", f"{f1m:.4f}")
print("Weighted F1:", f"{f1w:.4f}")
print("\nClassification Report:")
print(classification_report(all_true, all_preds, target_names=le.classes_, digits=4))



==== TEST RESULTS ====
Accuracy   : 0.5717
Macro F1   : 0.4923
Weighted F1: 0.5584

Classification Report:
                             precision    recall  f1-score   support

                  Adventure     0.4490    0.3099    0.3667        71
Biography and Autobiography     0.6953    0.6135    0.6518      1123
              Classic Novel     0.1815    0.1960    0.1885       250
              Classic Story     0.1538    0.0164    0.0296       122
         Contemporary Novel     0.4820    0.8045    0.6028       931
         Contemporary Story     0.6434    0.3481    0.4518       451
Cooking, Food and Nutrition     0.9259    0.6410    0.7576        39
      History and Tradition     0.6947    0.6980    0.6963       639
                       Math     0.9630    0.9123    0.9369        57
                    Mystery     0.4900    0.3451    0.4050       142
                 Philosophy     0.6000    0.4000    0.4800       135
                   Politics     0.4444    0.2981    0.3569     