In [11]:
# ======================================================
# Custom Neural Network (TextCNN + BiLSTM Fusion) in PyTorch
# ======================================================
import os, re, numpy as np, pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


In [12]:

# -----------------------
# Config
# -----------------------
TRAIN_CSV = "Dataset_60_20_20/train.csv"
VAL_CSV   = "Dataset_60_20_20/validation.csv"
TEST_CSV  = "Dataset_60_20_20/test.csv"

USE_PRETRAINED = True              # set False to train embeddings from scratch
EMBED_FILE     = "cc.bn.300.vec"   # fastText Bangla .vec (if USE_PRETRAINED=True)
EMBED_DIM      = 300               # 300 for fastText bn
MAX_LEN        = 350               # pad/truncate length
BATCH_SIZE     = 64
EPOCHS         = 50
LR             = 1e-3
PATIENCE       = 5                 # early stopping patience
SEED           = 42
DEVICE         = "cuda" if torch.cuda.is_available() else "cpu"

torch.manual_seed(SEED)
np.random.seed(SEED)


In [13]:

# -----------------------
# Load data
# -----------------------
train_df = pd.read_csv(TRAIN_CSV)
val_df   = pd.read_csv(VAL_CSV)
test_df  = pd.read_csv(TEST_CSV)

for d in (train_df, val_df, test_df):
    d["Summary"] = d["Summary"].astype(str)
    d["Genre"]   = d["Genre"].astype(str)

X_train, y_train = train_df["Summary"], train_df["Genre"]
X_val,   y_val   = val_df["Summary"],   val_df["Genre"]
X_test,  y_test  = test_df["Summary"],  test_df["Genre"]


In [14]:

# -----------------------
# Label encoding
# -----------------------
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc   = le.transform(y_val)
y_test_enc  = le.transform(y_test)
NUM_CLASSES = len(le.classes_)
print("Classes:", list(le.classes_))


Classes: ['Adventure', 'Biography and Autobiography', 'Classic Novel', 'Classic Story', 'Contemporary Novel', 'Contemporary Story', 'Cooking, Food and Nutrition', 'History and Tradition', 'Math', 'Mystery', 'Philosophy', 'Politics', 'Religious', 'Sciene Fiction', 'Shishu Kishor', 'Thriller']


In [15]:

# -----------------------
# Tokenization & Vocab
# -----------------------
TOKEN_RE = r'[\u0980-\u09FFA-Za-z0-9]+'

def tokenize(text):
    return re.findall(TOKEN_RE, str(text))

# build vocab on train+val
word2idx = {"<PAD>":0, "<UNK>":1}
for txt in pd.concat([X_train, X_val], axis=0):
    for tok in tokenize(txt):
        if tok not in word2idx:
            word2idx[tok] = len(word2idx)
idx2word = {i:w for w,i in word2idx.items()}
VOCAB_SIZE = len(word2idx)
print("Vocab size:", VOCAB_SIZE)

def encode(text, max_len=MAX_LEN):
    ids = [word2idx.get(t, 1) for t in tokenize(text)]
    ids = ids[:max_len]
    return torch.tensor(ids, dtype=torch.long)


Vocab size: 188874


In [16]:

# -----------------------
# Dataset / DataLoader
# -----------------------
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts.reset_index(drop=True)
        self.labels = labels
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        x = encode(self.texts.iloc[idx])
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y

def collate(batch):
    xs, ys = zip(*batch)
    xs_pad = pad_sequence(xs, batch_first=True, padding_value=0)
    # clip/pad to MAX_LEN to keep tensors aligned
    if xs_pad.size(1) < MAX_LEN:
        pad_amt = MAX_LEN - xs_pad.size(1)
        xs_pad = F.pad(xs_pad, (0, pad_amt), value=0)
    elif xs_pad.size(1) > MAX_LEN:
        xs_pad = xs_pad[:, :MAX_LEN]
    return xs_pad, torch.stack(ys)

train_dl = DataLoader(TextDataset(X_train, y_train_enc), batch_size=BATCH_SIZE, shuffle=True,  collate_fn=collate)
val_dl   = DataLoader(TextDataset(X_val,   y_val_enc),   batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate)
test_dl  = DataLoader(TextDataset(X_test,  y_test_enc),  batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate)


In [17]:

# -----------------------
# Load fastText embeddings (.vec)
# -----------------------
embedding_matrix = None
if USE_PRETRAINED:
    def load_embeddings_txt(path, embed_dim):
        emb = {}
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                parts = line.rstrip().split(' ')
                if len(parts) < embed_dim + 1:  # skip header
                    continue
                w = parts[0]
                try:
                    v = np.asarray(parts[1:1+embed_dim], dtype='float32')
                    emb[w] = v
                except:
                    continue
        return emb

    print("Loading embeddings from:", EMBED_FILE)
    emb_index = load_embeddings_txt(EMBED_FILE, EMBED_DIM)
    embedding_matrix = np.random.normal(0, 0.05, (VOCAB_SIZE, EMBED_DIM)).astype('float32')
    # make PAD = 0 vector
    embedding_matrix[0] = 0.0
    hits = 0
    for w, idx in word2idx.items():
        vec = emb_index.get(w)
        if vec is not None and len(vec) == EMBED_DIM:
            embedding_matrix[idx] = vec
            hits += 1
    print(f"Pretrained coverage: {hits}/{VOCAB_SIZE} = {hits/VOCAB_SIZE:.2%}")


Loading embeddings from: cc.bn.300.vec
Pretrained coverage: 130530/188874 = 69.11%


In [18]:

# -----------------------
# Custom Model: TextCNN + BiLSTM fusion
# -----------------------
class CustomCNNBiLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, embeddings=None,
                 cnn_kernel_sizes=(3,4,5), cnn_channels=128, lstm_hidden=128,
                 embed_trainable=False, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        if embeddings is not None:
            self.embedding.weight.data.copy_(torch.tensor(embeddings))
        self.embedding.weight.requires_grad = embed_trainable

        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=cnn_channels, kernel_size=k)
            for k in cnn_kernel_sizes
        ])
        self.lstm = nn.LSTM(embed_dim, lstm_hidden, batch_first=True, bidirectional=True)

        fused_dim = cnn_channels * len(cnn_kernel_sizes) + lstm_hidden*2
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(fused_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        # x: (B, T)
        emb = self.embedding(x)                   # (B, T, E)
        # --- CNN branch ---
        cnn_in = emb.transpose(1, 2)              # (B, E, T)
        cnn_feats = []
        for conv in self.convs:
            c = torch.relu(conv(cnn_in))          # (B, C, T')
            p = torch.max(c, dim=2).values        # global max pool -> (B, C)
            cnn_feats.append(p)
        cnn_out = torch.cat(cnn_feats, dim=1)     # (B, C*#kernels)

        # --- BiLSTM branch ---
        lstm_out, (h, _) = self.lstm(emb)         # h: (2, B, H)
        lstm_feat = torch.cat((h[0], h[1]), dim=1)  # (B, 2H)

        # --- Fuse ---
        fused = torch.cat([cnn_out, lstm_feat], dim=1)
        fused = self.dropout(fused)
        fused = self.fc1(fused)
        fused = self.bn1(fused)
        fused = torch.relu(fused)
        fused = self.dropout(fused)
        logits = self.fc2(fused)                  # (B, num_classes)
        return logits

model = CustomCNNBiLSTM(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    num_classes=NUM_CLASSES,
    embeddings=embedding_matrix if USE_PRETRAINED else None,
    cnn_kernel_sizes=(3,4,5),
    cnn_channels=128,
    lstm_hidden=128,
    embed_trainable=False,   # set True to fine-tune embeddings
    dropout=0.3
).to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LR)


In [19]:

# -----------------------
# Training with Early Stopping (val accuracy)
# -----------------------
best_val_acc = 0.0
wait = 0

for epoch in range(1, EPOCHS+1):
    model.train()
    running_loss = 0.0
    for xb, yb in tqdm(train_dl, desc=f"Epoch {epoch} [Train]"):
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    model.eval()
    val_preds, val_true = [], []
    with torch.no_grad():
        for xb, yb in val_dl:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            logits = model(xb)
            preds = logits.argmax(1)
            val_preds.extend(preds.cpu().numpy())
            val_true.extend(yb.cpu().numpy())
    val_acc = accuracy_score(val_true, val_preds)

    print(f"Epoch {epoch} | TrainLoss {running_loss/len(train_dl):.4f} | ValAcc {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        wait = 0
        torch.save(model.state_dict(), "best_custom_cnn_bilstm.pt")
        print("  Saved new best model.")
    else:
        wait += 1
        if wait >= PATIENCE:
            print("Early stopping.")
            break


Epoch 1 [Train]: 100%|██████████| 244/244 [00:07<00:00, 33.30it/s]


Epoch 1 | TrainLoss 1.8419 | ValAcc 0.5149
  Saved new best model.


Epoch 2 [Train]: 100%|██████████| 244/244 [00:07<00:00, 34.32it/s]


Epoch 2 | TrainLoss 1.3275 | ValAcc 0.5710
  Saved new best model.


Epoch 3 [Train]: 100%|██████████| 244/244 [00:07<00:00, 34.38it/s]


Epoch 3 | TrainLoss 1.0710 | ValAcc 0.5868
  Saved new best model.


Epoch 4 [Train]: 100%|██████████| 244/244 [00:07<00:00, 33.43it/s]


Epoch 4 | TrainLoss 0.8706 | ValAcc 0.6034
  Saved new best model.


Epoch 5 [Train]: 100%|██████████| 244/244 [00:07<00:00, 33.39it/s]


Epoch 5 | TrainLoss 0.6829 | ValAcc 0.5798


Epoch 6 [Train]: 100%|██████████| 244/244 [00:07<00:00, 33.83it/s]


Epoch 6 | TrainLoss 0.5352 | ValAcc 0.5802


Epoch 7 [Train]: 100%|██████████| 244/244 [00:07<00:00, 33.80it/s]


Epoch 7 | TrainLoss 0.4235 | ValAcc 0.5876


Epoch 8 [Train]: 100%|██████████| 244/244 [00:07<00:00, 33.84it/s]


Epoch 8 | TrainLoss 0.3367 | ValAcc 0.5806


Epoch 9 [Train]: 100%|██████████| 244/244 [00:07<00:00, 33.81it/s]


Epoch 9 | TrainLoss 0.2857 | ValAcc 0.5797
Early stopping.


In [20]:

# -----------------------
# Test evaluation
# -----------------------
model.load_state_dict(torch.load("best_custom_cnn_bilstm.pt"))
model.eval()
test_preds, test_true = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        logits = model(xb)
        preds = logits.argmax(1)
        test_preds.extend(preds.cpu().numpy())
        test_true.extend(yb.cpu().numpy())

acc  = accuracy_score(test_true, test_preds)
f1m  = f1_score(test_true, test_preds, average="macro")
f1w  = f1_score(test_true, test_preds, average="weighted")
print("\n==== TEST RESULTS (Custom CNN+BiLSTM) ====")
print("Accuracy   :", f"{acc:.4f}")
print("Macro F1   :", f"{f1m:.4f}")
print("Weighted F1:", f"{f1w:.4f}")
print("\nClassification Report:")
print(classification_report(test_true, test_preds, target_names=le.classes_, digits=4))



==== TEST RESULTS (Custom CNN+BiLSTM) ====
Accuracy   : 0.6079
Macro F1   : 0.5179
Weighted F1: 0.5923

Classification Report:
                             precision    recall  f1-score   support

                  Adventure     0.4800    0.3380    0.3967        71
Biography and Autobiography     0.6669    0.7079    0.6868      1123
              Classic Novel     0.3514    0.2600    0.2989       250
              Classic Story     0.1429    0.0082    0.0155       122
         Contemporary Novel     0.5550    0.7691    0.6448       931
         Contemporary Story     0.6225    0.5521    0.5852       451
Cooking, Food and Nutrition     0.8250    0.8462    0.8354        39
      History and Tradition     0.7032    0.6635    0.6828       639
                       Math     0.8226    0.8947    0.8571        57
                    Mystery     0.3885    0.4296    0.4080       142
                 Philosophy     0.5784    0.4370    0.4979       135
                   Politics     0.4507    0