In [24]:
# ================================================
# Fine-tune BanglaBERT (PyTorch + HF Transformers)
# Train on train.csv, validate on validation.csv, test on test.csv
# Fixes device mismatch by moving every batch tensor to DEVICE
# ================================================
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup
)


In [25]:

# -----------------------
# Paths & config
# -----------------------
TRAIN_CSV = "Dataset_60_20_20/train.csv"
VAL_CSV   = "Dataset_60_20_20/validation.csv"
TEST_CSV  = "Dataset_60_20_20/test.csv"

MODEL_NAME = "sagorsarker/bangla-bert-base"  # ✅ valid public model id
MAX_LEN    = 256
BATCH_SIZE = 16
EPOCHS     = 50
LR         = 2e-5
WARMUP_PROP = 0.1
PATIENCE    = 2           # early stop on val weighted-F1
SEED        = 42
OUT_DIR     = "banglabert_results"
os.makedirs(OUT_DIR, exist_ok=True)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)


Device: cuda


In [26]:

# Reproducibility
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)


In [27]:

# -----------------------
# Load data
# -----------------------
train_df = pd.read_csv(TRAIN_CSV)
val_df   = pd.read_csv(VAL_CSV)
test_df  = pd.read_csv(TEST_CSV)

for d in (train_df, val_df, test_df):
    d["Summary"] = d["Summary"].astype(str)
    d["Genre"]   = d["Genre"].astype(str)

X_train, y_train = train_df["Summary"], train_df["Genre"]
X_val,   y_val   = val_df["Summary"],   val_df["Genre"]
X_test,  y_test  = test_df["Summary"],  test_df["Genre"]



In [28]:

# -----------------------
# Labels
# -----------------------
le = LabelEncoder()
y_train_ids = le.fit_transform(y_train)
y_val_ids   = le.transform(y_val)
y_test_ids  = le.transform(y_test)
NUM_CLASSES = len(le.classes_)
print("Classes:", list(le.classes_))


Classes: ['Adventure', 'Biography and Autobiography', 'Classic Novel', 'Classic Story', 'Contemporary Novel', 'Contemporary Story', 'Cooking, Food and Nutrition', 'History and Tradition', 'Math', 'Mystery', 'Philosophy', 'Politics', 'Religious', 'Sciene Fiction', 'Shishu Kishor', 'Thriller']


In [29]:

# -----------------------
# Tokenizer
# -----------------------
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

def tokenize_batch(texts):
    return tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )


In [30]:

# -----------------------
# Dataset (keeps tensors on CPU; we move per-batch to DEVICE)
# -----------------------
class TextClsDataset(Dataset):
    def __init__(self, texts, labels):
        enc = tokenize_batch(texts)
        self.input_ids = enc["input_ids"]
        self.attn_mask = enc["attention_mask"]
        self.labels = torch.tensor(labels, dtype=torch.long)
    def __len__(self):
        return self.labels.size(0)
    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attn_mask[idx],
            "labels": self.labels[idx]
        }

train_ds = TextClsDataset(X_train, y_train_ids)
val_ds   = TextClsDataset(X_val,   y_val_ids)
test_ds  = TextClsDataset(X_test,  y_test_ids)

pin = (DEVICE == "cuda")
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  pin_memory=pin)
val_dl   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, pin_memory=pin)
test_dl  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, pin_memory=pin)


In [31]:

# -----------------------
# Model, optimizer, scheduler
# -----------------------
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=NUM_CLASSES
).to(DEVICE)

# weight decay for all but bias/LayerNorm
no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
param_groups = [
    {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = AdamW(param_groups, lr=LR)

num_training_steps = len(train_dl) * EPOCHS
num_warmup_steps = int(WARMUP_PROP * num_training_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=="cuda"))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:

# -----------------------
# Utils
# -----------------------
def to_device(batch, device):
    # move every tensor to DEVICE (fixes CPU/CUDA mismatch)
    return {k: v.to(device, non_blocking=True) for k, v in batch.items()}

@torch.no_grad()
def evaluate(dataloader):
    model.eval()
    all_preds, all_true = [], []
    with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
        for batch in dataloader:
            batch = to_device(batch, DEVICE)
            # no need to pass labels for eval unless you want loss
            outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
            preds = outputs.logits.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_true.extend(batch["labels"].cpu().numpy())
    acc = accuracy_score(all_true, all_preds)
    f1_macro = f1_score(all_true, all_preds, average="macro")
    f1_weighted = f1_score(all_true, all_preds, average="weighted")
    return acc, f1_macro, f1_weighted, np.array(all_true), np.array(all_preds)


In [None]:

# -----------------------
# Train (early stop on val Weighted-F1)
# -----------------------
best_val_f1w = -1.0
epochs_no_improve = 0
best_path = os.path.join(OUT_DIR, "banglabert_best.pt")

for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0.0
    pbar = tqdm(train_dl, desc=f"Epoch {epoch}/{EPOCHS}")
    for batch in pbar:
        batch = to_device(batch, DEVICE)
        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
            outputs = model(**batch)  # uses input_ids, attention_mask, labels
            loss = outputs.loss

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        total_loss += loss.item()
        pbar.set_postfix(loss=f"{total_loss/len(pbar):.4f}")

    val_acc, val_f1m, val_f1w, _, _ = evaluate(val_dl)
    print(f"Epoch {epoch} | TrainLoss {total_loss/len(train_dl):.4f} | ValAcc {val_acc:.4f} | ValF1_w {val_f1w:.4f}")

    if val_f1w > best_val_f1w:
        best_val_f1w = val_f1w
        epochs_no_improve = 0
        torch.save(model.state_dict(), best_path)
        print("  Saved new best model.")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= PATIENCE:
            print("Early stopping.")
            break


Epoch 1/50: 100%|██████████| 974/974 [07:41<00:00,  2.11it/s, loss=2.8027]


Epoch 1 | TrainLoss 2.8027 | ValAcc 0.0187 | ValF1_w 0.0096
  Saved new best model.


Epoch 2/50: 100%|██████████| 974/974 [07:28<00:00,  2.17it/s, loss=2.6729]


Epoch 2 | TrainLoss 2.6729 | ValAcc 0.2102 | ValF1_w 0.1119
  Saved new best model.


Epoch 3/50: 100%|██████████| 974/974 [07:28<00:00,  2.17it/s, loss=2.5012]


Epoch 3 | TrainLoss 2.5012 | ValAcc 0.2441 | ValF1_w 0.1306
  Saved new best model.


Epoch 4/50: 100%|██████████| 974/974 [07:28<00:00,  2.17it/s, loss=2.3907]


Epoch 4 | TrainLoss 2.3907 | ValAcc 0.2383 | ValF1_w 0.1237


Epoch 5/50: 100%|██████████| 974/974 [07:28<00:00,  2.17it/s, loss=2.3473]


Epoch 5 | TrainLoss 2.3473 | ValAcc 0.2828 | ValF1_w 0.1625
  Saved new best model.


Epoch 6/50: 100%|██████████| 974/974 [07:28<00:00,  2.17it/s, loss=2.3119]


Epoch 6 | TrainLoss 2.3119 | ValAcc 0.3063 | ValF1_w 0.1744
  Saved new best model.


Epoch 7/50: 100%|██████████| 974/974 [07:28<00:00,  2.17it/s, loss=2.2730]


Epoch 7 | TrainLoss 2.2730 | ValAcc 0.3123 | ValF1_w 0.1792
  Saved new best model.


Epoch 8/50: 100%|██████████| 974/974 [07:34<00:00,  2.15it/s, loss=2.2422]


Epoch 8 | TrainLoss 2.2422 | ValAcc 0.3140 | ValF1_w 0.1825
  Saved new best model.


Epoch 9/50: 100%|██████████| 974/974 [07:47<00:00,  2.08it/s, loss=2.2170]


Epoch 9 | TrainLoss 2.2170 | ValAcc 0.3207 | ValF1_w 0.1968
  Saved new best model.


Epoch 10/50: 100%|██████████| 974/974 [07:42<00:00,  2.11it/s, loss=2.1854]


Epoch 10 | TrainLoss 2.1854 | ValAcc 0.3267 | ValF1_w 0.2049
  Saved new best model.


Epoch 11/50: 100%|██████████| 974/974 [07:38<00:00,  2.12it/s, loss=2.1616]


Epoch 11 | TrainLoss 2.1616 | ValAcc 0.3313 | ValF1_w 0.2131
  Saved new best model.


Epoch 12/50: 100%|██████████| 974/974 [07:29<00:00,  2.17it/s, loss=2.1424]


Epoch 12 | TrainLoss 2.1424 | ValAcc 0.3383 | ValF1_w 0.2268
  Saved new best model.


Epoch 13/50: 100%|██████████| 974/974 [07:24<00:00,  2.19it/s, loss=2.1183]


Epoch 13 | TrainLoss 2.1183 | ValAcc 0.3456 | ValF1_w 0.2397
  Saved new best model.


Epoch 14/50:  59%|█████▊    | 572/974 [04:23<03:07,  2.15it/s, loss=1.2375]

In [None]:

# -----------------------
# Test
# -----------------------
model.load_state_dict(torch.load(best_path, map_location=DEVICE))
test_acc, test_f1m, test_f1w, y_true, y_pred = evaluate(test_dl)
print("\n==== TEST RESULTS (BanglaBERT) ====")
print("Accuracy   :", f"{test_acc:.4f}")
print("Macro F1   :", f"{test_f1m:.4f}")
print("Weighted F1:", f"{test_f1w:.4f}")

rep = classification_report(y_true, y_pred, target_names=le.classes_, digits=4)
print("\nClassification Report:\n", rep)

# Save report
report_path = os.path.join(OUT_DIR, "banglabert_test_report.txt")
with open(report_path, "w", encoding="utf-8") as f:
    f.write("BanglaBERT Test Results\n")
    f.write(f"Accuracy   : {test_acc:.4f}\n")
    f.write(f"Macro F1   : {test_f1m:.4f}\n")
    f.write(f"Weighted F1: {test_f1w:.4f}\n\n")
    f.write(rep)
print(f"Saved report to: {report_path}")

# Optional: Confusion matrix PNG
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10,8))
    sns.heatmap(cm, annot=False, cmap="Blues",
                xticklabels=le.classes_, yticklabels=le.classes_)
    plt.title("Confusion Matrix - BanglaBERT")
    plt.xlabel("Predicted"); plt.ylabel("True")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    cm_path = os.path.join(OUT_DIR, "banglabert_confusion_matrix.png")
    plt.savefig(cm_path, dpi=200)
    plt.show()
    print(f"Saved confusion matrix to: {cm_path}")
except Exception as e:
    print("Skipped confusion matrix plot:", e)