In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, TrainerCallback, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
file_path = 'Bangla Dataset.csv'
df = pd.read_csv(file_path)

In [None]:
# Load the dataset

df = pd.read_csv("Bangla Dataset.csv")

df.head(3)


Unnamed: 0,Sentence,Type
0,পৃথিবীর সকল মানুষ যেন রোগমুক্ত থাকে।,4
1,তোমার প্রতিটি উৎসব যেন আনন্দে ভরে ওঠে।,4
2,তুমি কোথায় স্কুলে পড়ো?,2


In [None]:
# Function to remove Bangla symbols

def clean_bangla_text(text):
    return re.sub(r"[।,.;:'\"!?()\[\]{}<>@#$%^&*_+=~`]", "", text)

In [None]:
# Apply cleaning function to the "Sentence" column

df["Sentence"] = df["Sentence"].apply(clean_bangla_text)

# Ensure labels are zero-indexed

df['Type'] = df['Type'] - 1

df.head(10)

Unnamed: 0,Sentence,Type
0,পৃথিবীর সকল মানুষ যেন রোগমুক্ত থাকে,3
1,তোমার প্রতিটি উৎসব যেন আনন্দে ভরে ওঠে,3
2,তুমি কোথায় স্কুলে পড়ো,1
3,কেউ সমাজের প্রতি অবিচার না করুক সবাই ন্যায়ের প...,3
4,অন্ধকারে পথ খুঁজতে শিখো কারণ আলো চিরকাল থাকবে না,2
5,ম্যাডাম দয়া করে বোর্ডে আরেকবার লিখে দেখান,2
6,আপনার বিবাহিত জীবন সুখময় হোক,3
7,তোমার দেহ ও মনের শক্তি অটুট থাকুক,3
8,কি কষ্ট পেলাম,4
9,দেশ তাই মানুষেরর চেতনার অংশ দেশের সঙ্গে মানুষে...,0


In [None]:
# Stratified train-test split

Train_texts, test_texts, Train_labels, test_labels = train_test_split(
    df['Sentence'], df['Type'], test_size=0.2, shuffle=True, stratify=df['Type'], random_state=42
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    Train_texts, Train_labels, test_size=0.2, stratify=Train_labels, random_state=42
)

In [None]:
# Print dataset info

print("Train set label distribution:\n", pd.Series(train_labels).value_counts())
print("\nValidation set label distribution:\n", pd.Series(val_labels).value_counts())
print("\nTest set label distribution:\n", pd.Series(test_labels).value_counts())
print(f"Number of training texts: {len(train_texts)}")
print(f"Number of testing texts: {len(test_texts)}")
print(f"Number of validation texts: {len(val_texts)}")

Train set label distribution:
 Type
2    3264
3    3264
4    3264
1    3264
0    3264
Name: count, dtype: int64

Validation set label distribution:
 Type
4    816
3    816
0    816
1    816
2    816
Name: count, dtype: int64

Test set label distribution:
 Type
1    1020
4    1020
0    1020
3    1020
2    1020
Name: count, dtype: int64
Number of training texts: 16320
Number of testing texts: 5100
Number of validation texts: 4080


In [None]:
# Load the "xlm-roberta-base" model and tokenizer

os.environ["TOKENIZERS_PARALLELISM"] = "false"
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base", num_labels=5
)

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the training and testing sentences

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=128)

In [None]:
# Create Dataset class

class BanglaDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.reset_index(drop=True) if isinstance(labels, pd.Series) else labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
# Create the train and test datasets

train_dataset = BanglaDataset(train_encodings, train_labels)
test_dataset = BanglaDataset(test_encodings, test_labels)
val_dataset = BanglaDataset(val_encodings, val_labels)

In [None]:
# Check if MPS is available

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

Using device: mps


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [None]:
# Training arguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    gradient_accumulation_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=False,
    save_total_limit=3,
    fp16=False,
    report_to="none",
    optim="adamw_torch"
)

print("training arguments set up")



training arguments set up


In [17]:
def compute_metrics(p):
    preds = p.predictions.argmax(axis=1)
    acc = accuracy_score(p.label_ids, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    return {
        "eval_accuracy": acc,
        "eval_precision": precision,
        "eval_recall": recall,
        "eval_f1": f1
    }

In [None]:
class LogTrainValMetricsCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        train_metrics = trainer.evaluate(eval_dataset=trainer.train_dataset)
        val_metrics = trainer.evaluate(eval_dataset=trainer.eval_dataset)

        train_acc = train_metrics["eval_accuracy"]  
        train_loss = train_metrics.get("eval_loss")
        val_acc = val_metrics["eval_accuracy"]   

        trainer.state.log_history.append({
            "epoch": state.epoch,
            "train_accuracy": train_acc,
            "train_loss": train_loss
        })

        print(f"\nEpoch {int(state.epoch)} - Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")

In [None]:
# Trainer setup

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.005),
        LogTrainValMetricsCallback()
    ]
)

In [22]:
import time
start_train_time = time.time()
trainer.train()
end_train_time = time.time()

training_time = end_train_time - start_train_time
print(f"\nTotal Training Time: {training_time:.2f} seconds")

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2097,0.181027,0.959069,0.959163,0.959069,0.958924
2,0.1447,0.137314,0.972794,0.972896,0.972794,0.97275
3,0.0849,0.142283,0.97598,0.976148,0.97598,0.97596
4,0.1296,0.146866,0.974265,0.974463,0.974265,0.974241
5,0.0598,0.125169,0.977696,0.977729,0.977696,0.977705
6,0.0564,0.154419,0.977206,0.977317,0.977206,0.977164
7,0.0617,0.157244,0.974265,0.974503,0.974265,0.97427
8,0.0294,0.210403,0.971814,0.97197,0.971814,0.97173
9,0.0366,0.16685,0.977206,0.977304,0.977206,0.97719
10,0.0151,0.156443,0.977696,0.977693,0.977696,0.977683



Epoch 1 - Train Acc: 0.9711, Val Acc: 0.9591

Epoch 2 - Train Acc: 0.9833, Val Acc: 0.9728

Epoch 3 - Train Acc: 0.9894, Val Acc: 0.9760

Epoch 4 - Train Acc: 0.9924, Val Acc: 0.9743

Epoch 5 - Train Acc: 0.9960, Val Acc: 0.9777

Epoch 6 - Train Acc: 0.9957, Val Acc: 0.9772

Epoch 7 - Train Acc: 0.9971, Val Acc: 0.9743

Epoch 8 - Train Acc: 0.9959, Val Acc: 0.9718

Epoch 9 - Train Acc: 0.9984, Val Acc: 0.9772

Epoch 10 - Train Acc: 0.9995, Val Acc: 0.9777

Epoch 11 - Train Acc: 0.9992, Val Acc: 0.9777

Epoch 12 - Train Acc: 0.9993, Val Acc: 0.9740

Epoch 13 - Train Acc: 0.9998, Val Acc: 0.9787

Epoch 14 - Train Acc: 0.9999, Val Acc: 0.9779

Epoch 15 - Train Acc: 0.9999, Val Acc: 0.9772

Epoch 16 - Train Acc: 0.9999, Val Acc: 0.9772

Epoch 17 - Train Acc: 0.9999, Val Acc: 0.9779

Epoch 18 - Train Acc: 0.9999, Val Acc: 0.9789

Epoch 19 - Train Acc: 0.9999, Val Acc: 0.9787

Epoch 20 - Train Acc: 0.9999, Val Acc: 0.9789

Total Training Time: 9609.80 seconds


In [23]:
for log in trainer.state.log_history:
    print(log)

{'loss': 1.6278, 'grad_norm': 2.8646044731140137, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.09803921568627451, 'step': 100}
{'loss': 1.5191, 'grad_norm': 13.017172813415527, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.19607843137254902, 'step': 200}
{'loss': 0.8053, 'grad_norm': 22.774900436401367, 'learning_rate': 1.2e-05, 'epoch': 0.29411764705882354, 'step': 300}
{'loss': 0.403, 'grad_norm': 57.40666580200195, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.39215686274509803, 'step': 400}
{'loss': 0.378, 'grad_norm': 160.4955596923828, 'learning_rate': 2e-05, 'epoch': 0.49019607843137253, 'step': 500}
{'loss': 0.2589, 'grad_norm': 25.859466552734375, 'learning_rate': 1.9998753895176576e-05, 'epoch': 0.5882352941176471, 'step': 600}
{'loss': 0.217, 'grad_norm': 30.472280502319336, 'learning_rate': 1.999501589126174e-05, 'epoch': 0.6862745098039216, 'step': 700}
{'loss': 0.2684, 'grad_norm': 10.17104434967041, 'learning_rate': 1.9988786919844437e-05, 'epoch': 0.784

In [None]:
from collections import defaultdict

log_history = trainer.state.log_history

train_metrics = [(log['epoch'], log['train_accuracy'], log['train_loss'])
                 for log in log_history if 'train_accuracy' in log]

train_epochs = [m[0] for m in train_metrics]
train_acc = [m[1] for m in train_metrics]
train_loss = [m[2] for m in train_metrics]

eval_dict = defaultdict(dict)

for log in log_history:
    if 'eval_accuracy' in log:
        epoch = round(float(log['epoch']), 4)
        eval_dict[epoch] = {
            'epoch': epoch,
            'eval_accuracy': log['eval_accuracy'],
            'eval_loss': log['eval_loss']
        }

eval_metrics = [eval_dict[ep] for ep in sorted(eval_dict)]
eval_epochs = [m['epoch'] for m in eval_metrics]
eval_acc = [m['eval_accuracy'] for m in eval_metrics]
eval_loss = [m['eval_loss'] for m in eval_metrics]

In [None]:
# Final evaluation on test set

test_results = trainer.evaluate(eval_dataset=test_dataset)
print("\nTest Results:", test_results)


Test Results: {'eval_accuracy': 0.9756862745098039, 'eval_precision': 0.975664079539561, 'eval_recall': 0.9756862745098039, 'eval_f1': 0.9756610701625499, 'eval_loss': 0.23998576402664185, 'eval_runtime': 23.0337, 'eval_samples_per_second': 221.415, 'eval_steps_per_second': 13.849, 'epoch': 20.0}


In [None]:
start_inference_time = time.time()
test_results = trainer.evaluate(eval_dataset=test_dataset)
end_inference_time = time.time()

inference_time = end_inference_time - start_inference_time

print(f"\nInference Time on Test Set: {inference_time:.2f} seconds")


Inference Time on Test Set: 22.72 seconds


In [None]:
from sklearn.metrics import classification_report

# Compute classification metrics

accuracy = accuracy_score(test_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, predictions, average='weighted')

# Print evaluation results

print("\nTest Set Evaluation Metrics:")
print(f"🔹 Accuracy: {accuracy:.4f}")
print(f"🔹 Precision: {precision:.4f}")
print(f"🔹 Recall: {recall:.4f}")
print(f"🔹 F1-Score: {f1:.4f}")

# Detailed classification report

print("\nClassification Report:\n", classification_report(test_labels, predictions, digits=4))



Test Set Evaluation Metrics:
🔹 Accuracy: 0.9757
🔹 Precision: 0.9757
🔹 Recall: 0.9757
🔹 F1-Score: 0.9757

Classification Report:
               precision    recall  f1-score   support

           0     0.9673    0.9559    0.9615      1020
           1     0.9833    0.9814    0.9823      1020
           2     0.9696    0.9686    0.9691      1020
           3     0.9758    0.9873    0.9815      1020
           4     0.9824    0.9853    0.9838      1020

    accuracy                         0.9757      5100
   macro avg     0.9757    0.9757    0.9757      5100
weighted avg     0.9757    0.9757    0.9757      5100

