In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Authenticate before the dataset download in the next cell, since some sources are private.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the 'khadiza13/less-data' dataset and keep the value returned by kagglehub.
# NOTE(review): the cells below read the CSV from a hard-coded /kaggle/input path
# rather than from this variable.
khadiza13_less_data_path = kagglehub.dataset_download('khadiza13/less-data')

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the dataset CSV from its fixed Kaggle input location and preview the first rows.
df = pd.read_csv("/kaggle/input/less-data/changed_ds/new_ds_small.csv")
df.head()

Unnamed: 0,image,text,label
0,2001.jpg,আচ্ছা ভাই।\n মেয়েদের থেকে দূরে থাকবা মেয়ের পাল...,stereotype & objectification
1,2002.jpg,কিসের foodpanda যেখানে আমার সাদিয়া রে অর্ডার ক...,stereotype & objectification
2,2003.jpg,উপস্থিত sir\n Yes sir\n Present Teacher\n [লাব...,stereotype & objectification
3,2004.png,আমি হিজাব চাইনি\n তারপর আমার স্বামী আমাকে বোঝালেন,violence or abuse
4,2005.jpg,"বাঙ্গু (beta, omega, theta, delta) male\nসুন্দ...",non-misogynistic


In [None]:
# Rename the columns positionally; this requires the frame to have exactly three columns.
# NOTE(review): the next code cell re-reads the CSV into `df`, so this rename
# does not carry over to the training code.
df.columns = ['image_name', 'text', 'label']
df.head()

Unnamed: 0,image_name,text,label
0,2001.jpg,আচ্ছা ভাই।\n মেয়েদের থেকে দূরে থাকবা মেয়ের পাল...,stereotype & objectification
1,2002.jpg,কিসের foodpanda যেখানে আমার সাদিয়া রে অর্ডার ক...,stereotype & objectification
2,2003.jpg,উপস্থিত sir\n Yes sir\n Present Teacher\n [লাব...,stereotype & objectification
3,2004.png,আমি হিজাব চাইনি\n তারপর আমার স্বামী আমাকে বোঝালেন,violence or abuse
4,2005.jpg,"বাঙ্গু (beta, omega, theta, delta) male\nসুন্দ...",non-misogynistic


In [None]:
import pandas as pd
import numpy as np
import torch
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers.trainer_callback import EarlyStoppingCallback

# Load dataset
# Re-reads the CSV, replacing any `df` left over from earlier cells.
df = pd.read_csv("/kaggle/input/less-data/changed_ds/new_ds_small.csv")
# Fail fast on an empty file or on missing text/label values before any training starts.
assert not df.empty, "Dataset is empty"
assert df['text'].notnull().all(), "Missing values in 'text' column"
assert df['label'].notnull().all(), "Missing values in 'label' column"
print("✅ Dataset loaded. Sample:")
print(df.head())

# Clean text
def clean_text(text):
    """Normalize one raw text sample before tokenization.

    Steps, in order:
      1. Drop URLs (anything starting with ``http`` up to the next whitespace).
      2. Drop curly and straight double quotes.
      3. Drop ASCII punctuation except ``!`` and ``?``.
      4. Collapse every whitespace run (including newlines) to one space and trim.
      5. Lower-case the result.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    text = re.sub(r"http\S+", "", text)
    # Newlines are deliberately NOT deleted here: removing them outright glued the
    # last word of one line to the first word of the next ("male\nfemale" ->
    # "malefemale"). They are turned into single spaces by the whitespace collapse below.
    text = re.sub(r"[“”\"]", "", text)
    # Keep "!" and "?" while stripping the rest of ASCII punctuation.
    kept_punctuation = string.punctuation.replace("!", "").replace("?", "")
    text = text.translate(str.maketrans("", "", kept_punctuation))
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()

# Cast to str before cleaning, then overwrite the text column with the cleaned version.
df["text"] = df["text"].astype(str).apply(clean_text)

# Encode labels
# Ids follow the order in which labels are returned by `unique()` on the label column.
label2id = {label: idx for idx, label in enumerate(df["label"].unique())}
# Reverse mapping, used to turn predicted ids back into label names in the reports.
id2label = {v: k for k, v in label2id.items()}
df["label_id"] = df["label"].map(label2id)
print(f"✅ Unique labels: {label2id}")
print(f"Label distribution: {df['label'].value_counts().to_dict()}")

# Split data into train (70%), validation (15%), test (15%)
# First split: hold out 30% of the rows, stratified by label id.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["text"].tolist(), df["label_id"].tolist(), test_size=0.3, random_state=42, stratify=df["label_id"]
)
# Second split: divide the held-out 30% in half (validation / test), again stratified.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
)
print(f"✅ Dataset split: Train: {len(train_texts)}, Val: {len(val_texts)}, Test: {len(test_texts)}")


# Model checkpoints
# Maps a display name (also used by run_model in output paths) to the checkpoint id
# that run_model passes to `from_pretrained`.
model_names = {
    "mBERT": "bert-base-multilingual-cased"
}

class TextDataset(torch.utils.data.Dataset):
    """Dataset that serves tokenizer encodings together with their labels.

    ``encodings`` is a mapping from field name to a per-sample sequence;
    ``labels`` is a sequence aligned with those samples.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of sample `idx` to a tensor, then attach the label.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a ``(logits, labels)`` pair.

    Args:
        eval_pred: A pair unpacked as ``(logits, labels)``; the predicted class
            is the argmax of the logits along axis 1.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)
    return {
        "accuracy": (predicted == references).mean(),
        "f1": f1_score(references, predicted, average="weighted"),
    }

def run_model(model_name, model_checkpoint):
    """Fine-tune one checkpoint and print validation and test classification reports.

    Relies on notebook-level globals: ``train_texts``/``val_texts``/``test_texts``,
    the matching ``*_labels`` lists, ``label2id``/``id2label``, ``TextDataset``
    and ``compute_metrics``.

    Args:
        model_name: Short display name; also used in the output, log and model paths.
        model_checkpoint: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        None. Reports are printed and the model is saved under ``./models/{model_name}``.
    """
    print(f"\n🔄 Loading {model_name}...", flush=True)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    # Optimize max_length: 95th percentile of the training token counts, capped at 512,
    # so that a handful of very long samples do not dictate the padded length.
    tokenized_lengths = [len(tokenizer.encode(text, add_special_tokens=True)) for text in train_texts]
    max_length = min(int(np.percentile(tokenized_lengths, 95)), 512)
    print(f"Using max_length: {max_length}")

    # Pass the label mapping so the model config (and therefore the saved model)
    # carries the class names, consistent with the multi-model cell of this notebook.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=len(label2id), id2label=id2label, label2id=label2id
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print(f"Using device: {device}")

    print("🔠 Tokenizing...", flush=True)
    train_enc = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
    val_enc = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length)
    test_enc = tokenizer(test_texts, truncation=True, padding=True, max_length=max_length)

    train_dataset = TextDataset(train_enc, train_labels)
    val_dataset = TextDataset(val_enc, val_labels)
    test_dataset = TextDataset(test_enc, test_labels)

    training_args = TrainingArguments(
        output_dir=f"./results/{model_name}",
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=5,  # Suitable for 4000 samples
        per_device_train_batch_size=16,  # Increased for faster training
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        logging_dir=f"./logs/{model_name}",
        logging_steps=50,  # Adjusted for 2800 training samples
        load_best_model_at_end=True,
        metric_for_best_model="f1",  # Weighted F1 as returned by compute_metrics
        greater_is_better=True,
        save_total_limit=1,
        report_to="none",
        disable_tqdm=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        # `tokenizer=` is deprecated for Trainer and emitted a FutureWarning in this
        # notebook's output; `processing_class` is its replacement.
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
        # Stop when validation F1 has not improved by at least 0.01 for 2 evaluations.
        callbacks=[EarlyStoppingCallback(
            early_stopping_patience=2,
            early_stopping_threshold=0.01
        )]
    )

    print(f"🚀 Training {model_name}...", flush=True)
    trainer.train()

    print(f"🔍 Evaluating {model_name} on validation set...", flush=True)
    predictions = trainer.predict(val_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    # Report only the label ids that occur in this split, in ascending id order.
    unique_labels = sorted(set(val_labels))
    target_names = [id2label[i] for i in unique_labels]
    print(f"\n📊 Validation Classification Report for {model_name}")
    print(classification_report(val_labels, preds, labels=unique_labels, target_names=target_names))

    print(f"🔍 Evaluating {model_name} on test set...", flush=True)
    predictions = trainer.predict(test_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    unique_labels = sorted(set(test_labels))
    target_names = [id2label[i] for i in unique_labels]
    print(f"\n📊 Test Classification Report for {model_name}")
    print(classification_report(test_labels, preds, labels=unique_labels, target_names=target_names))

    # Save the final model
    trainer.save_model(f"./models/{model_name}")

# Run each model
# A failure in one model is reported and does not stop the remaining models.
# NOTE: only the exception message is printed; no traceback is shown.
for name, checkpoint in model_names.items():
    try:
        run_model(name, checkpoint)
    except Exception as e:
        print(f"❌ Skipping {name} due to error: {e}")

✅ Dataset loaded. Sample:
      image                                               text  \
0  2001.jpg  আচ্ছা ভাই।\n মেয়েদের থেকে দূরে থাকবা মেয়ের পাল...   
1  2002.jpg  কিসের foodpanda যেখানে আমার সাদিয়া রে অর্ডার ক...   
2  2003.jpg  উপস্থিত sir\n Yes sir\n Present Teacher\n [লাব...   
3  2004.png  আমি হিজাব চাইনি\n তারপর আমার স্বামী আমাকে বোঝালেন   
4  2005.jpg  বাঙ্গু (beta, omega, theta, delta) male\nসুন্দ...   

                          label  
0  stereotype & objectification  
1  stereotype & objectification  
2  stereotype & objectification  
3             violence or abuse  
4              non-misogynistic  
✅ Unique labels: {'stereotype & objectification': 0, 'violence or abuse': 1, 'non-misogynistic': 2}
Label distribution: {'stereotype & objectification': 1591, 'non-misogynistic': 1380, 'violence or abuse': 1030}
✅ Dataset split: Train: 2800, Val: 600, Test: 601

🔄 Loading mBERT...


Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using max_length: 89
Using device: cuda
🔠 Tokenizing...
🚀 Training mBERT...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.965,0.882353,0.576667,0.565279
2,0.7521,0.750772,0.688333,0.685733
3,0.5801,0.895426,0.655,0.648574
4,0.4094,0.883383,0.681667,0.678177


🔍 Evaluating mBERT on validation set...



📊 Validation Classification Report for mBERT
                              precision    recall  f1-score   support

stereotype & objectification       0.62      0.82      0.70       239
           violence or abuse       0.70      0.53      0.60       154
            non-misogynistic       0.82      0.65      0.73       207

                    accuracy                           0.69       600
                   macro avg       0.71      0.67      0.68       600
                weighted avg       0.71      0.69      0.69       600

🔍 Evaluating mBERT on test set...



📊 Test Classification Report for mBERT
                              precision    recall  f1-score   support

stereotype & objectification       0.56      0.75      0.64       239
           violence or abuse       0.60      0.49      0.54       155
            non-misogynistic       0.78      0.58      0.67       207

                    accuracy                           0.62       601
                   macro avg       0.65      0.61      0.62       601
                weighted avg       0.65      0.62      0.62       601



In [None]:
import pandas as pd
import numpy as np
import torch
import re
import string
import random
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers.trainer_callback import EarlyStoppingCallback

# Set random seed for reproducibility
def set_seed(seed):
    """Seed NumPy, PyTorch (CPU and all CUDA devices) and Python's ``random`` with ``seed``."""
    seeders = (
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed(42)

# Load dataset
# Re-reads the CSV, replacing any `df` left over from earlier cells.
df = pd.read_csv("/kaggle/input/less-data/changed_ds/new_ds_small.csv")
assert not df.empty, "Dataset is empty"
# Single source of truth for the expected row count, so the check and its message
# cannot drift apart (the message previously said 4000 while the check used 4001).
EXPECTED_NUM_SAMPLES = 4001
assert len(df) == EXPECTED_NUM_SAMPLES, f"Expected {EXPECTED_NUM_SAMPLES} samples, got {len(df)}"
assert df['text'].notnull().all(), "Missing values in 'text' column"
assert df['label'].notnull().all(), "Missing values in 'label' column"
print("✅ Dataset loaded. Sample:")
print(df.head())

# Clean text for Banglish
def clean_text(text):
    """Normalize one raw text sample before tokenization.

    Steps, in order:
      1. Drop URLs (anything starting with ``http`` up to the next whitespace).
      2. Drop curly and straight double quotes.
      3. Drop ASCII punctuation except ``!`` and ``?``.
      4. Collapse every whitespace run (including newlines) to one space and trim.
      5. Lower-case the result.
      6. Shorten runs of three or more identical non-digit characters to two
         (e.g. "sooo" -> "soo", "!!!" -> "!!").

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    text = re.sub(r"http\S+", "", text)
    # Newlines are deliberately NOT deleted here: removing them outright glued the
    # last word of one line to the first word of the next ("male\nfemale" ->
    # "malefemale"). They are turned into single spaces by the whitespace collapse below.
    text = re.sub(r"[“”\"]", "", text)
    # Keep "!" and "?" while stripping the rest of ASCII punctuation.
    kept_punctuation = string.punctuation.replace("!", "").replace("?", "")
    text = text.translate(str.maketrans("", "", kept_punctuation))
    text = re.sub(r"\s+", " ", text).strip()
    text = text.lower()
    # Normalize elongated characters, but leave digits alone: matching any character
    # here used to corrupt numbers ("10000" -> "100").
    text = re.sub(r"(\D)\1{2,}", r"\1\1", text)
    # NOTE: do not strip all non-ASCII characters (e.g. with r"[^\x00-\x7F]+") as a way
    # of removing emojis; that pattern would also delete every Bengali-script character.
    return text

# Cast to str before cleaning, then overwrite the text column with the cleaned version.
df["text"] = df["text"].astype(str).apply(clean_text)

# Encode labels
# Ids follow the order in which labels are returned by `unique()` on the label column.
label2id = {label: idx for idx, label in enumerate(df["label"].unique())}
# Reverse mapping, used for the model config and for label names in the reports.
id2label = {v: k for k, v in label2id.items()}
df["label_id"] = df["label"].map(label2id)
print(f"✅ Unique labels: {label2id}")
print(f"Label distribution: {df['label'].value_counts().to_dict()}")

# Split data into train (70%), validation (15%), test (15%)
# First split: hold out 30% of the rows, stratified by label id.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["text"].tolist(), df["label_id"].tolist(), test_size=0.3, random_state=42, stratify=df["label_id"]
)
# Second split: divide the held-out 30% in half (validation / test), again stratified.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
)
print(f"✅ Dataset split: Train: {len(train_texts)}, Val: {len(val_texts)}, Test: {len(test_texts)}")

# Model checkpoints
# Maps a display name (also used by run_model in output paths and as the key in
# `results`) to the checkpoint id that run_model passes to `from_pretrained`.
model_names = {
    "BanglishBERT": "csebuetnlp/banglishbert",
    "BanglaBERT": "csebuetnlp/banglabert",
    "MuRILBERT": "google/muril-base-cased",
    "XLM-RoBERTa": "xlm-roberta-base",
    "DistilBERT": "distilbert-base-multilingual-cased"
}

class TextDataset(torch.utils.data.Dataset):
    """Wrap tokenizer encodings and their labels as an indexable dataset.

    Each item is a dict holding one tensor per encoding field plus a
    ``"labels"`` tensor for the same sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {name: torch.tensor(column[idx]) for name, column in self.encodings.items()}
        item.update({"labels": torch.tensor(self.labels[idx])})
        return item

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a ``(logits, labels)`` evaluation pair.

    The predicted class for each row is the index of its largest logit.
    """
    raw_logits, gold = eval_pred
    guessed = np.argmax(raw_logits, axis=1)
    metrics = {}
    metrics["accuracy"] = (guessed == gold).mean()
    metrics["f1"] = f1_score(gold, guessed, average="weighted")
    return metrics

def run_model(model_name, model_checkpoint):
    """Fine-tune one checkpoint, report validation/test metrics and record the results.

    Relies on notebook-level globals: ``train_texts``/``val_texts``/``test_texts``,
    the matching ``*_labels`` lists, ``label2id``/``id2label``, ``TextDataset``,
    ``compute_metrics`` and the ``results`` dict (which must exist before the call).

    Args:
        model_name: Short display name; used in output paths and as the key in ``results``.
        model_checkpoint: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        None. Side effects: prints both reports, saves raw prediction arrays under
        ``./predictions/{model_name}``, saves the model under ``./results/{model_name}``
        and stores dict-form reports in ``results[model_name]``.
    """
    print(f"\n🔄 Loading {model_name}...", flush=True)
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    # Optimize max_length: 95th percentile of the training token counts, capped at 512,
    # so that a handful of very long samples do not dictate the padded length.
    tokenized_lengths = [len(tokenizer.encode(text, add_special_tokens=True)) for text in train_texts]
    max_length = min(int(np.percentile(tokenized_lengths, 95)), 512)
    print(f"Using max_length: {max_length}")

    # Load model for classification
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=len(label2id), id2label=id2label, label2id=label2id
    )
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model.to(device)
    print(f"Using device: {device}")

    print("🔠 Tokenizing...", flush=True)
    train_enc = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
    val_enc = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length)
    test_enc = tokenizer(test_texts, truncation=True, padding=True, max_length=max_length)

    train_dataset = TextDataset(train_enc, train_labels)
    val_dataset = TextDataset(val_enc, val_labels)
    test_dataset = TextDataset(test_enc, test_labels)

    training_args = TrainingArguments(
        output_dir=f"./results/{model_name}",
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=10,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,
        learning_rate=2e-5,
        logging_dir=f"./logs/{model_name}",
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1",  # Weighted F1 as returned by compute_metrics
        greater_is_better=True,
        save_total_limit=1,
        report_to="none",
        disable_tqdm=False,
        # Mixed precision for faster training. Enabled only when a CUDA device is
        # present, so the CPU fallback selected above keeps working.
        fp16=use_cuda
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        # `tokenizer=` is deprecated for Trainer and emitted a FutureWarning in this
        # notebook's output; `processing_class` is its replacement.
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
        # Stop when validation F1 has not improved by at least 0.01 for 2 evaluations.
        callbacks=[EarlyStoppingCallback(
            early_stopping_patience=2,
            early_stopping_threshold=0.01
        )]
    )

    print(f"🚀 Training {model_name}...", flush=True)
    trainer.train()

    print(f"🔍 Evaluating {model_name} on validation set...", flush=True)
    val_predictions = trainer.predict(val_dataset)
    val_preds = np.argmax(val_predictions.predictions, axis=1)
    # Label ids are taken from the validation split and reused for the test report below;
    # this assumes both splits contain the same classes (the notebook's splits are stratified).
    unique_labels = sorted(set(val_labels))
    target_names = [id2label[i] for i in unique_labels]
    print(f"\n📊 Validation Classification Report for {model_name}")
    val_report = classification_report(val_labels, val_preds, labels=unique_labels, target_names=target_names)
    print(val_report)

    print(f"🔍 Evaluating {model_name} on test set...", flush=True)
    test_predictions = trainer.predict(test_dataset)
    test_preds = np.argmax(test_predictions.predictions, axis=1)
    print(f"\n📊 Test Classification Report for {model_name}")
    test_report = classification_report(test_labels, test_preds, labels=unique_labels, target_names=target_names)
    print(test_report)

    # Save outputs: the raw prediction arrays (not the argmax) for both splits, plus the model.
    os.makedirs(f"./predictions/{model_name}", exist_ok=True)
    np.save(f"./predictions/{model_name}/val_preds.npy", val_predictions.predictions)
    np.save(f"./predictions/{model_name}/test_preds.npy", test_predictions.predictions)
    trainer.save_model(f"./results/{model_name}")

    # Store results as nested dicts so they can be compared across models afterwards.
    results[model_name] = {
        "val_report": classification_report(val_labels, val_preds, labels=unique_labels, target_names=target_names, output_dict=True),
        "test_report": classification_report(test_labels, test_preds, labels=unique_labels, target_names=target_names, output_dict=True)
    }

    # Clear memory before the next model is loaded.
    del model
    del trainer
    torch.cuda.empty_cache()
    print(f"🧹 Cleared memory after {model_name}")

# Dictionary to store results
# run_model writes one entry per model name into this dict.
results = {}

# Run models
# A failure in one model is reported and does not stop the remaining models.
# NOTE: only the exception message is printed; no traceback is shown.
for name, checkpoint in model_names.items():
    try:
        run_model(name, checkpoint)
    except Exception as e:
        print(f"❌ Skipping {name} due to error: {e}")

# Compare results
# Only models that stored an entry in `results` are listed here.
print("\n📊 Model Comparison (Test F1)")
for model_name, result in results.items():
    # Weighted-average F1 on the test split, read from the dict-form report.
    test_f1 = result["test_report"]["weighted avg"]["f1-score"]
    print(f"{model_name}: {test_f1:.4f}")

✅ Dataset loaded. Sample:
      image                                               text  \
0  2001.jpg  আচ্ছা ভাই।\n মেয়েদের থেকে দূরে থাকবা মেয়ের পাল...   
1  2002.jpg  কিসের foodpanda যেখানে আমার সাদিয়া রে অর্ডার ক...   
2  2003.jpg  উপস্থিত sir\n Yes sir\n Present Teacher\n [লাব...   
3  2004.png  আমি হিজাব চাইনি\n তারপর আমার স্বামী আমাকে বোঝালেন   
4  2005.jpg  বাঙ্গু (beta, omega, theta, delta) male\nসুন্দ...   

                          label  
0  stereotype & objectification  
1  stereotype & objectification  
2  stereotype & objectification  
3             violence or abuse  
4              non-misogynistic  
✅ Unique labels: {'stereotype & objectification': 0, 'violence or abuse': 1, 'non-misogynistic': 2}
Label distribution: {'stereotype & objectification': 1591, 'non-misogynistic': 1380, 'violence or abuse': 1030}
✅ Dataset split: Train: 2800, Val: 600, Test: 601

🔄 Loading BanglishBERT...
Using max_length: 48


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglishbert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
🔠 Tokenizing...
🚀 Training BanglishBERT...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.9188,0.830626,0.606667,0.556218
2,0.663,0.726623,0.688333,0.683901
3,0.4009,0.783682,0.698333,0.695961
4,0.2399,1.014558,0.693333,0.688286
5,0.1112,1.437472,0.705,0.69469


🔍 Evaluating BanglishBERT on validation set...



📊 Validation Classification Report for BanglishBERT
                              precision    recall  f1-score   support

stereotype & objectification       0.68      0.70      0.69       239
           violence or abuse       0.65      0.56      0.61       154
            non-misogynistic       0.74      0.79      0.77       207

                    accuracy                           0.70       600
                   macro avg       0.69      0.69      0.69       600
                weighted avg       0.70      0.70      0.70       600

🔍 Evaluating BanglishBERT on test set...



📊 Test Classification Report for BanglishBERT
                              precision    recall  f1-score   support

stereotype & objectification       0.70      0.74      0.72       239
           violence or abuse       0.59      0.56      0.57       155
            non-misogynistic       0.77      0.74      0.75       207

                    accuracy                           0.69       601
                   macro avg       0.68      0.68      0.68       601
                weighted avg       0.69      0.69      0.69       601

🧹 Cleared memory after BanglishBERT

🔄 Loading BanglaBERT...


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/528k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Using max_length: 48


pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
🔠 Tokenizing...


model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

🚀 Training BanglaBERT...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.9383,0.850361,0.616667,0.550693
2,0.7002,0.703674,0.718333,0.718784
3,0.5172,0.712947,0.716667,0.71847
4,0.3218,0.831664,0.71,0.70505


🔍 Evaluating BanglaBERT on validation set...



📊 Validation Classification Report for BanglaBERT
                              precision    recall  f1-score   support

stereotype & objectification       0.71      0.70      0.71       239
           violence or abuse       0.61      0.63      0.62       154
            non-misogynistic       0.81      0.80      0.80       207

                    accuracy                           0.72       600
                   macro avg       0.71      0.71      0.71       600
                weighted avg       0.72      0.72      0.72       600

🔍 Evaluating BanglaBERT on test set...



📊 Test Classification Report for BanglaBERT
                              precision    recall  f1-score   support

stereotype & objectification       0.73      0.77      0.75       239
           violence or abuse       0.60      0.65      0.63       155
            non-misogynistic       0.85      0.74      0.79       207

                    accuracy                           0.73       601
                   macro avg       0.73      0.72      0.72       601
                weighted avg       0.74      0.73      0.73       601

🧹 Cleared memory after BanglaBERT

🔄 Loading MuRILBERT...


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

Using max_length: 52


pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
🔠 Tokenizing...
🚀 Training MuRILBERT...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.031,0.971366,0.623333,0.572578
2,0.8674,0.865668,0.673333,0.664238
3,0.7457,0.795872,0.678333,0.668232
4,0.6059,0.765444,0.698333,0.695734
5,0.5257,0.819008,0.676667,0.672267
6,0.4032,0.867744,0.673333,0.670274


🔍 Evaluating MuRILBERT on validation set...



📊 Validation Classification Report for MuRILBERT
                              precision    recall  f1-score   support

stereotype & objectification       0.64      0.77      0.69       239
           violence or abuse       0.72      0.53      0.61       154
            non-misogynistic       0.78      0.75      0.76       207

                    accuracy                           0.70       600
                   macro avg       0.71      0.68      0.69       600
                weighted avg       0.71      0.70      0.70       600

🔍 Evaluating MuRILBERT on test set...



📊 Test Classification Report for MuRILBERT
                              precision    recall  f1-score   support

stereotype & objectification       0.67      0.85      0.75       239
           violence or abuse       0.71      0.54      0.61       155
            non-misogynistic       0.85      0.74      0.79       207

                    accuracy                           0.73       601
                   macro avg       0.74      0.71      0.72       601
                weighted avg       0.74      0.73      0.73       601

🧹 Cleared memory after MuRILBERT

🔄 Loading XLM-RoBERTa...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Using max_length: 68


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
🔠 Tokenizing...
🚀 Training XLM-RoBERTa...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.9616,0.795764,0.665,0.645352
2,0.7479,0.771774,0.681667,0.679551
3,0.6146,0.712961,0.708333,0.706629
4,0.4931,0.765,0.743333,0.740715
5,0.4586,0.917212,0.7,0.701871
6,0.304,0.945769,0.723333,0.720818


🔍 Evaluating XLM-RoBERTa on validation set...



📊 Validation Classification Report for XLM-RoBERTa
                              precision    recall  f1-score   support

stereotype & objectification       0.73      0.75      0.74       239
           violence or abuse       0.72      0.60      0.66       154
            non-misogynistic       0.77      0.84      0.80       207

                    accuracy                           0.74       600
                   macro avg       0.74      0.73      0.73       600
                weighted avg       0.74      0.74      0.74       600

🔍 Evaluating XLM-RoBERTa on test set...



📊 Test Classification Report for XLM-RoBERTa
                              precision    recall  f1-score   support

stereotype & objectification       0.75      0.76      0.75       239
           violence or abuse       0.66      0.61      0.63       155
            non-misogynistic       0.78      0.82      0.80       207

                    accuracy                           0.74       601
                   macro avg       0.73      0.73      0.73       601
                weighted avg       0.74      0.74      0.74       601

🧹 Cleared memory after XLM-RoBERTa

🔄 Loading DistilBERT...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors


Using max_length: 89


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
🔠 Tokenizing...
🚀 Training DistilBERT...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.989,0.91225,0.571667,0.561748
2,0.7978,0.782699,0.675,0.672421
3,0.6609,0.827154,0.628333,0.61691
4,0.4726,0.854483,0.665,0.660121


🔍 Evaluating DistilBERT on validation set...



📊 Validation Classification Report for DistilBERT
                              precision    recall  f1-score   support

stereotype & objectification       0.64      0.69      0.66       239
           violence or abuse       0.65      0.53      0.59       154
            non-misogynistic       0.73      0.77      0.75       207

                    accuracy                           0.68       600
                   macro avg       0.67      0.66      0.67       600
                weighted avg       0.67      0.68      0.67       600

🔍 Evaluating DistilBERT on test set...



📊 Test Classification Report for DistilBERT
                              precision    recall  f1-score   support

stereotype & objectification       0.58      0.65      0.62       239
           violence or abuse       0.58      0.50      0.53       155
            non-misogynistic       0.69      0.68      0.69       207

                    accuracy                           0.62       601
                   macro avg       0.62      0.61      0.61       601
                weighted avg       0.62      0.62      0.62       601

🧹 Cleared memory after DistilBERT

📊 Model Comparison (Test F1)
BanglishBERT: 0.6916
BanglaBERT: 0.7310
MuRILBERT: 0.7271
XLM-RoBERTa: 0.7388
DistilBERT: 0.6194
