<a href="https://colab.research.google.com/github/Khadiza13/DravidianLangTech-NAACL-Misogyny-/blob/main/Textual_MBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# kagglehub is imported here because both the login call below and the
# dataset downloads in the following cell go through it.
import kagglehub
# Log in to Kaggle first; per the note above, some of the data sources are
# private, so this has to run before the datasets are downloaded.
kagglehub.login()


In [None]:
# IMPORTANT: run this cell to pull in the Kaggle data sources used by this
# notebook; it can be deleted once the data is available.
# NOTE: this notebook environment is not Kaggle's Python environment, so some
# libraries the notebook relies on may be missing.

# Fetch the four datasets in a fixed order and bind each returned location to
# its own module-level name.
(
    khadiza13_train_dataset_path,
    khadiza13_eval_dataset_path,
    khadiza13_test_dataset_path,
    khadiza13_test_with_labels_path,
) = map(
    kagglehub.dataset_download,
    (
        'khadiza13/train-dataset',
        'khadiza13/eval-dataset',
        'khadiza13/test-dataset',
        'khadiza13/test-with-labels',
    ),
)

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report
from tqdm import tqdm

def load_and_preprocess_data(csv_path):
    """Read a CSV and return its transcriptions and labels as parallel lists.

    Args:
        csv_path: Path (or any other source accepted by ``pandas.read_csv``)
            of a CSV that has a ``transcriptions`` and a ``labels`` column.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists in file order.
        Every entry of ``texts`` is a ``str``: missing transcriptions become
        the empty string and any non-string value is converted with ``str``.
        ``labels`` holds the column values unchanged.

    Raises:
        KeyError: If one of the required columns is absent from the file.
    """
    data = pd.read_csv(csv_path)

    missing_columns = [
        column for column in ('transcriptions', 'labels')
        if column not in data.columns
    ]
    if missing_columns:
        raise KeyError(
            f"CSV is missing required column(s): {', '.join(missing_columns)}"
        )

    # Empty cells are parsed as NaN (a float) and numeric-looking cells as
    # numbers. The rows are kept (so texts and labels stay aligned) but the
    # values are normalised to strings, which is what the tokenizer expects.
    texts = data['transcriptions'].fillna('').astype(str).tolist()
    labels = data['labels'].tolist()
    return texts, labels

# Kaggle input locations of the train, dev and labelled-test CSV files
TRAIN_CSV_PATH = '/kaggle/input/train-dataset/train/train.csv'
EVAL_CSV_PATH = '/kaggle/input/eval-dataset/dev/dev.csv'
TEST_CSV_PATH = '/kaggle/input/test-with-labels/test_with_labels/test_with_labels.csv'

# Read every split into a (texts, labels) pair, in train / dev / test order
(
    (train_texts, train_labels),
    (eval_texts, eval_labels),
    (test_texts, test_labels),
) = map(
    load_and_preprocess_data,
    (TRAIN_CSV_PATH, EVAL_CSV_PATH, TEST_CSV_PATH),
)

# The dev split is appended to the training split; the model is trained on both
merged_texts = [*train_texts, *eval_texts]
merged_labels = [*train_labels, *eval_labels]

# mBERT tokenizer: fast implementation requested, maximum length set to 128
text_tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-multilingual-cased", model_max_length=128, use_fast=True
)

# Custom classification model using mBERT
class MBertClassifier(nn.Module):
    """mBERT encoder followed by dropout and a linear classification head.

    The encoder is ``bert-base-multilingual-cased`` loaded through
    ``AutoModel``; its token-embedding matrix is resized to
    ``len(text_tokenizer)``, i.e. to the module-level tokenizer's vocabulary.
    """

    def __init__(self, num_labels=2):
        """Build the encoder, the dropout layer (p=0.1) and the linear head.

        Args:
            num_labels: Number of output classes of the linear head.
        """
        super().__init__()
        encoder = AutoModel.from_pretrained("bert-base-multilingual-cased")
        encoder.resize_token_embeddings(len(text_tokenizer))
        self.bert = encoder
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Classify a batch from the hidden state at sequence position 0.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional targets; when given, a cross-entropy loss between
                the logits and these targets is computed as well.

        Returns:
            ``{'logits': logits}`` when ``labels`` is ``None``, otherwise
            ``{'loss': loss, 'logits': logits}``.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is the [CLS] token's output.
        cls_vector = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(cls_vector))

        if labels is None:
            return {'logits': logits}

        loss = nn.CrossEntropyLoss()(logits, labels)
        return {'loss': loss, 'logits': logits}

# Instantiate the mBERT-based classifier with two output classes (the final
# report names them "Non-Misogyny" and "Misogyny").
text_model = MBertClassifier(num_labels=2)

def tokenize_texts(texts, tokenizer, max_length=128):
    """Encode ``texts`` with ``tokenizer`` using fixed-length settings.

    Args:
        texts: The texts to encode; passed to the tokenizer unchanged.
        tokenizer: Callable invoked as ``tokenizer(texts, **options)``.
        max_length: Value forwarded as the tokenizer's ``max_length`` option.

    Returns:
        Whatever ``tokenizer`` returns when called with padding set to
        ``'max_length'``, truncation enabled, the given ``max_length`` and
        ``return_tensors="pt"``.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_length,
        'return_tensors': "pt",
    }
    return tokenizer(texts, **encode_options)

# Encode the merged training texts and the test texts with the same tokenizer
merged_encodings, test_encodings = (
    tokenize_texts(corpus, text_tokenizer)
    for corpus in (merged_texts, test_texts)
)

# Label lists become tensors
train_labels_tensor, test_labels_tensor = map(
    torch.tensor, (merged_labels, test_labels)
)

# Training dataset: (input_ids, attention_mask, label) triples
train_dataset = TensorDataset(
    *(merged_encodings[field] for field in ('input_ids', 'attention_mask')),
    train_labels_tensor,
)

# Shuffled mini-batches of 32 examples
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# AdamW over all model parameters
optimizer = torch.optim.AdamW(params=text_model.parameters(), lr=2e-5)

# Training loop with progress bar
def train_model(model, dataloader, optimizer, epochs):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    Every batch must unpack as ``(input_ids, attention_mask, labels)``, and
    the model must return a mapping with a ``'loss'`` entry when called with
    those three keyword arguments.

    Args:
        model: The ``nn.Module`` to train; put into training mode here.
        dataloader: Iterable of batches of tensors as described above.
        optimizer: Optimizer whose ``zero_grad``/``step`` drive the updates.
        epochs: Number of passes over ``dataloader``.

    Returns:
        None. Progress and the average loss of each epoch are printed.
    """
    model.train()

    # Batches are moved to the device holding the model's weights so the same
    # loop works whether the model lives on the CPU or on an accelerator.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")
        progress_bar = tqdm(dataloader, desc="Training")
        total_loss = 0.0
        num_batches = 0

        for batch in progress_bar:
            input_ids, attention_mask, labels = (
                tensor.to(device) for tensor in batch
            )

            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs['loss']
            loss.backward()
            optimizer.step()

            # Read the scalar once and reuse it for the running total and
            # the progress bar.
            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1

            # Update progress bar
            progress_bar.set_postfix({'loss': f"{batch_loss:.4f}"})

        # An empty dataloader would otherwise end in a division by zero.
        if num_batches == 0:
            print(f"No batches processed in epoch {epoch+1}; average loss unavailable.")
            continue

        avg_loss = total_loss / num_batches
        print(f"Average loss for epoch {epoch+1}: {avg_loss:.4f}")

# Evaluation function
def evaluate_model(model, encodings, labels_tensor, batch_size=32):
    """Predict a class index for every encoded example.

    The examples are pushed through the model in mini-batches instead of one
    single forward pass, so memory use no longer grows with the size of the
    evaluation set.

    Args:
        model: ``nn.Module`` that, called with ``input_ids`` and
            ``attention_mask`` keyword arguments, returns a mapping with a
            ``'logits'`` entry. It is switched to evaluation mode.
        encodings: Mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors whose first dimension indexes the examples.
        labels_tensor: Not used by this function; kept so existing callers
            that pass the labels keep working.
        batch_size: Number of examples per forward pass (must be >= 1).

    Returns:
        A 1-D tensor on the CPU holding, for each example, the argmax of the
        logits along dimension 1. Empty input yields an empty ``long`` tensor.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    model.eval()
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']

    # Run each batch on the device that holds the model's weights; fall back
    # to the inputs' own device for a model without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else input_ids.device

    batch_predictions = []
    with torch.no_grad():
        for start in range(0, len(input_ids), batch_size):
            stop = start + batch_size
            outputs = model(
                input_ids=input_ids[start:stop].to(device),
                attention_mask=attention_mask[start:stop].to(device)
            )
            # Predictions are collected on the CPU so callers can convert
            # the result with .numpy() regardless of the model's device.
            batch_predictions.append(torch.argmax(outputs['logits'], dim=1).cpu())

    if not batch_predictions:
        return torch.empty(0, dtype=torch.long)
    return torch.cat(batch_predictions)

# Fine-tune the classifier for five epochs
print("Starting training...")
train_model(
    model=text_model,
    dataloader=train_dataloader,
    optimizer=optimizer,
    epochs=5,
)

# Predict labels for the held-out test set
print("\nEvaluating on test data...")
predicted_labels = evaluate_model(
    model=text_model,
    encodings=test_encodings,
    labels_tensor=test_labels_tensor,
)

# Per-class precision / recall / F1 against the gold test labels
print("\nClassification Report:\n")
print(
    classification_report(
        y_true=test_labels_tensor,
        y_pred=predicted_labels.numpy(),
        target_names=["Non-Misogyny", "Misogyny"],
    )
)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Starting training...

Epoch 1/5


Training: 100%|██████████| 25/25 [04:43<00:00, 11.33s/it, loss=0.7180]


Average loss for epoch 1: 0.6494

Epoch 2/5


Training: 100%|██████████| 25/25 [04:33<00:00, 10.95s/it, loss=0.6197]


Average loss for epoch 2: 0.5181

Epoch 3/5


Training: 100%|██████████| 25/25 [04:34<00:00, 11.00s/it, loss=0.3743]


Average loss for epoch 3: 0.4465

Epoch 4/5


Training: 100%|██████████| 25/25 [04:35<00:00, 11.01s/it, loss=0.2579]


Average loss for epoch 4: 0.3226

Epoch 5/5


Training: 100%|██████████| 25/25 [04:33<00:00, 10.95s/it, loss=0.1313]


Average loss for epoch 5: 0.2071

Evaluating on test data...

Classification Report:

              precision    recall  f1-score   support

Non-Misogyny       0.76      0.84      0.79       122
    Misogyny       0.69      0.58      0.63        78

    accuracy                           0.73       200
   macro avg       0.72      0.71      0.71       200
weighted avg       0.73      0.73      0.73       200

