In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                        TrainingArguments, Trainer)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tabulate import tabulate
import warnings
warnings.filterwarnings('ignore')

In [2]:
import wandb
import os
wandb.login(key='02f62ea18d807de380f948a102283d73ce32d0ef')

os.environ["WANDB_MODE"] = "online"

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmohiuddinprantiq[0m ([33mmohiuddinprantiq-chittagong-university-of-engineering-an[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [3]:
def load_data(train_path, val_path, test_path):
    """Load train, validation and test data"""
    train_df = pd.read_csv(train_path)
    val_df = pd.read_csv(val_path)
    test_df = pd.read_csv(test_path)
    
    return (train_df['transcriptions'].values, train_df['labels'].values,
            val_df['transcriptions'].values, val_df['labels'].values,
            test_df['transcriptions'].values, test_df['labels'].values)

In [4]:
def evaluate_metrics(y_true, y_pred):
    """Calculate precision, recall, and f1-score"""
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [5]:
# dataset for transformer
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [6]:
# dataset for CNN, BiLSTM
class CustomTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=100):
        self.texts = texts
        self.labels = labels
        sequences = tokenizer.texts_to_sequences(texts)
        self.data = torch.tensor(pad_sequences(sequences, maxlen=max_len, padding='post'))
        self.labels = torch.tensor(labels, dtype=torch.long)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [7]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size=10000, embedding_dim=100, hidden_dim=256, output_dim=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        return self.fc(self.dropout(hidden))

class TextCNN(nn.Module):
    def __init__(self, vocab_size=10000, embedding_dim=100, n_filters=100, filter_sizes=[3,4,5], output_dim=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, 
                     kernel_size=(fs, embedding_dim)) 
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, text):
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [8]:
def train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=5):
    best_val_loss = float('inf')
    best_model = None
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        total_loss = 0
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)
            
            optimizer.zero_grad()
            predictions = model(texts)
            loss = criterion(predictions, labels)
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        # Validation phase
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for texts, labels in val_loader:
                texts, labels = texts.to(device), labels.to(device)
                
                predictions = model(texts)
                loss = criterion(predictions, labels)
                val_loss += loss.item()
                
                _, predicted = torch.max(predictions, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()
        
        val_accuracy = val_correct / val_total
        val_loss = val_loss / len(val_loader)
        
        print(f'Epoch: {epoch+1}')
        print(f'\tTrain Loss: {total_loss/len(train_loader):.4f}')
        print(f'\tVal Loss: {val_loss:.4f}')
        print(f'\tVal Accuracy: {val_accuracy:.4f}')
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model.state_dict().copy()
    
    # Load best model
    model.load_state_dict(best_model)
    return model

In [9]:
def evaluate_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for texts, labels in test_loader:
            texts, labels = texts.to(device), labels.to(device)
            
            predictions = model(texts)
            _, predicted = torch.max(predictions, 1)
            
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    return evaluate_metrics(np.array(all_labels), np.array(all_preds))

def run_cnn_bilstm(X_train, X_val, X_test, y_train, y_val, y_test, model_type='cnn'):
    # Initialize tokenizer
    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(X_train)
    vocab_size = len(tokenizer.word_index) + 1
    
    # Create datasets
    batch_size = 32
    train_dataset = CustomTextDataset(X_train, y_train, tokenizer)
    val_dataset = CustomTextDataset(X_val, y_val, tokenizer)
    test_dataset = CustomTextDataset(X_test, y_test, tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    # Initialize model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if model_type == 'cnn':
        model = TextCNN(vocab_size=vocab_size).to(device)
        print("Training CNN...")
    else:
        model = BiLSTM(vocab_size=vocab_size).to(device)
        print("Training BiLSTM...")
    
    # Training setup
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())
    
    # Train model
    model = train_model(model, train_loader, val_loader, criterion, optimizer, device)
    
    # Evaluate model
    return evaluate_model(model, test_loader, device)

In [10]:
def run_svm(X_train, X_val, X_test, y_train, y_val, y_test):
    """Train and evaluate SVM model"""
    print("Training SVM...")
    vectorizer = TfidfVectorizer(max_features=10000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    
    svm = LinearSVC(random_state=42)
    svm.fit(X_train_tfidf, y_train)
    y_pred = svm.predict(X_test_tfidf)
    return evaluate_metrics(y_test, y_pred)

In [11]:
def train_torch_model(model, train_loader, val_loader, device, epochs=3):
    """Generic training function for PyTorch models"""
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()
    model = model.to(device)
    
    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            texts = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            predictions = model(texts)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()
    
    return model

def evaluate_torch_model(model, test_loader, device):
    """Evaluate PyTorch model"""
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in test_loader:
            texts = batch['input_ids'].to(device)
            labels = batch['labels']
            
            predictions = model(texts)
            predictions = torch.argmax(predictions, dim=1)
            
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.numpy())
    
    return evaluate_metrics(all_labels, all_preds)

def run_transformer_model(model_name, X_train, X_val, X_test, y_train, y_val, y_test):
    """Train and evaluate transformer models"""
    print(f"Training {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    
    train_dataset = TextDataset(X_train, y_train, tokenizer)
    val_dataset = TextDataset(X_val, y_val, tokenizer)
    test_dataset = TextDataset(X_test, y_test, tokenizer)
    
    
    training_args = TrainingArguments(
        output_dir=f'./results_{model_name}',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        evaluation_strategy="epoch",
        logging_strategy="no",  # Disable logging
        learning_rate=2e-5,
        weight_decay=0.01,
        save_strategy="epoch",
        save_total_limit=1,  # Keep only the last checkpoint
        load_best_model_at_end=True,
    )


    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
    
    trainer.train()
    predictions = trainer.predict(test_dataset)
    y_pred = np.argmax(predictions.predictions, axis=1)
    return evaluate_metrics(y_test, y_pred)

In [12]:
def run_all_models(train_path, val_path, test_path):
    """Run all models and compile results"""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Load all data
    X_train, y_train, X_val, y_val, X_test, y_test = load_data(train_path, val_path, test_path)
    
    results = {}
    
    # Run SVM
    results['SVM'] = run_svm(X_train, X_val, X_test, y_train, y_val, y_test)
    
    # Run CNN
    results['CNN'] = run_cnn_bilstm(X_train, X_val, X_test, y_train, y_val, y_test, model_type='cnn')
    
    # Run Bi-LSTM
    results['Bi-LSTM'] = run_cnn_bilstm(X_train, X_val, X_test, y_train, y_val, y_test, model_type='bilstm')
    
    # Run transformer models
    transformer_models = {
        'XLM-R': 'FacebookAI/xlm-roberta-base',
        'mBERT': 'google-bert/bert-base-multilingual-cased'
    }
    
    for model_name, model_path in transformer_models.items():
        results[model_name] = run_transformer_model(
            model_path, X_train, X_val, X_test, y_train, y_val, y_test
        )
    
    return results

def display_results(results,lan):
    """Display results in a formatted table"""
    rows = []
    for model_name, metrics in results.items():
        rows.append([
            model_name,
            f"{metrics['precision']:.4f}",
            f"{metrics['recall']:.4f}",
            f"{metrics['f1']:.4f}"
        ])
    
    headers = ['Model', 'Precision', 'Recall', 'F1-Score']
    print(f"\nModel Comparison Results for {lan}:")
    print(tabulate(rows, headers=headers, tablefmt='grid'))
    
    # Save results to CSV
    results_df = pd.DataFrame(rows, columns=headers)
    results_df.to_csv(f'model_comparison_results_{lan}.csv', index=False)
    print(f"\nResults saved to 'model_comparison_results_for_{lan}.csv'")


In [13]:
# tamil
train_path = '/kaggle/input/misogyny/misogyny/misogyny/tamil/train/train.csv'
val_path = '/kaggle/input/misogyny/misogyny/misogyny/tamil/dev/dev.csv'
test_path = '/kaggle/input/misogyny/test_with_labels_tamil/test_with_labels.csv'
results_tam = run_all_models(train_path, val_path, test_path)
lan='tamil'
display_results(results_tam,lan)

Using device: cuda
Training SVM...
Training CNN...
Epoch: 1
	Train Loss: 0.6078
	Val Loss: 0.5361
	Val Accuracy: 0.7394
Epoch: 2
	Train Loss: 0.4080
	Val Loss: 0.4892
	Val Accuracy: 0.7359
Epoch: 3
	Train Loss: 0.2825
	Val Loss: 0.4783
	Val Accuracy: 0.7535
Epoch: 4
	Train Loss: 0.2103
	Val Loss: 0.4910
	Val Accuracy: 0.7641
Epoch: 5
	Train Loss: 0.1304
	Val Loss: 0.5158
	Val Accuracy: 0.7606
Training BiLSTM...
Epoch: 1
	Train Loss: 0.5535
	Val Loss: 0.5264
	Val Accuracy: 0.7394
Epoch: 2
	Train Loss: 0.4614
	Val Loss: 0.5067
	Val Accuracy: 0.7007
Epoch: 3
	Train Loss: 0.3525
	Val Loss: 0.5019
	Val Accuracy: 0.7289
Epoch: 4
	Train Loss: 0.2659
	Val Loss: 0.5877
	Val Accuracy: 0.6655
Epoch: 5
	Train Loss: 0.1554
	Val Loss: 0.5894
	Val Accuracy: 0.7394
Training FacebookAI/xlm-roberta-base...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.56631
2,No log,0.499194
3,No log,0.475989


Training google-bert/bert-base-multilingual-cased...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.466199
2,No log,0.458126
3,No log,0.540794



Model Comparison Results for tamil:
+---------+-------------+----------+------------+
| Model   |   Precision |   Recall |   F1-Score |
| SVM     |      0.6716 |   0.5056 |     0.5769 |
+---------+-------------+----------+------------+
| CNN     |      0.5263 |   0.2247 |     0.315  |
+---------+-------------+----------+------------+
| Bi-LSTM |      0.3731 |   0.5618 |     0.4484 |
+---------+-------------+----------+------------+
| XLM-R   |      0.9231 |   0.1348 |     0.2353 |
+---------+-------------+----------+------------+
| mBERT   |      0.593  |   0.573  |     0.5829 |
+---------+-------------+----------+------------+

Results saved to 'model_comparison_results_for_tamil.csv'


In [14]:
# malayalam
train_path = '/kaggle/input/misogyny/misogyny/misogyny/malayalam/train/train.csv'
val_path = '/kaggle/input/misogyny/misogyny/misogyny/malayalam/dev/dev.csv'
test_path = '/kaggle/input/misogyny/test_with_labels_malayalam/test_with_labels.csv'
results_mal = run_all_models(train_path, val_path, test_path)
lan='malayalam'
display_results(results_mal,lan)

Using device: cuda
Training SVM...
Training CNN...
Epoch: 1
	Train Loss: 0.7451
	Val Loss: 0.7381
	Val Accuracy: 0.5062
Epoch: 2
	Train Loss: 0.5286
	Val Loss: 0.6844
	Val Accuracy: 0.5563
Epoch: 3
	Train Loss: 0.3866
	Val Loss: 0.7478
	Val Accuracy: 0.5563
Epoch: 4
	Train Loss: 0.3129
	Val Loss: 0.7080
	Val Accuracy: 0.5625
Epoch: 5
	Train Loss: 0.2230
	Val Loss: 0.7056
	Val Accuracy: 0.5750
Training BiLSTM...
Epoch: 1
	Train Loss: 0.6837
	Val Loss: 0.6693
	Val Accuracy: 0.6188
Epoch: 2
	Train Loss: 0.6078
	Val Loss: 0.6786
	Val Accuracy: 0.5938
Epoch: 3
	Train Loss: 0.4588
	Val Loss: 0.6886
	Val Accuracy: 0.6062
Epoch: 4
	Train Loss: 0.3011
	Val Loss: 0.7898
	Val Accuracy: 0.6562
Epoch: 5
	Train Loss: 0.1746
	Val Loss: 0.9380
	Val Accuracy: 0.6188
Training FacebookAI/xlm-roberta-base...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.579178
2,No log,0.566238
3,No log,0.577757


Training google-bert/bert-base-multilingual-cased...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.579189
2,No log,0.56461
3,No log,0.614726



Model Comparison Results for malayalam:
+---------+-------------+----------+------------+
| Model   |   Precision |   Recall |   F1-Score |
| SVM     |      0.6538 |   0.6538 |     0.6538 |
+---------+-------------+----------+------------+
| CNN     |      0.5132 |   0.5    |     0.5065 |
+---------+-------------+----------+------------+
| Bi-LSTM |      0.5046 |   0.7051 |     0.5882 |
+---------+-------------+----------+------------+
| XLM-R   |      0.6633 |   0.8333 |     0.7386 |
+---------+-------------+----------+------------+
| mBERT   |      0.6981 |   0.4744 |     0.5649 |
+---------+-------------+----------+------------+

Results saved to 'model_comparison_results_for_malayalam.csv'
