In [4]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torchvision
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support, f1_score
from torchvision import transforms
from torchvision.transforms import Compose, Resize, ToTensor, Normalize, RandomHorizontalFlip, RandomRotation
from torchvision.models import resnet50, vgg16
from PIL import Image
from tabulate import tabulate
import warnings
from transformers import (
    AutoFeatureExtractor, 
    SwinModel, 
    BertTokenizer, 
    BertModel, 
    XLMRobertaTokenizer, 
    XLMRobertaModel
)
warnings.filterwarnings('ignore')
print('imported')

imported


In [5]:
# Malayalam
train_csv_mal = '/kaggle/input/misogyny/misogyny/misogyny/malayalam/train/train.csv'
val_csv_mal = '/kaggle/input/misogyny/misogyny/misogyny/malayalam/dev/dev.csv'
df_train_mal = pd.read_csv(train_csv_mal)
df_val_mal = pd.read_csv(val_csv_mal)
    
train_img_dir_mal = '/kaggle/input/misogyny/misogyny/misogyny/malayalam/train/memes'
val_img_dir_mal = '/kaggle/input/misogyny/misogyny/misogyny/malayalam/dev/memes'

test_csv_mal = '/kaggle/input/misogyny/test_with_labels_malayalam/test_with_labels.csv'
test_img_dir_mal = '/kaggle/input/misogyny/test_with_labels_malayalam/memes'
df_test_mal = pd.read_csv(test_csv_mal)

# Tamil
train_csv_tam = '/kaggle/input/misogyny/misogyny/misogyny/tamil/train/train.csv'
val_csv_tam = '/kaggle/input/misogyny/misogyny/misogyny/tamil/dev/dev.csv'
df_train_tam = pd.read_csv(train_csv_tam)
df_val_tam = pd.read_csv(val_csv_tam)
    
train_img_dir_tam = '/kaggle/input/misogyny/misogyny/misogyny/tamil/train/memes'
val_img_dir_tam = '/kaggle/input/misogyny/misogyny/misogyny/tamil/dev/memes'

test_csv_tam = '/kaggle/input/misogyny/test_with_labels_tamil/test_with_labels.csv'
test_img_dir_tam = '/kaggle/input/misogyny/test_with_labels_tamil/memes'
df_test_tam = pd.read_csv(test_csv_tam)

row_tamil = []
row_malayalam = []
headers = ['Text Model', 'Image Model', 'Language', 'Precision', 'Recall', 'F1-Score']
print('data found')

data found


In [6]:
# svm + resnet50 -> tamil
class MemeDataset(Dataset):
    def __init__(self, data, image_dir, transform=None):
        self.data = data
        self.image_dir = image_dir
        self.transform = transform or transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        img_name = f"{self.image_dir}/{self.data.iloc[idx]['image_id']}.jpg"
        image = Image.open(img_name).convert('RGB')
        image = self.transform(image)
        
        text = self.data.iloc[idx]['transcriptions']
        label = self.data.iloc[idx]['labels']
        
        return image, text, label

def extract_text_features(train_data, test_data):
    vectorizer = TfidfVectorizer(max_features=10000)
    X_train_text = vectorizer.fit_transform(train_data['transcriptions'])
    X_test_text = vectorizer.transform(test_data['transcriptions'])
    
    svm = LinearSVC(random_state=42)
    svm.fit(X_train_text, train_data['labels'])
    
    train_text_features = svm.decision_function(X_train_text)
    test_text_features = svm.decision_function(X_test_text)
    
    # Convert sparse matrices to dense NumPy arrays
    return train_text_features.reshape(-1, 1), test_text_features.reshape(-1, 1)

def extract_image_features(loader):
    resnet = torchvision.models.resnet50(pretrained=True)
    resnet = nn.Sequential(*list(resnet.children())[:-1])
    resnet.eval()
    
    all_features, all_labels = [], []
    
    with torch.no_grad():
        for images, _, labels in loader:
            features = resnet(images).view(images.size(0), -1).cpu().numpy()
            all_features.append(features)
            all_labels.extend(labels.tolist())
    
    return np.vstack(all_features), np.array(all_labels)

class MultimodalClassifier(nn.Module):
    def __init__(self, text_feature_dim, image_feature_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(text_feature_dim + image_feature_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, 2)
        )
    
    def forward(self, text_features, image_features):
        combined = torch.cat((text_features, image_features), dim=1)
        return self.fc(combined)

def train_multimodal_model(model, criterion, optimizer, text_features, image_features, labels):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # Convert data to tensors and move to device
    text_features = torch.FloatTensor(text_features).to(device)
    image_features = torch.FloatTensor(image_features).to(device)
    labels = torch.LongTensor(labels).to(device)
    
    model.train()
    for epoch in range(5):
        optimizer.zero_grad()
        outputs = model(text_features, image_features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        # Calculate training F1 score
        _, predictions = torch.max(outputs, 1)
        f1 = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(), average='binary')
        print(f"Epoch [{epoch+1}/5], Loss: {loss.item():.4f}, F1 Score: {f1:.4f}")

def evaluate_model(model, text_features, image_features, labels):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # Convert data to tensors and move to device
    text_features = torch.FloatTensor(text_features).to(device)
    image_features = torch.FloatTensor(image_features).to(device)
    labels = torch.LongTensor(labels).to(device)
    
    model.eval()
    with torch.no_grad():
        outputs = model(text_features, image_features)
        _, predictions = torch.max(outputs, 1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels.cpu().numpy(), predictions.cpu().numpy(), average='binary'
    )
    
    return precision, recall, f1

def SVM_ResNet50():
    # Load data
    train_data = pd.read_csv(train_csv_tam)
    test_data = pd.read_csv(test_csv_tam)
    
    # Prepare datasets and loaders
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    
    train_dataset = MemeDataset(train_data, train_img_dir_tam, transform)
    test_dataset = MemeDataset(test_data, test_img_dir_tam, transform)
    
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16)
    
    # Extract features
    train_text_features, test_text_features = extract_text_features(train_data, test_data)
    train_image_features, train_labels = extract_image_features(train_loader)
    test_image_features, test_labels = extract_image_features(test_loader)

    # Normalize features
    scaler = StandardScaler()
    train_text_features = scaler.fit_transform(train_text_features)
    test_text_features = scaler.transform(test_text_features)
    train_image_features = scaler.fit_transform(train_image_features)
    test_image_features = scaler.transform(test_image_features)
    
    # Train multimodal model
    model = MultimodalClassifier(
        text_feature_dim=train_text_features.shape[1],
        image_feature_dim=train_image_features.shape[1]
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    train_multimodal_model(model, criterion, optimizer, train_text_features, train_image_features, train_labels)
    
    # Evaluate model
    precision, recall, f1 = evaluate_model(model, test_text_features, test_image_features, test_labels)
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    row_tamil.append(['SVM','ResNet50','tamil',f'{precision:.4f}',f'{recall:.4f}',f'{f1:.4f}'])

SVM_ResNet50()

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 192MB/s]


Epoch [1/5], Loss: 0.8307, F1 Score: 0.3808
Epoch [2/5], Loss: 0.8045, F1 Score: 0.4081
Epoch [3/5], Loss: 0.7754, F1 Score: 0.4260
Epoch [4/5], Loss: 0.7581, F1 Score: 0.4295
Epoch [5/5], Loss: 0.7392, F1 Score: 0.4456
Precision: 0.3000, Recall: 0.9438, F1 Score: 0.4553


In [7]:
# svm + swin -> tamil
class MemeDataset(Dataset):
    def __init__(self, data, image_dir, transform=None, is_training=False):
        self.data = data
        self.image_dir = image_dir
        self.is_training = is_training
        
        # Define separate transforms for training and evaluation
        if transform is None:
            if is_training:
                self.transform = Compose([
                    Resize((384, 384)),
                    RandomHorizontalFlip(p=0.3),
                    RandomRotation(15),
                    ToTensor(),
                    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                ])
            else:
                self.transform = Compose([
                    Resize((384, 384)),
                    ToTensor(),
                    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                ])
        else:
            self.transform = transform
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        img_name = f"{self.image_dir}/{self.data.iloc[idx]['image_id']}.jpg"
        image = Image.open(img_name).convert('RGB')
        image = self.transform(image)
        
        text = str(self.data.iloc[idx]['transcriptions'])
        label = self.data.iloc[idx]['labels']
        
        return {
            'image': image,
            'text': text,
            'label': torch.tensor(label, dtype=torch.long)
        }

class MultimodalDataset:
    def __init__(self, text_features, image_features, labels):
        self.text_features = torch.FloatTensor(text_features)
        self.image_features = torch.FloatTensor(image_features)
        self.labels = torch.LongTensor(labels)
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'text': self.text_features[idx],
            'image': self.image_features[idx],
            'label': self.labels[idx]
        }

def extract_text_features(train_data, val_data, test_data):
    # Create TF-IDF vectorizer with better parameters
    vectorizer = TfidfVectorizer(
        max_features=10000,  # Increased from 5000
        ngram_range=(1, 2),  # Added bigrams
        min_df=2,  # Remove very rare terms
        max_df=0.95,  # Remove very common terms
        strip_accents='unicode',
        lowercase=True
    )
    
    # Fit on training data only
    X_train_text = vectorizer.fit_transform(train_data['transcriptions'])
    X_val_text = vectorizer.transform(val_data['transcriptions'])
    X_test_text = vectorizer.transform(test_data['transcriptions'])
    
    # Train SVM with better parameters
    svm = LinearSVC(
        C=1.0,
        class_weight='balanced',
        dual=False,
        max_iter=2000,
        random_state=42
    )
    svm.fit(X_train_text, train_data['labels'])
    
    # Get decision function scores
    train_text_features = svm.decision_function(X_train_text)
    val_text_features = svm.decision_function(X_val_text)
    test_text_features = svm.decision_function(X_test_text)
    
    return (train_text_features.reshape(-1, 1), 
            val_text_features.reshape(-1, 1), 
            test_text_features.reshape(-1, 1))

def extract_image_features(loader, swin_model, feature_extractor, device):
    features_list, labels_list = [], []
    
    swin_model.eval()
    with torch.no_grad():
        for batch in loader:
            images = batch['image'].to(device)
            labels = batch['label']
            
            # Get Swin features
            outputs = swin_model(images)
            
            # Use pooled output instead of mean of last hidden state
            features = outputs.pooler_output.cpu().numpy()
            
            features_list.append(features)
            labels_list.extend(labels.numpy())
    
    return np.vstack(features_list), np.array(labels_list)

class MultimodalClassifier(nn.Module):
    def __init__(self, text_feature_dim, image_feature_dim):
        super().__init__()
        
        # Separate feature processing
        self.text_processor = nn.Sequential(
            nn.Linear(text_feature_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        self.image_processor = nn.Sequential(
            nn.Linear(image_feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        # Fusion layers
        self.fusion = nn.Sequential(
            nn.Linear(256 + 512, 384),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(384, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 2)
        )
        
        # Batch Normalization layers
        self.bn_text = nn.BatchNorm1d(256)
        self.bn_image = nn.BatchNorm1d(512)
        self.bn_fusion = nn.BatchNorm1d(384)
    
    def forward(self, text_features, image_features):
        # Process text features
        text_features = self.text_processor(text_features)
        text_features = self.bn_text(text_features)
        
        # Process image features
        image_features = self.image_processor(image_features)
        image_features = self.bn_image(image_features)
        
        # Combine features
        combined = torch.cat((text_features, image_features), dim=1)
        
        # Final classification
        output = self.fusion(combined)
        return output

def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    for batch in dataloader:
        text_features = batch['text'].to(device)
        image_features = batch['image'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(text_features, image_features)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
    
    metrics = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )
    return total_loss / len(dataloader), metrics[2]  # Return loss and F1

def evaluate(model, dataloader, criterion, device):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0
    
    with torch.no_grad():
        for batch in dataloader:
            text_features = batch['text'].to(device)
            image_features = batch['image'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(text_features, image_features)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )
    return total_loss / len(dataloader), precision, recall, f1

def SVM_SwinTransformer(train_data, val_data, test_data, train_img_dir, val_img_dir, test_img_dir):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Create datasets with augmentation for training
    train_dataset = MemeDataset(train_data, train_img_dir, is_training=True)
    val_dataset = MemeDataset(val_data, val_img_dir, is_training=False)
    test_dataset = MemeDataset(test_data, test_img_dir, is_training=False)
    
    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)
    
    # Initialize Swin Transformer
    swin_model = SwinModel.from_pretrained("microsoft/swin-base-patch4-window12-384").to(device)
    feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/swin-base-patch4-window12-384")
    
    # Extract text features
    train_text_features, val_text_features, test_text_features = extract_text_features(
        train_data, val_data, test_data
    )
    
    # Extract image features
    train_image_features, train_labels = extract_image_features(train_loader, swin_model, feature_extractor, device)
    val_image_features, val_labels = extract_image_features(val_loader, swin_model, feature_extractor, device)
    test_image_features, test_labels = extract_image_features(test_loader, swin_model, feature_extractor, device)
    
    # Normalize features
    scaler = StandardScaler()
    train_text_features = scaler.fit_transform(train_text_features)
    val_text_features = scaler.transform(val_text_features)
    test_text_features = scaler.transform(test_text_features)
    
    scaler_img = StandardScaler()
    train_image_features = scaler_img.fit_transform(train_image_features)
    val_image_features = scaler_img.transform(val_image_features)
    test_image_features = scaler_img.transform(test_image_features)

    # Create combined datasets with processed features
    train_combined_dataset = MultimodalDataset(train_text_features, train_image_features, train_labels)
    val_combined_dataset = MultimodalDataset(val_text_features, val_image_features, val_labels)
    test_combined_dataset = MultimodalDataset(test_text_features, test_image_features, test_labels)

    # Create dataloaders for training
    train_loader = DataLoader(train_combined_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_combined_dataset, batch_size=16)
    test_loader = DataLoader(test_combined_dataset, batch_size=16)
    
    # Initialize model
    model = MultimodalClassifier(
        text_feature_dim=train_text_features.shape[1],
        image_feature_dim=train_image_features.shape[1]
    ).to(device)
    
    # Training settings
    criterion = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0]).to(device))  # Weight minority class more
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)
    
    # Training loop
    best_val_f1 = 0
    epochs = 5
    for epoch in range(epochs):
        # Train
        train_loss, train_f1 = train_epoch(model, train_loader, criterion, optimizer, device)
        
        # Validate
        val_loss, val_prec, val_recall, val_f1 = evaluate(model, val_loader, criterion, device)
        
        # Learning rate scheduling
        scheduler.step(val_f1)
        
        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train - Loss: {train_loss:.4f}, F1: {train_f1:.4f}")
        print(f"Val - Loss: {val_loss:.4f}, F1: {val_f1:.4f}")
        
        # Save best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), 'best_model.pt')
    
    # Load best model and evaluate
    model.load_state_dict(torch.load('best_model.pt'))
    test_loss, test_prec, test_recall, test_f1 = evaluate(model, test_loader, criterion, device)
    print(f"\nTest Results:")
    print(f"Precision: {test_prec:.4f}")
    print(f"Recall: {test_recall:.4f}")
    print(f"F1 Score: {test_f1:.4f}")
    row_tamil.append(['SVM','Swin','tamil',f'{test_prec:.4f}',f'{test_recall:.4f}',f'{test_f1:.4f}'])

# Run the model
SVM_SwinTransformer(df_train_tam, df_val_tam, df_test_tam, train_img_dir_tam, val_img_dir_tam, test_img_dir_tam)

config.json:   0%|          | 0.00/71.8k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/356M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

Epoch 1/5
Train - Loss: 0.7089, F1: 0.3799
Val - Loss: 0.6794, F1: 0.4831
Epoch 2/5
Train - Loss: 0.6704, F1: 0.4197
Val - Loss: 0.6583, F1: 0.4675
Epoch 3/5
Train - Loss: 0.6373, F1: 0.4770
Val - Loss: 0.6398, F1: 0.4615
Epoch 4/5
Train - Loss: 0.6120, F1: 0.5250
Val - Loss: 0.6215, F1: 0.4559
Epoch 5/5
Train - Loss: 0.5918, F1: 0.5634
Val - Loss: 0.6169, F1: 0.4706

Test Results:
Precision: 0.3020
Recall: 0.6854
F1 Score: 0.4192


In [8]:
# mBERT + resnet50 -> tamil
class MemeDataset(Dataset):
    def __init__(self, dataframe, image_dir, tokenizer, max_length, transform=None):
        self.data = dataframe
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.transform = transform if transform else Compose([
            Resize((224, 224)),
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet normalization
        ])
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = f"{self.image_dir}/{row['image_id']}.jpg"
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)
        
        encoding = self.tokenizer(
            str(row['transcriptions']),  # Convert to string to handle potential non-string inputs
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            'image': image,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(row['labels'], dtype=torch.long)
        }

class MultimodalClassifier(nn.Module):
    def __init__(self, text_model, image_model, num_classes=2):
        super().__init__()
        self.text_model = text_model
        self.image_model = image_model
        
        # Get feature dimensions
        self.text_feature_dim = text_model.config.hidden_size  # Usually 768 for BERT
        self.image_feature_dim = 2048  # ResNet50 feature dimension
        
        # Fusion layers
        self.fusion = nn.Sequential(
            nn.Linear(self.text_feature_dim + self.image_feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, input_ids, attention_mask, images):
        # Text features
        text_outputs = self.text_model(input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state[:, 0, :]  # Use [CLS] token
        
        # Image features
        image_features = self.image_model(images)
        
        # Concatenate and classify
        combined = torch.cat((text_features, image_features), dim=1)
        return self.fusion(combined)

def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        images = batch['image'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, images)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
    
    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )
    
    return total_loss / len(dataloader), precision, recall, f1

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask, images)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )
    
    return total_loss / len(dataloader), precision, recall, f1

def mBERT_ResNet50(df_train, df_val, df_test, train_img_dir, val_img_dir, test_img_dir):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Initialize models
    tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
    text_model = BertModel.from_pretrained("bert-base-multilingual-cased")
    
    image_model = resnet50(pretrained=True)
    image_model.fc = nn.Identity()  # Remove classification head
    
    # Create datasets and dataloaders
    train_dataset = MemeDataset(df_train, train_img_dir, tokenizer, max_length=128)
    val_dataset = MemeDataset(df_val, val_img_dir, tokenizer, max_length=128)
    test_dataset = MemeDataset(df_test, test_img_dir, tokenizer, max_length=128)
    
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)
    
    # Initialize multimodal model
    model = MultimodalClassifier(text_model, image_model)
    model = model.to(device)
    
    # Training settings
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    num_epochs = 5
    
    # Training loop
    best_val_f1 = 0
    for epoch in range(num_epochs):
        # Train
        train_loss, train_prec, train_recall, train_f1 = train_epoch(
            model, train_loader, criterion, optimizer, device
        )
        
        # Validate
        val_loss, val_prec, val_recall, val_f1 = evaluate(
            model, val_loader, criterion, device
        )
        
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train - Loss: {train_loss:.4f}, F1: {train_f1:.4f}")
        print(f"Val - Loss: {val_loss:.4f}, F1: {val_f1:.4f}")
        
        # Save best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), 'best_model.pt')
    
    # Load best model and evaluate on test set
    model.load_state_dict(torch.load('best_model.pt'))
    test_loss, test_prec, test_recall, test_f1 = evaluate(
        model, test_loader, criterion, device
    )
    print(f"\nTest Results:")
    print(f"Precision: {test_prec:.4f}")
    print(f"Recall: {test_recall:.4f}")
    print(f"F1 Score: {test_f1:.4f}")
    row_tamil.append(['mBERT','ResNet50','tamil',f'{test_prec:.4f}',f'{test_recall:.4f}',f'{test_f1:.4f}'])

# Run the model
mBERT_ResNet50(df_train_tam, df_val_tam, df_test_tam, train_img_dir_tam, val_img_dir_tam, test_img_dir_tam)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Epoch 1/5
Train - Loss: 0.5348, F1: 0.0138
Val - Loss: 0.4775, F1: 0.0000
Epoch 2/5
Train - Loss: 0.3697, F1: 0.5169
Val - Loss: 0.4105, F1: 0.5000
Epoch 3/5
Train - Loss: 0.1747, F1: 0.9049
Val - Loss: 0.5002, F1: 0.5484
Epoch 4/5
Train - Loss: 0.0678, F1: 0.9843
Val - Loss: 0.5311, F1: 0.5512
Epoch 5/5
Train - Loss: 0.0346, F1: 0.9877
Val - Loss: 0.5590, F1: 0.6184

Test Results:
Precision: 0.6596
Recall: 0.6966
F1 Score: 0.6776


In [9]:
# mBERT + swin -> tamil
class MemeDataset(Dataset):
    def __init__(self, dataframe, image_dir, tokenizer, max_length, transform=None):
        self.data = dataframe
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.transform = transform if transform else Compose([
            Resize((224, 224)),
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = f"{self.image_dir}/{row['image_id']}.jpg"
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)
        
        encoding = self.tokenizer(
            str(row['transcriptions']),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            'image': image,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(row['labels'], dtype=torch.long)
        }

class MultimodalClassifier(nn.Module):
    def __init__(self, text_model, image_model, num_classes=2):
        super().__init__()
        self.text_model = text_model
        self.image_model = image_model
        
        # Get feature dimensions
        self.text_feature_dim = text_model.config.hidden_size  # 768 for BERT
        self.image_feature_dim = image_model.config.hidden_size  # 768 for Swin-Base
        
        # Fusion layers
        self.fusion = nn.Sequential(
            nn.Linear(self.text_feature_dim + self.image_feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, input_ids, attention_mask, images):
        # Text features
        text_outputs = self.text_model(input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state[:, 0, :]  # Use [CLS] token
        
        # Image features
        image_outputs = self.image_model(images)
        image_features = image_outputs.pooler_output  # Use pooled output from Swin
        
        # Concatenate and classify
        combined = torch.cat((text_features, image_features), dim=1)
        return self.fusion(combined)

def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        images = batch['image'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, images)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
    
    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )
    
    return total_loss / len(dataloader), precision, recall, f1

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask, images)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )
    
    return total_loss / len(dataloader), precision, recall, f1

def mBERT_SwinTransformer(df_train, df_val, df_test, train_img_dir, val_img_dir, test_img_dir):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Initialize models
    tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
    text_model = BertModel.from_pretrained("bert-base-multilingual-cased")
    
    # Initialize Swin Transformer
    image_model = SwinModel.from_pretrained("microsoft/swin-base-patch4-window7-224-in22k")
    
    # Create datasets and dataloaders
    train_dataset = MemeDataset(df_train, train_img_dir, tokenizer, max_length=128)
    val_dataset = MemeDataset(df_val, val_img_dir, tokenizer, max_length=128)
    test_dataset = MemeDataset(df_test, test_img_dir, tokenizer, max_length=128)
    
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)
    
    # Initialize multimodal model
    model = MultimodalClassifier(text_model, image_model)
    model = model.to(device)
    
    # Training settings
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    num_epochs = 5
    
    # Training loop
    best_val_f1 = 0
    for epoch in range(num_epochs):
        # Train
        train_loss, train_prec, train_recall, train_f1 = train_epoch(
            model, train_loader, criterion, optimizer, device
        )
        
        # Validate
        val_loss, val_prec, val_recall, val_f1 = evaluate(
            model, val_loader, criterion, device
        )
        
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train - Loss: {train_loss:.4f}, F1: {train_f1:.4f}")
        print(f"Val - Loss: {val_loss:.4f}, F1: {val_f1:.4f}")
        
        # Save best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), 'best_model.pt')
    
    # Load best model and evaluate on test set
    model.load_state_dict(torch.load('best_model.pt'))
    test_loss, test_prec, test_recall, test_f1 = evaluate(
        model, test_loader, criterion, device
    )
    print(f"\nTest Results:")
    print(f"Precision: {test_prec:.4f}")
    print(f"Recall: {test_recall:.4f}")
    print(f"F1 Score: {test_f1:.4f}")
    row_tamil.append(['mBERT','Swin','tamil',f'{test_prec:.4f}',f'{test_recall:.4f}',f'{test_f1:.4f}'])

# Run the model
mBERT_SwinTransformer(df_train_tam, df_val_tam, df_test_tam, train_img_dir_tam, val_img_dir_tam, test_img_dir_tam)

config.json:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/437M [00:00<?, ?B/s]

Epoch 1/5
Train - Loss: 0.5049, F1: 0.1039
Val - Loss: 0.4569, F1: 0.5426
Epoch 2/5
Train - Loss: 0.3562, F1: 0.6160
Val - Loss: 0.4611, F1: 0.4952
Epoch 3/5
Train - Loss: 0.2282, F1: 0.8216
Val - Loss: 0.4142, F1: 0.6259
Epoch 4/5
Train - Loss: 0.1326, F1: 0.9184
Val - Loss: 0.4825, F1: 0.6522
Epoch 5/5
Train - Loss: 0.0644, F1: 0.9665
Val - Loss: 0.5691, F1: 0.6560

Test Results:
Precision: 0.7463
Recall: 0.5618
F1 Score: 0.6410


In [10]:
print(f"\nComparison Results for Tamil:")
print(tabulate(row_tamil, headers=headers, tablefmt='grid'))

results_tamil = pd.DataFrame(row_tamil, columns=headers)
results_tamil.to_csv(f'Comparison_Results_Tamil.csv', index=False)
print(f"\nResults saved to 'Comparison_Results_for_Tamil.csv'")


Comparison Results for Tamil:
+--------------+---------------+------------+-------------+----------+------------+
| Text Model   | Image Model   | Language   |   Precision |   Recall |   F1-Score |
| SVM          | ResNet50      | tamil      |      0.3    |   0.9438 |     0.4553 |
+--------------+---------------+------------+-------------+----------+------------+
| SVM          | Swin          | tamil      |      0.302  |   0.6854 |     0.4192 |
+--------------+---------------+------------+-------------+----------+------------+
| mBERT        | ResNet50      | tamil      |      0.6596 |   0.6966 |     0.6776 |
+--------------+---------------+------------+-------------+----------+------------+
| mBERT        | Swin          | tamil      |      0.7463 |   0.5618 |     0.641  |
+--------------+---------------+------------+-------------+----------+------------+

Results saved to 'Comparison_Results_for_Tamil.csv'


In [11]:
# xlmr + vgg16 -> mal
class MemeDataset(Dataset):
    def __init__(self, dataframe, image_dir, tokenizer, max_length, transform=None):
        self.data = dataframe
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.transform = transform if transform else Compose([
            Resize((224, 224)),
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = f"{self.image_dir}/{row['image_id']}.jpg"
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)
        
        encoding = self.tokenizer(
            str(row['transcriptions']),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            'image': image,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(row['labels'], dtype=torch.long)
        }

class MultimodalClassifier(nn.Module):
    def __init__(self, text_model, image_model, num_classes=2):
        super().__init__()
        self.text_model = text_model
        self.image_model = image_model
        
        # Get feature dimensions
        self.text_feature_dim = text_model.config.hidden_size  # 768 for XLM-R base, 1024 for large
        self.image_feature_dim = 4096  # VGG16 feature dimension after last FC layer
        
        # Fusion layers
        self.fusion = nn.Sequential(
            nn.Linear(self.text_feature_dim + self.image_feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, input_ids, attention_mask, images):
        # Text features
        text_outputs = self.text_model(input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state[:, 0, :]  # Use [CLS] token
        
        # Image features
        image_features = self.image_model(images)
        
        # Concatenate and classify
        combined = torch.cat((text_features, image_features), dim=1)
        return self.fusion(combined)

def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        images = batch['image'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, images)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )
    
    return total_loss / len(dataloader), precision, recall, f1

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask, images)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )
    
    return total_loss / len(dataloader), precision, recall, f1

def XLMR_VGG16(df_train, df_val, df_test, train_img_dir, val_img_dir, test_img_dir):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Initialize models
    tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
    text_model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
    
    # Load VGG16 and modify it
    image_model = vgg16(pretrained=True)
    image_model.classifier = nn.Sequential(*list(image_model.classifier.children())[:-1])  # Remove last FC layer
    
    # Create datasets and dataloaders
    train_dataset = MemeDataset(df_train, train_img_dir, tokenizer, max_length=128)
    val_dataset = MemeDataset(df_val, val_img_dir, tokenizer, max_length=128)
    test_dataset = MemeDataset(df_test, test_img_dir, tokenizer, max_length=128)
    
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)
    
    # Initialize multimodal model
    model = MultimodalClassifier(text_model, image_model)
    model = model.to(device)
    
    # Training settings
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    num_epochs = 5
    
    # Training loop
    best_val_f1 = 0
    for epoch in range(num_epochs):
        # Train
        train_loss, train_prec, train_recall, train_f1 = train_epoch(
            model, train_loader, criterion, optimizer, device
        )
        
        # Validate
        val_loss, val_prec, val_recall, val_f1 = evaluate(
            model, val_loader, criterion, device
        )
        
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train - Loss: {train_loss:.4f}, F1: {train_f1:.4f}")
        print(f"Val - Loss: {val_loss:.4f}, F1: {val_f1:.4f}")
        
        # Save best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), 'best_model.pt')
    
    # Load best model and evaluate on test set
    model.load_state_dict(torch.load('best_model.pt'))
    test_loss, test_prec, test_recall, test_f1 = evaluate(
        model, test_loader, criterion, device
    )
    print(f"\nTest Results:")
    print(f"Precision: {test_prec:.4f}")
    print(f"Recall: {test_recall:.4f}")
    print(f"F1 Score: {test_f1:.4f}")
    row_malayalam.append(['XLM-R','VGG16','malayalam',f'{test_prec:.4f}',f'{test_recall:.4f}',f'{test_f1:.4f}'])

# Run the model
XLMR_VGG16(df_train_mal, df_val_mal, df_test_mal, train_img_dir_mal, val_img_dir_mal, test_img_dir_mal)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:02<00:00, 223MB/s] 


Epoch 1/5
Train - Loss: 0.6717, F1: 0.2443
Val - Loss: 0.6189, F1: 0.6526
Epoch 2/5
Train - Loss: 0.5635, F1: 0.5564
Val - Loss: 0.4541, F1: 0.7520
Epoch 3/5
Train - Loss: 0.3755, F1: 0.7683
Val - Loss: 0.5807, F1: 0.6879
Epoch 4/5
Train - Loss: 0.2545, F1: 0.8630
Val - Loss: 0.5191, F1: 0.7727
Epoch 5/5
Train - Loss: 0.1322, F1: 0.9486
Val - Loss: 0.3736, F1: 0.8333

Test Results:
Precision: 0.9016
Recall: 0.7051
F1 Score: 0.7914


In [12]:
# xlmr + swin -> malayalam
class MemeDataset(Dataset):
    def __init__(self, dataframe, image_dir, tokenizer, max_length, transform=None):
        self.data = dataframe
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.transform = transform if transform else Compose([
            Resize((224, 224)),
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = f"{self.image_dir}/{row['image_id']}.jpg"
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)
        
        encoding = self.tokenizer(
            str(row['transcriptions']),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            'image': image,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(row['labels'], dtype=torch.long)
        }

class MultimodalClassifier(nn.Module):
    def __init__(self, text_model, image_model, num_classes=2):
        super().__init__()
        self.text_model = text_model
        self.image_model = image_model
        
        # Get feature dimensions
        self.text_feature_dim = text_model.config.hidden_size  # 768 for XLM-R base, 1024 for large
        self.image_feature_dim = image_model.config.hidden_size  # 768 for Swin-Base
        
        # Fusion layers
        self.fusion = nn.Sequential(
            nn.Linear(self.text_feature_dim + self.image_feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, input_ids, attention_mask, images):
        # Text features
        text_outputs = self.text_model(input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state[:, 0, :]  # Use [CLS] token
        
        # Image features
        image_outputs = self.image_model(images)
        image_features = image_outputs.pooler_output  # Use pooled output from Swin
        
        # Concatenate and classify
        combined = torch.cat((text_features, image_features), dim=1)
        return self.fusion(combined)

def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        images = batch['image'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, images)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
    
    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )
    
    return total_loss / len(dataloader), precision, recall, f1

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask, images)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )
    
    return total_loss / len(dataloader), precision, recall, f1

def XLMR_SwinTransformer(df_train, df_val, df_test, train_img_dir, val_img_dir, test_img_dir):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Initialize models
    tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
    text_model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
    
    # Initialize Swin Transformer
    image_model = SwinModel.from_pretrained("microsoft/swin-base-patch4-window7-224-in22k")
    
    # Create datasets and dataloaders
    train_dataset = MemeDataset(df_train, train_img_dir, tokenizer, max_length=128)
    val_dataset = MemeDataset(df_val, val_img_dir, tokenizer, max_length=128)
    test_dataset = MemeDataset(df_test, test_img_dir, tokenizer, max_length=128)
    
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)
    
    # Initialize multimodal model
    model = MultimodalClassifier(text_model, image_model)
    model = model.to(device)
    
    # Training settings
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    num_epochs = 5
    
    # Training loop
    best_val_f1 = 0
    for epoch in range(num_epochs):
        # Train
        train_loss, train_prec, train_recall, train_f1 = train_epoch(
            model, train_loader, criterion, optimizer, device
        )
        
        # Validate
        val_loss, val_prec, val_recall, val_f1 = evaluate(
            model, val_loader, criterion, device
        )
        
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train - Loss: {train_loss:.4f}, F1: {train_f1:.4f}")
        print(f"Val - Loss: {val_loss:.4f}, F1: {val_f1:.4f}")
        
        # Save best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), 'best_model.pt')
    
    # Load best model and evaluate on test set
    model.load_state_dict(torch.load('best_model.pt'))
    test_loss, test_prec, test_recall, test_f1 = evaluate(
        model, test_loader, criterion, device
    )
    print(f"\nTest Results:")
    print(f"Precision: {test_prec:.4f}")
    print(f"Recall: {test_recall:.4f}")
    print(f"F1 Score: {test_f1:.4f}")
    row_malayalam.append(['XLM-R','Swin','malayalam',f'{test_prec:.4f}',f'{test_recall:.4f}',f'{test_f1:.4f}'])

# Run the model
XLMR_SwinTransformer(df_train_mal, df_val_mal, df_test_mal, train_img_dir_mal, val_img_dir_mal, test_img_dir_mal)

Epoch 1/5
Train - Loss: 0.6150, F1: 0.6048
Val - Loss: 0.4542, F1: 0.8000
Epoch 2/5
Train - Loss: 0.3687, F1: 0.8255
Val - Loss: 0.3186, F1: 0.8033
Epoch 3/5
Train - Loss: 0.2037, F1: 0.9228
Val - Loss: 0.3367, F1: 0.8136
Epoch 4/5
Train - Loss: 0.1724, F1: 0.9325
Val - Loss: 0.4361, F1: 0.8358
Epoch 5/5
Train - Loss: 0.0453, F1: 0.9884
Val - Loss: 0.4610, F1: 0.8254

Test Results:
Precision: 0.8312
Recall: 0.8205
F1 Score: 0.8258


In [13]:
# svm + swin -> malayalam
class MemeDataset(Dataset):
    def __init__(self, data, image_dir, transform=None, is_training=False):
        self.data = data
        self.image_dir = image_dir
        self.is_training = is_training
        
        # Define separate transforms for training and evaluation
        if transform is None:
            if is_training:
                self.transform = Compose([
                    Resize((384, 384)),
                    RandomHorizontalFlip(p=0.3),
                    RandomRotation(15),
                    ToTensor(),
                    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                ])
            else:
                self.transform = Compose([
                    Resize((384, 384)),
                    ToTensor(),
                    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                ])
        else:
            self.transform = transform
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        img_name = f"{self.image_dir}/{self.data.iloc[idx]['image_id']}.jpg"
        image = Image.open(img_name).convert('RGB')
        image = self.transform(image)
        
        text = str(self.data.iloc[idx]['transcriptions'])
        label = self.data.iloc[idx]['labels']
        
        return {
            'image': image,
            'text': text,
            'label': torch.tensor(label, dtype=torch.long)
        }

class MultimodalDataset:
    def __init__(self, text_features, image_features, labels):
        self.text_features = torch.FloatTensor(text_features)
        self.image_features = torch.FloatTensor(image_features)
        self.labels = torch.LongTensor(labels)
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'text': self.text_features[idx],
            'image': self.image_features[idx],
            'label': self.labels[idx]
        }

def extract_text_features(train_data, val_data, test_data):
    # Create TF-IDF vectorizer with better parameters
    vectorizer = TfidfVectorizer(
        max_features=10000,  # Increased from 5000
        ngram_range=(1, 2),  # Added bigrams
        min_df=2,  # Remove very rare terms
        max_df=0.95,  # Remove very common terms
        strip_accents='unicode',
        lowercase=True
    )
    
    # Fit on training data only
    X_train_text = vectorizer.fit_transform(train_data['transcriptions'])
    X_val_text = vectorizer.transform(val_data['transcriptions'])
    X_test_text = vectorizer.transform(test_data['transcriptions'])
    
    # Train SVM with better parameters
    svm = LinearSVC(
        C=1.0,
        class_weight='balanced',
        dual=False,
        max_iter=2000,
        random_state=42
    )
    svm.fit(X_train_text, train_data['labels'])
    
    # Get decision function scores
    train_text_features = svm.decision_function(X_train_text)
    val_text_features = svm.decision_function(X_val_text)
    test_text_features = svm.decision_function(X_test_text)
    
    return (train_text_features.reshape(-1, 1), 
            val_text_features.reshape(-1, 1), 
            test_text_features.reshape(-1, 1))

def extract_image_features(loader, swin_model, feature_extractor, device):
    features_list, labels_list = [], []
    
    swin_model.eval()
    with torch.no_grad():
        for batch in loader:
            images = batch['image'].to(device)
            labels = batch['label']
            
            # Get Swin features
            outputs = swin_model(images)
            
            # Use pooled output instead of mean of last hidden state
            features = outputs.pooler_output.cpu().numpy()
            
            features_list.append(features)
            labels_list.extend(labels.numpy())
    
    return np.vstack(features_list), np.array(labels_list)

class MultimodalClassifier(nn.Module):
    def __init__(self, text_feature_dim, image_feature_dim):
        super().__init__()
        
        # Separate feature processing
        self.text_processor = nn.Sequential(
            nn.Linear(text_feature_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        self.image_processor = nn.Sequential(
            nn.Linear(image_feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        # Fusion layers
        self.fusion = nn.Sequential(
            nn.Linear(256 + 512, 384),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(384, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 2)
        )
        
        # Batch Normalization layers
        self.bn_text = nn.BatchNorm1d(256)
        self.bn_image = nn.BatchNorm1d(512)
        self.bn_fusion = nn.BatchNorm1d(384)
    
    def forward(self, text_features, image_features):
        # Process text features
        text_features = self.text_processor(text_features)
        text_features = self.bn_text(text_features)
        
        # Process image features
        image_features = self.image_processor(image_features)
        image_features = self.bn_image(image_features)
        
        # Combine features
        combined = torch.cat((text_features, image_features), dim=1)
        
        # Final classification
        output = self.fusion(combined)
        return output

def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    for batch in dataloader:
        text_features = batch['text'].to(device)
        image_features = batch['image'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(text_features, image_features)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
    
    metrics = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )
    return total_loss / len(dataloader), metrics[2]  # Return loss and F1

def evaluate(model, dataloader, criterion, device):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0
    
    with torch.no_grad():
        for batch in dataloader:
            text_features = batch['text'].to(device)
            image_features = batch['image'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(text_features, image_features)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )
    return total_loss / len(dataloader), precision, recall, f1

def SVM_SwinTransformer(train_data, val_data, test_data, train_img_dir, val_img_dir, test_img_dir):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Create datasets with augmentation for training
    train_dataset = MemeDataset(train_data, train_img_dir, is_training=True)
    val_dataset = MemeDataset(val_data, val_img_dir, is_training=False)
    test_dataset = MemeDataset(test_data, test_img_dir, is_training=False)
    
    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)
    
    # Initialize Swin Transformer
    swin_model = SwinModel.from_pretrained("microsoft/swin-base-patch4-window12-384").to(device)
    feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/swin-base-patch4-window12-384")
    
    # Extract text features
    train_text_features, val_text_features, test_text_features = extract_text_features(
        train_data, val_data, test_data
    )
    
    # Extract image features
    train_image_features, train_labels = extract_image_features(train_loader, swin_model, feature_extractor, device)
    val_image_features, val_labels = extract_image_features(val_loader, swin_model, feature_extractor, device)
    test_image_features, test_labels = extract_image_features(test_loader, swin_model, feature_extractor, device)
    
    # Normalize features
    scaler = StandardScaler()
    train_text_features = scaler.fit_transform(train_text_features)
    val_text_features = scaler.transform(val_text_features)
    test_text_features = scaler.transform(test_text_features)
    
    scaler_img = StandardScaler()
    train_image_features = scaler_img.fit_transform(train_image_features)
    val_image_features = scaler_img.transform(val_image_features)
    test_image_features = scaler_img.transform(test_image_features)

    # Create combined datasets with processed features
    train_combined_dataset = MultimodalDataset(train_text_features, train_image_features, train_labels)
    val_combined_dataset = MultimodalDataset(val_text_features, val_image_features, val_labels)
    test_combined_dataset = MultimodalDataset(test_text_features, test_image_features, test_labels)

    # Create dataloaders for training
    train_loader = DataLoader(train_combined_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_combined_dataset, batch_size=16)
    test_loader = DataLoader(test_combined_dataset, batch_size=16)
    
    # Initialize model
    model = MultimodalClassifier(
        text_feature_dim=train_text_features.shape[1],
        image_feature_dim=train_image_features.shape[1]
    ).to(device)
    
    # Training settings
    criterion = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0]).to(device))  # Weight minority class more
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)
    
    # Training loop
    best_val_f1 = 0
    epochs = 5
    for epoch in range(epochs):
        # Train
        train_loss, train_f1 = train_epoch(model, train_loader, criterion, optimizer, device)
        
        # Validate
        val_loss, val_prec, val_recall, val_f1 = evaluate(model, val_loader, criterion, device)
        
        # Learning rate scheduling
        scheduler.step(val_f1)
        
        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train - Loss: {train_loss:.4f}, F1: {train_f1:.4f}")
        print(f"Val - Loss: {val_loss:.4f}, F1: {val_f1:.4f}")
        
        # Save best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), 'best_model.pt')
    
    # Load best model and evaluate
    model.load_state_dict(torch.load('best_model.pt'))
    test_loss, test_prec, test_recall, test_f1 = evaluate(model, test_loader, criterion, device)
    print(f"\nTest Results:")
    print(f"Precision: {test_prec:.4f}")
    print(f"Recall: {test_recall:.4f}")
    print(f"F1 Score: {test_f1:.4f}")
    row_malayalam.append(['SVM','Swin','malayalam',f'{test_prec:.4f}',f'{test_recall:.4f}',f'{test_f1:.4f}'])

# Run the model
SVM_SwinTransformer(df_train_mal, df_val_mal, df_test_mal, train_img_dir_mal, val_img_dir_mal, test_img_dir_mal)

Epoch 1/5
Train - Loss: 0.6804, F1: 0.5179
Val - Loss: 0.6561, F1: 0.6292
Epoch 2/5
Train - Loss: 0.6520, F1: 0.6046
Val - Loss: 0.6276, F1: 0.6630
Epoch 3/5
Train - Loss: 0.6289, F1: 0.6677
Val - Loss: 0.5942, F1: 0.6667
Epoch 4/5
Train - Loss: 0.5992, F1: 0.6897
Val - Loss: 0.5601, F1: 0.7219
Epoch 5/5
Train - Loss: 0.5766, F1: 0.6959
Val - Loss: 0.5275, F1: 0.7595

Test Results:
Precision: 0.6239
Recall: 0.8718
F1 Score: 0.7273


In [14]:
# svm + vgg16 -> malayalam
class MemeDataset(Dataset):
    def __init__(self, data, image_dir, transform=None):
        self.data = data
        self.image_dir = image_dir
        self.transform = transform or transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        img_name = f"{self.image_dir}/{self.data.iloc[idx]['image_id']}.jpg"
        image = Image.open(img_name).convert('RGB')
        image = self.transform(image)
        
        text = self.data.iloc[idx]['transcriptions']
        label = self.data.iloc[idx]['labels']
        
        return image, text, label

def extract_text_features(train_data, test_data):
    vectorizer = TfidfVectorizer(max_features=10000)
    X_train_text = vectorizer.fit_transform(train_data['transcriptions'])
    X_test_text = vectorizer.transform(test_data['transcriptions'])
    
    svm = LinearSVC(random_state=42)
    svm.fit(X_train_text, train_data['labels'])
    
    train_text_features = svm.decision_function(X_train_text)
    test_text_features = svm.decision_function(X_test_text)
    
    # Convert sparse matrices to dense NumPy arrays
    return train_text_features.reshape(-1, 1), test_text_features.reshape(-1, 1)

def extract_image_features(loader):
    # Load VGG16 model and remove the last layer
    vgg16 = torchvision.models.vgg16(pretrained=True)
    vgg16.classifier = nn.Sequential(*list(vgg16.classifier.children())[:-1])  # Remove last layer
    vgg16.eval()
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vgg16 = vgg16.to(device)
    
    all_features, all_labels = [], []
    
    with torch.no_grad():
        for images, _, labels in loader:
            images = images.to(device)
            features = vgg16(images).cpu().numpy()
            all_features.append(features)
            all_labels.extend(labels.tolist())
    
    return np.vstack(all_features), np.array(all_labels)

class MultimodalClassifier(nn.Module):
    def __init__(self, text_feature_dim, image_feature_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(text_feature_dim + image_feature_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, 2)
        )
    
    def forward(self, text_features, image_features):
        combined = torch.cat((text_features, image_features), dim=1)
        return self.fc(combined)

def train_multimodal_model(model, criterion, optimizer, text_features, image_features, labels):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # Convert data to tensors and move to device
    text_features = torch.FloatTensor(text_features).to(device)
    image_features = torch.FloatTensor(image_features).to(device)
    labels = torch.LongTensor(labels).to(device)
    
    model.train()
    for epoch in range(5):
        optimizer.zero_grad()
        outputs = model(text_features, image_features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        # Calculate training F1 score
        _, predictions = torch.max(outputs, 1)
        f1 = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(), average='binary')
        print(f"Epoch [{epoch+1}/5], Loss: {loss.item():.4f}, F1 Score: {f1:.4f}")

def evaluate_model(model, text_features, image_features, labels):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # Convert data to tensors and move to device
    text_features = torch.FloatTensor(text_features).to(device)
    image_features = torch.FloatTensor(image_features).to(device)
    labels = torch.LongTensor(labels).to(device)
    
    model.eval()
    with torch.no_grad():
        outputs = model(text_features, image_features)
        _, predictions = torch.max(outputs, 1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels.cpu().numpy(), predictions.cpu().numpy(), average='binary'
    )
    
    return precision, recall, f1

def SVM_VGG16():
    # Load data
    train_data = pd.read_csv(train_csv_mal)
    test_data = pd.read_csv(test_csv_mal)
    
    # Prepare datasets and loaders
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    
    train_dataset = MemeDataset(train_data, train_img_dir_mal, transform)
    test_dataset = MemeDataset(test_data, test_img_dir_mal, transform)
    
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16)
    
    # Extract features
    train_text_features, test_text_features = extract_text_features(train_data, test_data)
    train_image_features, train_labels = extract_image_features(train_loader)
    test_image_features, test_labels = extract_image_features(test_loader)

    # Normalize features
    scaler = StandardScaler()
    train_text_features = scaler.fit_transform(train_text_features)
    test_text_features = scaler.transform(test_text_features)
    train_image_features = scaler.fit_transform(train_image_features)
    test_image_features = scaler.transform(test_image_features)
    
    # Train multimodal model
    model = MultimodalClassifier(
        text_feature_dim=train_text_features.shape[1],
        image_feature_dim=train_image_features.shape[1]
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    train_multimodal_model(model, criterion, optimizer, train_text_features, train_image_features, train_labels)
    
    # Evaluate model
    precision, recall, f1 = evaluate_model(model, test_text_features, test_image_features, test_labels)
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    row_malayalam.append(['SVM','VGG16','malayalam',f'{precision:.4f}',f'{recall:.4f}',f'{f1:.4f}'])

SVM_VGG16()

Epoch [1/5], Loss: 0.7094, F1 Score: 0.4641
Epoch [2/5], Loss: 0.6697, F1 Score: 0.5593
Epoch [3/5], Loss: 0.6460, F1 Score: 0.5997
Epoch [4/5], Loss: 0.5934, F1 Score: 0.6607
Epoch [5/5], Loss: 0.5720, F1 Score: 0.7085
Precision: 0.6477, Recall: 0.7308, F1 Score: 0.6867


In [17]:
print(f"\nComparison Results for Malayalam:")
print(tabulate(row_malayalam, headers=headers, tablefmt='grid'))

results_malayalam = pd.DataFrame(row_malayalam, columns=headers)
results_malayalam.to_csv(f'Comparison_Results_Malayalam.csv', index=False)
print(f"\nResults saved to 'Comparison_Results_for_Malayalam.csv'")


Comparison Results for Malayalam:
+--------------+---------------+------------+-------------+----------+------------+
| Text Model   | Image Model   | Language   |   Precision |   Recall |   F1-Score |
| XLM-R        | VGG16         | malayalam  |      0.9016 |   0.7051 |     0.7914 |
+--------------+---------------+------------+-------------+----------+------------+
| XLM-R        | Swin          | malayalam  |      0.8312 |   0.8205 |     0.8258 |
+--------------+---------------+------------+-------------+----------+------------+
| SVM          | Swin          | malayalam  |      0.6239 |   0.8718 |     0.7273 |
+--------------+---------------+------------+-------------+----------+------------+
| SVM          | VGG16         | malayalam  |      0.6477 |   0.7308 |     0.6867 |
+--------------+---------------+------------+-------------+----------+------------+

Results saved to 'Comparison_Results_for_Malayalam.csv'


In [18]:
combined_rows = row_tamil + row_malayalam
print(f"\nComparison Results for both Tamil & Malayalam:")
print(tabulate(combined_rows, headers=headers, tablefmt='grid'))

combined_results = pd.DataFrame(combined_rows, columns=headers)
combined_results.to_csv(f'Comparison_Results_Tamil_&_Malayalam.csv', index=False)
print(f"\nResults saved to 'Comparison_Results_for_Tamil_&_Malayalam.csv'")


Comparison Results for both Tamil & Malayalam:
+--------------+---------------+------------+-------------+----------+------------+
| Text Model   | Image Model   | Language   |   Precision |   Recall |   F1-Score |
| SVM          | ResNet50      | tamil      |      0.3    |   0.9438 |     0.4553 |
+--------------+---------------+------------+-------------+----------+------------+
| SVM          | Swin          | tamil      |      0.302  |   0.6854 |     0.4192 |
+--------------+---------------+------------+-------------+----------+------------+
| mBERT        | ResNet50      | tamil      |      0.6596 |   0.6966 |     0.6776 |
+--------------+---------------+------------+-------------+----------+------------+
| mBERT        | Swin          | tamil      |      0.7463 |   0.5618 |     0.641  |
+--------------+---------------+------------+-------------+----------+------------+
| XLM-R        | VGG16         | malayalam  |      0.9016 |   0.7051 |     0.7914 |
+--------------+------------