<a href="https://colab.research.google.com/github/MubasshirNaib/Misogyny-Meme-Detection/blob/main/Misogyny_Meme_Detection_ChallengeFinalCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Cell 1: Install Libraries and Mount Drive
# Install required libraries
!pip install torch torchvision transformers pandas numpy scikit-learn pillow -q

# Import libraries
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
import zipfile

# Mount Google Drive so files under /content/drive become reachable
from google.colab import drive
drive.mount('/content/drive')

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Mounted at /content/drive
Using device: cuda


In [None]:
#Cell 2: Dataset and Model Definition
# Custom Dataset Class
class MemeDataset(Dataset):
    """Labelled meme dataset pairing an image with its transcription text.

    Three columns of the CSV are read: ``image_name`` (file name joined onto
    ``image_dir``), ``transcriptions`` (text handed to the tokenizer) and
    ``labels`` (``"Misogyny"`` or ``"Not-Misogyny"``).

    Args:
        csv_file: Path of the annotation CSV, loaded with ``pd.read_csv``.
        image_dir: Directory containing the image files.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt", ...)``;
            its result is indexed with ``"input_ids"`` and ``"attention_mask"``.
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, csv_file, image_dir, tokenizer, transform=None):
        self.data = pd.read_csv(csv_file)
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.transform = transform
        self.label_map = {"Misogyny": 1, "Not-Misogyny": 0}  # class name -> integer target

    def __len__(self):
        """Return the number of rows in the annotation CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build one sample.

        Returns:
            dict with keys ``"image"`` (transformed image), ``"input_ids"`` and
            ``"attention_mask"`` (tokenizer outputs without the leading batch
            axis) and ``"label"`` (int taken from ``label_map``).

        Raises:
            KeyError: If the row's ``labels`` value is not in ``label_map``.
        """
        row = self.data.iloc[idx]  # look the row up once instead of once per column

        img_path = os.path.join(self.image_dir, row["image_name"])
        # Close the file handle deterministically instead of waiting for garbage
        # collection; convert() produces a new image that stays usable afterwards.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        if self.transform:
            image = self.transform(image)

        text = str(row["transcriptions"])
        encoding = self.tokenizer(text, return_tensors="pt", padding="max_length", max_length=128, truncation=True)
        # squeeze(0) drops only the leading batch axis; a bare squeeze() would
        # also collapse any other axis that happens to have size 1.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        label = self.label_map[row["labels"]]
        return {"image": image, "input_ids": input_ids, "attention_mask": attention_mask, "label": label}

# Multimodal Model
class MultimodalClassifier(nn.Module):
    """Image + text classifier that concatenates both branches and emits 2 logits.

    The image branch is a ResNet-50 whose final ``fc`` layer is replaced by an
    identity, followed by a 2048 -> 512 projection. The text branch is
    ``bert-base-chinese`` whose ``pooler_output`` goes through a 768 -> 512
    projection. The two 512-d vectors are concatenated and passed through
    ``fusion_fc`` (1024 -> 256), dropout (p=0.3) and ``classifier`` (256 -> 2).

    Args:
        freeze_bert: When true, every BERT parameter gets ``requires_grad = False``.
        freeze_resnet: When true, every ResNet parameter gets ``requires_grad = False``
            (the projection layers defined here stay trainable).
    """

    def __init__(self, freeze_bert=True, freeze_resnet=True):
        super().__init__()
        # Image branch (ResNet)
        # `pretrained=True` is deprecated since torchvision 0.13 in favour of
        # `weights=`; IMAGENET1K_V1 is the checkpoint `pretrained=True` refers to.
        if hasattr(models, "ResNet50_Weights"):
            self.resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        else:  # older torchvision without the weights enum
            self.resnet = models.resnet50(pretrained=True)
        if freeze_resnet:
            for param in self.resnet.parameters():
                param.requires_grad = False
        # Drop the original classification layer so the backbone outputs 2048-d features
        self.resnet.fc = nn.Identity()
        self.image_fc = nn.Linear(2048, 512)

        # Text branch (BERT for Chinese)
        self.bert = BertModel.from_pretrained("bert-base-chinese")
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        self.text_fc = nn.Linear(768, 512)

        # Fusion and classification
        # NOTE(review): there is no activation between image_fc/text_fc, fusion_fc
        # and classifier, so apart from dropout the head is a chain of linear maps.
        self.fusion_fc = nn.Linear(1024, 256)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(256, 2)

    def forward(self, image, input_ids, attention_mask):
        """Compute class logits for a batch.

        Args:
            image: Batch of image tensors fed to the ResNet backbone
                (presumably ``(batch, 3, 224, 224)`` given the notebook's
                transform -- TODO confirm).
            input_ids: Token ids passed to BERT as ``input_ids``.
            attention_mask: Mask passed to BERT as ``attention_mask``.

        Returns:
            Tensor of logits with 2 entries per sample.
        """
        img_features = self.resnet(image)
        img_features = self.image_fc(img_features)

        text_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.pooler_output
        text_features = self.text_fc(text_features)

        # Concatenate along the feature axis: 512 (image) + 512 (text) = 1024
        combined = torch.cat((img_features, text_features), dim=1)
        combined = self.fusion_fc(combined)
        combined = self.dropout(combined)
        logits = self.classifier(combined)
        return logits

# Tokenizer shared by every dataset built in this notebook
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

# Image pipeline: resize to 224x224, convert to a tensor, then apply
# per-channel mean/std normalisation
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Annotation files and image folders for the train / dev splits
train_csv = "/content/drive/MyDrive/Misogyny_STD/Misogyny_TrainData/Misogyny_train.csv"
dev_csv = "/content/drive/MyDrive/Misogyny_STD/Misogyny_DevData/Misogyny_dev.csv"
train_image_dir = "/content/drive/MyDrive/Misogyny_STD/Misogyny_TrainData/Misogyny_train_imgfolder"
dev_image_dir = "/content/drive/MyDrive/Misogyny_STD/Misogyny_DevData/Misogyny_dev_imgfolder"

# Both splits share the same tokenizer and image transform
train_dataset = MemeDataset(train_csv, train_image_dir, tokenizer, transform)
dev_dataset = MemeDataset(dev_csv, dev_image_dir, tokenizer, transform)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

In [None]:
#Cell 3: Training Function
def train_model(model, train_loader, criterion, optimizer, num_epochs=7):
    """Train ``model`` in place and return it.

    Each batch must be a mapping with the keys ``"image"``, ``"input_ids"``,
    ``"attention_mask"`` and ``"label"``; every value is moved to the
    module-level ``device`` before the forward pass. After each epoch the mean
    per-batch loss is printed.

    Args:
        model: Module called as ``model(images, input_ids, attention_mask)``.
        train_loader: Iterable of batches; it is iterated once per epoch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, left in training mode.

    Raises:
        ValueError: If an epoch yields no batches, so no average loss exists.
    """
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        # Count batches while iterating instead of calling len(train_loader):
        # this also works for loaders without __len__ and lets the empty case
        # be reported clearly instead of failing with ZeroDivisionError.
        num_batches = 0
        for batch in train_loader:
            images = batch["image"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(images, input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; cannot compute an average loss")
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/num_batches:.4f}")
    return model

In [None]:
#Cell 4: Evaluation Function with Multiple Metrics
def evaluate_model(model, dev_loader):
    """Run ``model`` over ``dev_loader``, print metrics and return predictions.

    The model is switched to eval mode and evaluated under ``torch.no_grad()``.
    Each batch must provide ``"image"``, ``"input_ids"``, ``"attention_mask"``
    and ``"label"``; values are moved to the module-level ``device``. The
    predicted class is the argmax over the logits' second axis.

    Printed metrics: macro F1, macro precision, macro recall, accuracy, and
    "G1-Score", the geometric mean of macro precision and macro recall.

    Args:
        model: Module called as ``model(images, input_ids, attention_mask)``.
        dev_loader: Iterable of evaluation batches.

    Returns:
        ``(all_preds, all_labels)``: two lists of plain Python ints in loader order.
    """
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dev_loader:
            images = batch["image"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(images, input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            # tolist() yields built-in ints rather than np.int64 scalars, so the
            # returned lists display compactly and serialise without numpy types.
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    # Calculate metrics
    # zero_division=0 keeps the score at 0 for a class that is never predicted
    # (or never present) without emitting an UndefinedMetricWarning.
    macro_f1 = f1_score(all_labels, all_preds, average="macro", zero_division=0)
    precision = precision_score(all_labels, all_preds, average="macro", zero_division=0)
    recall = recall_score(all_labels, all_preds, average="macro", zero_division=0)
    accuracy = accuracy_score(all_labels, all_preds)
    g1_score = np.sqrt(precision * recall)  # Geometric mean of precision and recall

    print(f"Macro F1-Score: {macro_f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"G1-Score: {g1_score:.4f}")
    return all_preds, all_labels

In [None]:
#Cell 5: Hyperparameter Tuning and Execution -- run this cell to start tuning and final training
# Hyperparameter tuning: grid search over learning rate x batch size, selecting
# the configuration with the highest macro-F1 on the dev set.
SEED = 42  # fixed seed so weight initialisation and shuffling are repeatable per configuration
learning_rates = [4e-5,5e-5]
batch_sizes = [8, 16]
# Start below any attainable macro-F1 so the first configuration is always
# recorded; with 0 and a strict ">" comparison, an all-zero search would leave
# best_model as None and best_params empty.
best_f1 = -1.0
best_params = {}
best_model = None

for lr in learning_rates:
    for bs in batch_sizes:
        print(f"\nTuning with lr={lr}, batch_size={bs}")
        # Re-seed before building the loaders/model so every configuration is
        # compared from the same random starting point.
        torch.manual_seed(SEED)
        train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
        dev_loader = DataLoader(dev_dataset, batch_size=bs, shuffle=False)

        # A fresh model per configuration; both backbones stay frozen
        model = MultimodalClassifier(freeze_bert=True, freeze_resnet=True).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        # Train
        model = train_model(model, train_loader, criterion, optimizer, num_epochs=5)

        # Evaluate
        preds, labels = evaluate_model(model, dev_loader)
        macro_f1 = f1_score(labels, preds, average="macro")

        if macro_f1 > best_f1:
            best_f1 = macro_f1
            best_params = {"lr": lr, "batch_size": bs}
            best_model = model
            print(f"New best F1: {best_f1:.4f} with {best_params}")

print(f"\nBest Hyperparameters: {best_params}")
print("Training final model with best parameters...")
final_train_loader = DataLoader(train_dataset, batch_size=best_params["batch_size"], shuffle=True)
final_dev_loader = DataLoader(dev_dataset, batch_size=best_params["batch_size"], shuffle=False)
# NOTE: this continues training the already-tuned best_model (its weights are
# not re-initialised) with a fresh Adam optimizer and train_model's default epochs.
# The loss is built here instead of relying on `criterion` leaking out of the loop.
final_criterion = nn.CrossEntropyLoss()
final_optimizer = torch.optim.Adam(best_model.parameters(), lr=best_params["lr"])
best_model = train_model(best_model, final_train_loader, final_criterion, final_optimizer)
# Bind the result so the notebook does not echo the full prediction/label lists
final_preds, final_labels = evaluate_model(best_model, final_dev_loader)


Tuning with lr=4e-05, batch_size=8




Epoch 1/5, Loss: 0.5565
Epoch 2/5, Loss: 0.4396
Epoch 3/5, Loss: 0.3849
Epoch 4/5, Loss: 0.3502
Epoch 5/5, Loss: 0.3365
Macro F1-Score: 0.8467
Precision: 0.8642
Recall: 0.8332
Accuracy: 0.8824
G1-Score: 0.8486
New best F1: 0.8467 with {'lr': 4e-05, 'batch_size': 8}

Tuning with lr=4e-05, batch_size=16




Epoch 1/5, Loss: 0.5794
Epoch 2/5, Loss: 0.4877
Epoch 3/5, Loss: 0.4089
Epoch 4/5, Loss: 0.3682
Epoch 5/5, Loss: 0.3394
Macro F1-Score: 0.7368
Precision: 0.9073
Recall: 0.7021
Accuracy: 0.8353
G1-Score: 0.7981

Tuning with lr=5e-05, batch_size=8




Epoch 1/5, Loss: 0.5445
Epoch 2/5, Loss: 0.4260
Epoch 3/5, Loss: 0.3638
Epoch 4/5, Loss: 0.3251
Epoch 5/5, Loss: 0.3338
Macro F1-Score: 0.8489
Precision: 0.8597
Recall: 0.8398
Accuracy: 0.8824
G1-Score: 0.8497
New best F1: 0.8489 with {'lr': 5e-05, 'batch_size': 8}

Tuning with lr=5e-05, batch_size=16




Epoch 1/5, Loss: 0.5527
Epoch 2/5, Loss: 0.4414
Epoch 3/5, Loss: 0.3880
Epoch 4/5, Loss: 0.3303
Epoch 5/5, Loss: 0.2987
Macro F1-Score: 0.7481
Precision: 0.8681
Recall: 0.7153
Accuracy: 0.8353
G1-Score: 0.7880

Best Hyperparameters: {'lr': 5e-05, 'batch_size': 8}
Training final model with best parameters...
Epoch 1/3, Loss: 0.3409
Epoch 2/3, Loss: 0.2940
Epoch 3/3, Loss: 0.2584
Macro F1-Score: 0.8298
Precision: 0.8320
Recall: 0.8276
Accuracy: 0.8647
G1-Score: 0.8298


([np.int64(0),
  np.int64(1),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(1),
  np.int64(1),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(1),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64

In [None]:
# Continue training the current best_model and re-evaluate it on the dev set.
# NOTE: this cell is not idempotent -- every execution trains best_model further
# (with a fresh Adam optimizer), so results depend on how often it has been run.
print(f"\nBest Hyperparameters: {best_params}")
print("Training final model with best parameters...")
final_train_loader = DataLoader(train_dataset, batch_size=best_params["batch_size"], shuffle=True)
final_dev_loader = DataLoader(dev_dataset, batch_size=best_params["batch_size"], shuffle=False)
# Build the loss here rather than relying on `criterion` left over from the tuning loop
final_criterion = nn.CrossEntropyLoss()
final_optimizer = torch.optim.Adam(best_model.parameters(), lr=best_params["lr"])
best_model = train_model(best_model, final_train_loader, final_criterion, final_optimizer)
# Bind the result so the notebook does not echo the full prediction/label lists
final_preds, final_labels = evaluate_model(best_model, final_dev_loader)


Best Hyperparameters: {'lr': 5e-05, 'batch_size': 8}
Training final model with best parameters...
Epoch 1/5, Loss: 0.3075
Epoch 2/5, Loss: 0.2751
Epoch 3/5, Loss: 0.2712
Epoch 4/5, Loss: 0.2279
Epoch 5/5, Loss: 0.2500
Macro F1-Score: 0.8510
Precision: 0.8812
Recall: 0.8307
Accuracy: 0.8882
G1-Score: 0.8556


([np.int64(0),
  np.int64(1),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(1),
  np.int64(1),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(1),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64

In [None]:
# Continue training the current best_model and re-evaluate it on the dev set.
# NOTE: this cell is not idempotent -- every execution trains best_model further
# (with a fresh Adam optimizer), so results depend on how often it has been run.
print(f"\nBest Hyperparameters: {best_params}")
print("Training final model with best parameters...")
final_train_loader = DataLoader(train_dataset, batch_size=best_params["batch_size"], shuffle=True)
final_dev_loader = DataLoader(dev_dataset, batch_size=best_params["batch_size"], shuffle=False)
# Build the loss here rather than relying on `criterion` left over from the tuning loop
final_criterion = nn.CrossEntropyLoss()
final_optimizer = torch.optim.Adam(best_model.parameters(), lr=best_params["lr"])
best_model = train_model(best_model, final_train_loader, final_criterion, final_optimizer)
# Bind the result so the notebook does not echo the full prediction/label lists
final_preds, final_labels = evaluate_model(best_model, final_dev_loader)


Best Hyperparameters: {'lr': 5e-05, 'batch_size': 8}
Training final model with best parameters...
Epoch 1/7, Loss: 0.2358
Epoch 2/7, Loss: 0.2470
Epoch 3/7, Loss: 0.2325
Epoch 4/7, Loss: 0.2162
Epoch 5/7, Loss: 0.2377
Epoch 6/7, Loss: 0.2690
Epoch 7/7, Loss: 0.2162
Macro F1-Score: 0.8378
Precision: 0.8587
Recall: 0.8226
Accuracy: 0.8765
G1-Score: 0.8404


([np.int64(0),
  np.int64(1),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(1),
  np.int64(1),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(1),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(0),
  np.int64(1),
  np.int64(0),
  np.int64

In [None]:
# Unlabeled test-set dataset and submission-file generation

class TestMemeDataset(Dataset):
    """Unlabelled meme dataset used for inference.

    Two columns of the CSV are read: ``image_name`` (file name joined onto
    ``image_dir``) and ``transcriptions`` (text handed to the tokenizer). Each
    sample is identified by its image file name without the extension.

    Args:
        csv_file: Path of the CSV, loaded with ``pd.read_csv``.
        image_dir: Directory containing the image files.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt", ...)``;
            its result is indexed with ``"input_ids"`` and ``"attention_mask"``.
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, csv_file, image_dir, tokenizer, transform=None):
        self.data = pd.read_csv(csv_file)
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build one sample.

        Returns:
            dict with keys ``"image"`` (transformed image), ``"input_ids"`` and
            ``"attention_mask"`` (tokenizer outputs without the leading batch
            axis) and ``"id"`` (image file name with the extension removed).
        """
        row = self.data.iloc[idx]  # look the row up once instead of once per column

        img_path = os.path.join(self.image_dir, row["image_name"])
        # Close the file handle deterministically instead of waiting for garbage
        # collection; convert() produces a new image that stays usable afterwards.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        if self.transform:
            image = self.transform(image)

        text = str(row["transcriptions"])
        encoding = self.tokenizer(text, return_tensors="pt", padding="max_length", max_length=128, truncation=True)
        # squeeze(0) drops only the leading batch axis; a bare squeeze() would
        # also collapse any other axis that happens to have size 1.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        # Use the base filename (without extension) as the ID
        image_id = os.path.splitext(row["image_name"])[0]

        return {"image": image, "input_ids": input_ids, "attention_mask": attention_mask, "id": image_id}

def generate_predictions(model, test_loader, output_file="submission.csv"):
    """Predict a class for every test sample and write the results as CSV.

    The model is put in eval mode and run under ``torch.no_grad()``. Each batch
    must provide ``"image"``, ``"input_ids"``, ``"attention_mask"`` (moved to
    the module-level ``device``) and ``"id"``. The predicted class is the
    argmax over the logits' second axis.

    Args:
        model: Module called as ``model(images, input_ids, attention_mask)``.
        test_loader: Iterable of test batches.
        output_file: Destination path; rows are ``id,prediction`` and the file
            is written without header and without index column.
    """
    model.eval()
    rows = []

    with torch.no_grad():
        for batch in test_loader:
            logits = model(
                batch["image"].to(device),
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
            )
            batch_preds = torch.argmax(logits, dim=1).cpu().numpy()

            # One (id, predicted class) pair per sample, kept in loader order
            rows.extend(zip(batch["id"], batch_preds))

    # Write the collected pairs to disk
    pd.DataFrame(rows, columns=["id", "predictions"]).to_csv(output_file, index=False, header=False)
    print(f"✅ Predictions saved to {output_file}")

# Test set locations on Drive: the CSV and the directory its image_name values are joined onto
test_csv = "/content/drive/MyDrive/Misogyny_STD/Misogyny_TestData/test.csv"
test_image_dir = "/content/drive/MyDrive/Misogyny_STD/Misogyny_TestData/"

# Unlabelled dataset reusing the notebook's tokenizer and image transform;
# order is kept fixed (shuffle=False) and the batch size is the tuned one
test_dataset = TestMemeDataset(test_csv, test_image_dir, tokenizer, transform)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=best_params["batch_size"])

# Run inference with the selected model and write the submission file
print("\nGenerating test set predictions...")
generate_predictions(best_model, test_loader)


Generating test set predictions...
✅ Predictions saved to submission.csv
