<a href="https://colab.research.google.com/github/Khadiza13/DravidianLangTech-NAACL-Misogyny-/blob/main/Misogyny_malayalam_run2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Kaggle input locations of the three dataset splits.
TRAIN_DATA_PATH = '/kaggle/input/train-dataset/train/train.csv'
EVAL_DATA_PATH = '/kaggle/input/eval-dataset/dev/dev.csv'
TEST_DATA_PATH = '/kaggle/input/test-dataset/test/test.csv'

# Read every split (train, validation, test) into its own DataFrame.
train_data, eval_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_DATA_PATH, EVAL_DATA_PATH, TEST_DATA_PATH)
)

In [None]:
# Number of rows per class in the labelled splits.
train_label_counts = train_data['labels'].value_counts()
eval_label_counts = eval_data['labels'].value_counts()

# Each call prints the heading and the counts on separate lines.
print("Training Data Label Distribution:", train_label_counts, sep="\n")
print("\nValidation Data Label Distribution:", eval_label_counts, sep="\n")

Training Data Label Distribution:
labels
0    381
1    259
Name: count, dtype: int64

Validation Data Label Distribution:
labels
0    97
1    63
Name: count, dtype: int64


In [None]:
# (rows, columns) of the training split.
train_data.shape

(640, 3)

In [None]:
# (rows, columns) of the validation split.
eval_data.shape


(160, 3)

In [None]:
# (rows, columns) of the test split.
test_data.shape

(200, 2)

In [None]:
# Preview the first five training rows.
train_data.head()

Unnamed: 0,image_id,labels,transcriptions
0,888,0,\nഈ ചാടി ഓടി നടക്കണ മനുഷ്യനാണോടാ നിങ്ങളിത്രേം ...
1,554,1,മലയാള സിനിമയുടെ ഭാവി വടറാണി ഇവൾ തന്നെ നല്ല കുഴ...
2,556,1,ഒന്ന് പെറ്റത് ആണെങ്കിലും .. മുലയും വയറും ചാടിയ...
3,484,1,ഓൺലൈൻ പരിചയപ്പെട്ടവനെ കളി തരാമെന്ന് പറഞ്ഞു അപ്...
4,370,0,കാമുകിയും അൺലിമിറ്റഡ് നെറ്റ് ഓഫറുംഉള്ള പയ്യന്റ...


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AdamW, get_scheduler
from transformers import ViTModel, ViTFeatureExtractor
from torchvision import transforms
from PIL import Image
from tqdm import tqdm

# Hugging Face checkpoints backing the two encoders.
_TEXT_CHECKPOINT = "l3cube-pune/malayalam-bert"
_VISION_CHECKPOINT = "google/vit-base-patch16-224-in21k"

# Text side: fast tokenizer (model max length 128) plus the Malayalam BERT encoder.
text_tokenizer = AutoTokenizer.from_pretrained(
    _TEXT_CHECKPOINT, model_max_length=128, use_fast=True
)
text_model = AutoModel.from_pretrained(_TEXT_CHECKPOINT)

# Keep the embedding matrix the same size as the tokenizer's vocabulary.
text_model.resize_token_embeddings(len(text_tokenizer))

# Image side: ViT feature extractor and encoder loaded from the same checkpoint.
vision_feature_extractor = ViTFeatureExtractor.from_pretrained(_VISION_CHECKPOINT)
vision_model = ViTModel.from_pretrained(_VISION_CHECKPOINT)

# Custom Dataset class
class MemeDataset(Dataset):
    """Pairs each meme's transcription with its image for the multimodal model.

    Args:
        data: DataFrame with ``transcriptions`` and ``image_id`` columns, plus
            a ``labels`` column unless ``is_test`` is true.
        tokenizer: Callable invoked on a single string; it is expected to
            return ``input_ids`` and ``attention_mask`` tensors with a leading
            batch dimension of size 1.
        image_path: Directory containing one ``<image_id>.jpg`` per row.
        max_len: Length every transcription is padded/truncated to.
        is_test: When true, labels are not read and items carry no ``label``.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``image``
    tensors, and a ``label`` tensor (dtype long) for labelled splits.
    """

    def __init__(self, data, tokenizer, image_path, max_len=128, is_test=False):
        # Missing transcriptions become empty strings so the tokenizer never sees NaN.
        self.texts = data['transcriptions'].fillna("").values
        self.image_ids = data['image_id'].values
        self.tokenizer = tokenizer
        self.image_path = image_path
        self.max_len = max_len
        self.is_test = is_test
        if not is_test:
            self.labels = data['labels'].values

        # The preprocessing pipeline does not depend on the item, so it is
        # built once here instead of on every __getitem__ call.
        # NOTE(review): the mean/std below look like the ImageNet statistics;
        # the ViT checkpoint's own preprocessor may specify different values
        # -- confirm before changing, since it affects trained weights.
        self.image_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.texts)

    def _load_image(self, image_id):
        """Read ``<image_path>/<image_id>.jpg`` and return it preprocessed."""
        # The context manager closes the underlying file as soon as the pixels
        # have been converted; previously the handle stayed open until garbage
        # collection, which can exhaust file descriptors on large datasets.
        with Image.open(f"{self.image_path}/{image_id}.jpg") as raw_image:
            rgb_image = raw_image.convert("RGB")
        return self.image_transform(rgb_image)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        image_id = self.image_ids[idx]

        # Pad/truncate to a fixed length so items can be stacked into batches.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        item = {
            # squeeze(0) drops the batch dimension added by return_tensors="pt".
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'image': self._load_image(image_id),
        }

        if not self.is_test:
            item['label'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item

# Define multimodal model
class MultimodalModel(torch.nn.Module):
    """Late-fusion classifier over a text encoder and an image encoder.

    The first-position (CLS) hidden state of each encoder is concatenated and
    passed through a two-layer MLP that produces class logits.

    Args:
        text_encoder: Module called with ``input_ids``/``attention_mask`` whose
            output exposes ``last_hidden_state``. Defaults to the notebook-level
            ``text_model``.
        image_encoder: Module called with ``pixel_values`` whose output exposes
            ``last_hidden_state``. Defaults to the notebook-level
            ``vision_model``.
        hidden_dim: Width of the classifier's hidden layer.
        num_classes: Number of output logits.
        dropout: Dropout probability inside the classifier.

    Both encoders must expose ``config.hidden_size``; the classifier input
    width is derived from it rather than hard-coded, so encoders of a different
    size can be swapped in without editing this class.
    """

    def __init__(self, text_encoder=None, image_encoder=None,
                 hidden_dim=512, num_classes=2, dropout=0.1):
        super().__init__()

        # Fall back to the notebook-level encoders so MultimodalModel() keeps
        # working exactly as before.
        self.text_encoder = text_model if text_encoder is None else text_encoder
        self.image_encoder = vision_model if image_encoder is None else image_encoder

        # Width of the concatenated CLS features (text hidden size + image hidden size).
        fused_dim = (
            self.text_encoder.config.hidden_size
            + self.image_encoder.config.hidden_size
        )

        # Layer order is kept so existing state_dict keys stay valid.
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(fused_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, input_ids, attention_mask, image):
        """Return class logits with one row per example in the batch."""
        # Encode text and keep the hidden state at position 0 (CLS token).
        text_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        text_cls = text_outputs.last_hidden_state[:, 0, :]

        # Encode image and keep the hidden state at position 0 (CLS token).
        image_outputs = self.image_encoder(pixel_values=image)
        image_cls = image_outputs.last_hidden_state[:, 0, :]

        # Concatenate along the feature dimension and classify.
        combined_features = torch.cat((text_cls, image_cls), dim=1)
        outputs = self.classifier(combined_features)
        return outputs


# Folders holding the <image_id>.jpg files of each split
train_image_dir = "/kaggle/input/train-dataset/train"
eval_image_dir = "/kaggle/input/eval-dataset/dev"
test_image_dir = "/kaggle/input/test-dataset/test"

# One dataset per split; only the test split is built without labels
train_dataset = MemeDataset(data=train_data, tokenizer=text_tokenizer, image_path=train_image_dir)
eval_dataset = MemeDataset(data=eval_data, tokenizer=text_tokenizer, image_path=eval_image_dir)
test_dataset = MemeDataset(
    data=test_data, tokenizer=text_tokenizer, image_path=test_image_dir, is_test=True
)

# Batches of 16; only the training loader shuffles
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
eval_loader = DataLoader(dataset=eval_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

# Model, optimizer, and a linear learning-rate schedule without warm-up that
# spans every optimisation step of the run
model = MultimodalModel()
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 5
num_training_steps = len(train_loader) * epochs
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Training and evaluation functions
def train(model, data_loader):
    """Run one training epoch and return ``(mean batch loss, accuracy)``.

    Uses the notebook-level ``optimizer`` and ``scheduler``; both are stepped
    once per batch. Every batch is moved to the device the model's parameters
    live on, so the function works unchanged whether the model sits on the CPU
    or on an accelerator.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., image=...)``
            and returning class logits.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask``, ``image`` and ``label`` tensors.

    Returns:
        Tuple of the loss averaged over batches and the fraction of correctly
        classified examples.
    """
    model.train()

    # Follow the model's device instead of assuming CPU tensors.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # The loss module holds no state, so one instance serves the whole epoch.
    loss_fn = torch.nn.CrossEntropyLoss()

    total_loss = 0
    correct = 0
    total = 0

    for batch in tqdm(data_loader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        images = batch['image'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, image=images)
        loss = loss_fn(outputs, labels)

        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Accuracy is measured on the logits computed before this step's update.
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    return total_loss / len(data_loader), correct / total

def evaluate(model, data_loader):
    """Score the model on a labelled loader without updating any weights.

    Every batch is moved to the device the model's parameters live on, so the
    function works unchanged whether the model sits on the CPU or on an
    accelerator.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., image=...)``
            and returning class logits.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask``, ``image`` and ``label`` tensors.

    Returns:
        Tuple ``(mean batch loss, accuracy, all_labels, all_preds)`` where the
        last two are flat lists of per-example NumPy integer scalars in loader
        order.
    """
    model.eval()

    # Follow the model's device instead of assuming CPU tensors.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # The loss module holds no state, so one instance serves the whole pass.
    loss_fn = torch.nn.CrossEntropyLoss()

    total_loss = 0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, image=images)
            loss = loss_fn(outputs, labels)

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            # Bring results back to host memory before converting to NumPy.
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            correct += (preds == labels).sum().item()
            total += labels.size(0)

    return total_loss / len(data_loader), correct / total, all_labels, all_preds

def predict_test(model, test_loader):
    """Return the predicted class index for every example of an unlabelled loader.

    Every batch is moved to the device the model's parameters live on, so the
    function works unchanged whether the model sits on the CPU or on an
    accelerator.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., image=...)``
            and returning class logits.
        test_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``image`` tensors (no ``label``).

    Returns:
        Flat list of per-example NumPy integer scalars in loader order.
    """
    model.eval()

    # Follow the model's device instead of assuming CPU tensors.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    all_preds = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, image=images)
            preds = torch.argmax(outputs, dim=1)
            # Bring results back to host memory before converting to NumPy.
            all_preds.extend(preds.cpu().numpy())

    return all_preds

# Training loop: one training pass followed by one validation pass per epoch
print("Starting training...")
all_labels, all_preds = None, None
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    train_loss, train_acc = train(model, train_loader)
    # Keep the labels/predictions of the latest validation pass: after the
    # last epoch they describe the final model, so the report below can reuse
    # them instead of evaluating the same weights a second time.
    eval_loss, eval_acc, all_labels, all_preds = evaluate(model, eval_loader)

    print(f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f}")
    print(f"Validation Loss: {eval_loss:.4f}, Validation Accuracy: {eval_acc:.4f}")

# Final evaluation
print("\nFinal Evaluation:")
if all_labels is None:
    # No epoch ran (epochs == 0), so there is nothing to reuse.
    _, _, all_labels, all_preds = evaluate(model, eval_loader)
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=['Non-Misogyny', 'Misogyny']))

tokenizer_config.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/951M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at l3cube-pune/malayalam-bert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]



Starting training...

Epoch 1/5


Training: 100%|██████████| 40/40 [10:56<00:00, 16.42s/it]
Evaluation: 100%|██████████| 10/10 [00:51<00:00,  5.14s/it]


Training Loss: 0.5717, Training Accuracy: 0.7641
Validation Loss: 0.5393, Validation Accuracy: 0.7312

Epoch 2/5


Training: 100%|██████████| 40/40 [10:43<00:00, 16.08s/it]
Evaluation: 100%|██████████| 10/10 [00:48<00:00,  4.89s/it]


Training Loss: 0.2921, Training Accuracy: 0.9219
Validation Loss: 0.3654, Validation Accuracy: 0.8500

Epoch 3/5


Training: 100%|██████████| 40/40 [10:47<00:00, 16.19s/it]
Evaluation: 100%|██████████| 10/10 [00:50<00:00,  5.02s/it]


Training Loss: 0.1590, Training Accuracy: 0.9594
Validation Loss: 0.4354, Validation Accuracy: 0.8250

Epoch 4/5


Training: 100%|██████████| 40/40 [10:48<00:00, 16.20s/it]
Evaluation: 100%|██████████| 10/10 [00:49<00:00,  4.93s/it]


Training Loss: 0.0839, Training Accuracy: 0.9844
Validation Loss: 0.5587, Validation Accuracy: 0.8000

Epoch 5/5


Training: 100%|██████████| 40/40 [10:45<00:00, 16.14s/it]
Evaluation: 100%|██████████| 10/10 [00:50<00:00,  5.01s/it]


Training Loss: 0.0641, Training Accuracy: 0.9906
Validation Loss: 0.4321, Validation Accuracy: 0.8375

Final Evaluation:


Evaluation: 100%|██████████| 10/10 [00:49<00:00,  4.92s/it]


Classification Report:
              precision    recall  f1-score   support

Non-Misogyny       0.87      0.86      0.86        97
    Misogyny       0.78      0.81      0.80        63

    accuracy                           0.84       160
   macro avg       0.83      0.83      0.83       160
weighted avg       0.84      0.84      0.84       160






In [None]:
# Modified MemeDataset class to handle test data
class MemeDataset(Dataset):
    """Pairs each meme's transcription with its image for the multimodal model.

    This variant additionally clamps token IDs into the tokenizer's full ID
    range before returning them.

    Args:
        data: DataFrame with ``transcriptions`` and ``image_id`` columns, plus
            a ``labels`` column unless ``is_test`` is true.
        tokenizer: Callable invoked on a single string; it is expected to
            return ``input_ids`` and ``attention_mask`` tensors with a leading
            batch dimension of size 1, and to support ``len()``.
        image_path: Directory containing one ``<image_id>.jpg`` per row.
        max_len: Length every transcription is padded/truncated to.
        is_test: When true, labels are not read and items carry no ``label``.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``image``
    tensors, and a ``label`` tensor (dtype long) for labelled splits.
    """

    def __init__(self, data, tokenizer, image_path, max_len=128, is_test=False):
        # Missing transcriptions become empty strings so the tokenizer never sees NaN.
        self.texts = data['transcriptions'].fillna("").values
        self.image_ids = data['image_id'].values
        self.tokenizer = tokenizer
        self.image_path = image_path
        self.max_len = max_len
        self.is_test = is_test
        if not is_test:
            self.labels = data['labels'].values

        # The preprocessing pipeline does not depend on the item, so it is
        # built once here instead of on every __getitem__ call.
        # NOTE(review): the mean/std below look like the ImageNet statistics;
        # the ViT checkpoint's own preprocessor may specify different values
        # -- confirm before changing, since it affects trained weights.
        self.image_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.texts)

    def _load_image(self, image_id):
        """Read ``<image_path>/<image_id>.jpg`` and return it preprocessed."""
        # The context manager closes the underlying file as soon as the pixels
        # have been converted; previously the handle stayed open until garbage
        # collection, which can exhaust file descriptors on large datasets.
        with Image.open(f"{self.image_path}/{image_id}.jpg") as raw_image:
            rgb_image = raw_image.convert("RGB")
        return self.image_transform(rgb_image)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        image_id = self.image_ids[idx]

        # Pad/truncate to a fixed length so items can be stacked into batches.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
            return_overflowing_tokens=False
        )

        # Guard against IDs outside the embedding table. The bound is
        # len(tokenizer), the same size the text model's embeddings are resized
        # to; `vocab_size` alone leaves out tokens added on top of the base
        # vocabulary and would rewrite their IDs to an unrelated token.
        input_ids = encoding['input_ids'].squeeze(0)
        max_token_id = len(self.tokenizer) - 1
        input_ids = torch.clamp(input_ids, 0, max_token_id)

        item = {
            'input_ids': input_ids,
            # squeeze(0) drops the batch dimension added by return_tensors="pt".
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'image': self._load_image(image_id),
        }

        if not self.is_test:
            item['label'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item

def predict_test(model, test_loader):
    """Return the predicted class index for every example of an unlabelled loader.

    Every batch is moved to the device the model's parameters live on, so the
    function works unchanged whether the model sits on the CPU or on an
    accelerator.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., image=...)``
            and returning class logits.
        test_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``image`` tensors (no ``label``).

    Returns:
        Flat list of per-example NumPy integer scalars in loader order.
    """
    model.eval()

    # Follow the model's device instead of assuming CPU tensors.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    all_preds = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, image=images)
            preds = torch.argmax(outputs, dim=1)
            # Bring results back to host memory before converting to NumPy.
            all_preds.extend(preds.cpu().numpy())

    return all_preds

# Rebuild the test dataset/loader so they use the MemeDataset defined above
test_image_dir = "/kaggle/input/test-dataset/test"
test_dataset = MemeDataset(
    data=test_data, tokenizer=text_tokenizer, image_path=test_image_dir, is_test=True
)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

# Run inference over the whole test split
print("Generating predictions for test data...")
test_predictions = predict_test(model, test_loader)

# Two-column submission table: image id and predicted class
submission_df = pd.DataFrame(
    data={'id': test_data['image_id'], 'predictions': test_predictions}
)

# Write the table as CSV with neither an index column nor a header row
submission_df.to_csv('CUET_Novice_Malayalam_run2.csv', header=False, index=False)

# Preview the first rows and compare row counts with the input as a sanity check
print("\nFirst 10 predictions:", submission_df.head(10), sep="\n")
print("\nSubmission shape:", submission_df.shape)
print("Test data shape:", test_data.shape)


Generating predictions for test data...


Testing: 100%|██████████| 13/13 [01:01<00:00,  4.71s/it]


First 10 predictions:
    id  predictions
0  954            0
1  239            0
2   61            1
3  984            0
4  774            0
5  427            1
6  960            0
7  387            0
8  520            0
9  563            1

Submission shape: (200, 2)
Test data shape: (200, 2)





In [None]:
import zipfile

# Bundle the submission CSV into a zip archive in the working directory.
with zipfile.ZipFile('CUET_Novice.zip', mode='w') as archive:
    archive.write('CUET_Novice_Malayalam_run2.csv')

print("Submission file created: CUET_Novice.zip")

Submission file created: CUET_Novice.zip
