In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Define Dataset Class
class MemeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, image_dir, max_len, transform, has_labels):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.image_dir = image_dir
        self.max_len = max_len
        self.transform = transform
        self.has_labels = has_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = str(self.data.loc[idx, 'image_id'])
        if not img_name.endswith(".jpg"):
            img_name += ".jpg"
        img_path = os.path.join(self.image_dir, img_name)

        image = Image.open(img_path).convert("RGB")
        caption = self.data.loc[idx, 'transcriptions']

        if self.transform:
            image = self.transform(image)

        inputs = self.tokenizer(
            caption, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_len
        )

        item = {
            'image': image,
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
        }

        if self.has_labels:
            item['label'] = int(self.data.loc[idx, 'labels'])

        return item

# Load Dataset Function
def load_data(csv_path, image_dir, tokenizer, max_len, batch_size, has_labels=True):
    data = pd.read_csv(csv_path)

    if has_labels:
        data['labels'] = data['labels'].astype(int)

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    dataset = MemeDataset(data, tokenizer, image_dir, max_len, transform, has_labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=has_labels)

# Define the Model
class MultiModalModel(nn.Module):
    def __init__(self, visual_model, text_model, num_classes):
        super(MultiModalModel, self).__init__()
        self.visual_model = visual_model
        self.visual_fc = nn.Linear(512, 768)
        self.text_model = text_model
        self.fc = nn.Sequential(
            nn.Linear(768 + 768, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        visual_features = self.visual_model(images).squeeze(-1).squeeze(-1)
        visual_features = self.visual_fc(visual_features)

        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state.mean(dim=1)

        combined_features = torch.cat((visual_features, text_features), dim=1)
        return self.fc(combined_features)

# Validation Function for Training
def validate_model(model, val_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    # Calculate Macro F1 Score
    macro_f1 = f1_score(all_labels, all_preds, average="macro")
    return macro_f1

# Validation Function for Test Set
def validate_test_set(model, test_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Validating Test Set"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

# Training Function
def train_model(model, train_loader, val_loader, epochs, optimizer, criterion):
    model.to(device)
    best_macro_f1 = 0
    results = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(images, input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        macro_f1 = validate_model(model, val_loader)
        results.append({"Epoch": epoch + 1, "Loss": total_loss / len(train_loader), "Macro F1": macro_f1})

        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            torch.save(model.state_dict(), "best_model.pth")

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}, Macro F1: {macro_f1:.4f}")

    print("Training Complete!")
    return pd.DataFrame(results)

# Main Function
def main():
    # Paths
    train_csv = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/train/train.csv"
    train_images = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/train/images"
    dev_csv = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/dev/dev.csv"
    dev_images = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/dev/images"
    test_csv = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/test/test_with_labels.csv"
    test_images = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/test/images"

    # Tokenizer and Hyperparameters
    tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
    max_len = 128
    batch_size = 16
    epochs = 5
    learning_rate = 3e-5

    # Load Data
    train_loader = load_data(train_csv, train_images, tokenizer, max_len, batch_size, has_labels=True)
    val_loader = load_data(dev_csv, dev_images, tokenizer, max_len, batch_size, has_labels=True)
    test_loader = load_data(test_csv, test_images, tokenizer, max_len, batch_size, has_labels=True)

    # Initialize Model
    visual_model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
    visual_model = nn.Sequential(*list(visual_model.children())[:-1])
    text_model = AutoModel.from_pretrained("ai4bharat/indic-bert")
    model = MultiModalModel(visual_model, text_model, num_classes=2)

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Train the Model
    train_model(model, train_loader, val_loader, epochs, optimizer, criterion)

    # Load Best Model
    model.load_state_dict(torch.load("best_model.pth"))

    # Validate on Test Set
    validate_test_set(model, test_loader)

if __name__ == "__main__":
    main()


Using device: cuda


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

Downloading: "https://github.com/pytorch/vision/zipball/v0.10.0" to /root/.cache/torch/hub/v0.10.0.zip
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 73.8MB/s]


pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Epoch 1/5: 100%|██████████| 40/40 [00:28<00:00,  1.40it/s]
Validating: 100%|██████████| 10/10 [00:05<00:00,  1.85it/s]


Epoch 1, Loss: 0.6261, Macro F1: 0.8166


Epoch 2/5: 100%|██████████| 40/40 [00:23<00:00,  1.68it/s]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.08it/s]


Epoch 2, Loss: 0.3811, Macro F1: 0.8267


Epoch 3/5: 100%|██████████| 40/40 [00:23<00:00,  1.69it/s]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.10it/s]


Epoch 3, Loss: 0.1852, Macro F1: 0.8427


Epoch 4/5: 100%|██████████| 40/40 [00:23<00:00,  1.70it/s]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.14it/s]


Epoch 4, Loss: 0.0571, Macro F1: 0.8238


Epoch 5/5: 100%|██████████| 40/40 [00:23<00:00,  1.69it/s]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.14it/s]
  model.load_state_dict(torch.load("best_model.pth"))


Epoch 5, Loss: 0.0278, Macro F1: 0.8523
Training Complete!


Validating Test Set: 100%|██████████| 13/13 [00:06<00:00,  2.13it/s]


Test Precision: 0.8329
Test Recall: 0.8162
Test F1 Score: 0.8225
Test Accuracy: 0.8350


In [2]:
pip install efficientnet_pytorch transformers

Collecting efficientnet_pytorch
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: efficientnet_pytorch
  Building wheel for efficientnet_pytorch (setup.py) ... [?25l[?25hdone
  Created wheel for efficientnet_pytorch: filename=efficientnet_pytorch-0.7.1-py3-none-any.whl size=16424 sha256=5d88a13d2fa6052529dccf2ac3b545cd28420c291b2b5405554b10c63ca3479c
  Stored in directory: /root/.cache/pip/wheels/03/3f/e9/911b1bc46869644912bda90a56bcf7b960f20b5187feea3baf
Successfully built efficientnet_pytorch
Installing collected packages: efficientnet_pytorch
Successfully installed efficientnet_pytorch-0.7.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm
from efficientnet_pytorch import EfficientNet

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Define Dataset Class
class MemeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, image_dir, max_len, transform, has_labels):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.image_dir = image_dir
        self.max_len = max_len
        self.transform = transform
        self.has_labels = has_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = str(self.data.loc[idx, 'image_id'])
        if not img_name.endswith(".jpg"):
            img_name += ".jpg"
        img_path = os.path.join(self.image_dir, img_name)

        image = Image.open(img_path).convert("RGB")
        caption = self.data.loc[idx, 'transcriptions']

        if self.transform:
            image = self.transform(image)

        inputs = self.tokenizer(
            caption, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_len
        )

        item = {
            'image': image,
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
        }

        if self.has_labels:
            item['label'] = int(self.data.loc[idx, 'labels'])

        return item

# Load Dataset Function
def load_data(csv_path, image_dir, tokenizer, max_len, batch_size, has_labels=True):
    data = pd.read_csv(csv_path)

    if has_labels:
        data['labels'] = data['labels'].astype(int)

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    dataset = MemeDataset(data, tokenizer, image_dir, max_len, transform, has_labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=has_labels)

# Define the Model for LaBSE + EfficientNet-B4
class MultiModalModel(nn.Module):
    def __init__(self, visual_model, text_model, num_classes):
        super(MultiModalModel, self).__init__()
        # Text Encoder (LaBSE)
        self.text_model = text_model
        self.text_fc = nn.Linear(768, 256)  # Reduce text features to 256 dimensions
        
        # Image Encoder (EfficientNet-B4)
        self.visual_model = visual_model
        self.visual_fc = nn.Linear(1792, 256)  # EfficientNet-B4 output is 1792
        
        # Classifier
        self.fc = nn.Sequential(
            nn.Linear(512, 128),  # Concatenated features (256 + 256)
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        # Text Features
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state.mean(dim=1)  # Average pooling
        text_features = self.text_fc(text_features)
        
        # Image Features
        image_features = self.visual_model.extract_features(images)
        image_features = nn.functional.adaptive_avg_pool2d(image_features, (1, 1)).squeeze(-1).squeeze(-1)
        image_features = self.visual_fc(image_features)
        
        # Concatenate and Classify
        combined_features = torch.cat((text_features, image_features), dim=1)
        return self.fc(combined_features)

# Validation Function for Training
def validate_model(model, val_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    # Calculate Metrics
    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

# Validation Function for Test Set
def validate_test_set(model, test_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Validating Test Set"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    # Calculate Metrics
    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

# Training Function
def train_model(model, train_loader, val_loader, epochs, optimizer, criterion):
    model.to(device)
    best_macro_f1 = 0
    results = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(images, input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validate and Log Metrics
        precision, recall, f1, accuracy = validate_model(model, val_loader)
        results.append({
            "Epoch": epoch + 1,
            "Loss": total_loss / len(train_loader),
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Accuracy": accuracy
        })

        if f1 > best_macro_f1:
            best_macro_f1 = f1
            torch.save(model.state_dict(), "best_model.pth")

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}, F1: {f1:.4f}")

    print("Training Complete!")
    return pd.DataFrame(results)

# Main Function
def main():
    train_csv = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/train/train.csv"
    train_images = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/train/images"
    dev_csv = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/dev/dev.csv"
    dev_images = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/dev/images"
    test_csv = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/test/test_with_labels.csv"
    test_images = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/test/images"
    
    # Tokenizer and Hyperparameters
    tokenizer = AutoTokenizer.from_pretrained("setu4993/LaBSE")  # LaBSE Tokenizer
    max_len = 128
    batch_size = 16
    epochs = 5
    learning_rate = 3e-5

    # Load Data
    train_loader = load_data(train_csv, train_images, tokenizer, max_len, batch_size, has_labels=True)
    val_loader = load_data(dev_csv, dev_images, tokenizer, max_len, batch_size, has_labels=True)
    test_loader = load_data(test_csv, test_images, tokenizer, max_len, batch_size, has_labels=True)

    # Initialize Model
    visual_model = EfficientNet.from_pretrained('efficientnet-b4')  # EfficientNet-B4
    text_model = AutoModel.from_pretrained("setu4993/LaBSE")  # LaBSE Model
    model = MultiModalModel(visual_model, text_model, num_classes=2)

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Train the Model
    train_model(model, train_loader, val_loader, epochs, optimizer, criterion)

    # Load Best Model
    model.load_state_dict(torch.load("best_model.pth"))

    # Validate on Test Set
    validate_test_set(model, test_loader)

if __name__ == "__main__":
    main()

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth" to /root/.cache/torch/hub/checkpoints/efficientnet-b4-6ed6700e.pth
100%|██████████| 74.4M/74.4M [00:00<00:00, 125MB/s] 


Loaded pretrained weights for efficientnet-b4


config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Epoch 1/5: 100%|██████████| 40/40 [00:32<00:00,  1.23it/s]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.12it/s]


Validation Precision: 0.7387
Validation Recall: 0.7359
Validation F1 Score: 0.7000
Validation Accuracy: 0.7000
Epoch 1, Loss: 0.5574, F1: 0.7000


Epoch 2/5: 100%|██████████| 40/40 [00:32<00:00,  1.25it/s]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.12it/s]


Validation Precision: 0.7791
Validation Recall: 0.7886
Validation F1 Score: 0.7817
Validation Accuracy: 0.7875
Epoch 2, Loss: 0.3416, F1: 0.7817


Epoch 3/5: 100%|██████████| 40/40 [00:32<00:00,  1.24it/s]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.11it/s]


Validation Precision: 0.7842
Validation Recall: 0.7910
Validation F1 Score: 0.7867
Validation Accuracy: 0.7937
Epoch 3, Loss: 0.2065, F1: 0.7867


Epoch 4/5: 100%|██████████| 40/40 [00:32<00:00,  1.22it/s]
Validating: 100%|██████████| 10/10 [00:05<00:00,  1.94it/s]


Validation Precision: 0.7779
Validation Recall: 0.7747
Validation F1 Score: 0.7762
Validation Accuracy: 0.7875
Epoch 4, Loss: 0.1095, F1: 0.7762


Epoch 5/5: 100%|██████████| 40/40 [00:32<00:00,  1.23it/s]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.14it/s]
  model.load_state_dict(torch.load("best_model.pth"))


Validation Precision: 0.7612
Validation Recall: 0.7735
Validation F1 Score: 0.7595
Validation Accuracy: 0.7625
Epoch 5, Loss: 0.0402, F1: 0.7595
Training Complete!


Validating Test Set: 100%|██████████| 13/13 [00:05<00:00,  2.39it/s]


Test Precision: 0.7844
Test Recall: 0.7857
Test F1 Score: 0.7851
Test Accuracy: 0.7950


In [4]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
from transformers import XLMRobertaModel, XLMRobertaTokenizer, ViTModel, ViTFeatureExtractor
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Define Dataset Class
class MemeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, image_dir, max_len, transform, has_labels):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.image_dir = image_dir
        self.max_len = max_len
        self.transform = transform
        self.has_labels = has_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = str(self.data.loc[idx, 'image_id'])
        if not img_name.endswith(".jpg"):
            img_name += ".jpg"
        img_path = os.path.join(self.image_dir, img_name)

        image = Image.open(img_path).convert("RGB")
        caption = self.data.loc[idx, 'transcriptions']

        if self.transform:
            image = self.transform(image)

        inputs = self.tokenizer(
            caption, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_len
        )

        item = {
            'image': image,
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
        }

        if self.has_labels:
            item['label'] = int(self.data.loc[idx, 'labels'])

        return item

# Load Dataset Function
def load_data(csv_path, image_dir, tokenizer, max_len, batch_size, has_labels=True):
    data = pd.read_csv(csv_path)

    if has_labels:
        data['labels'] = data['labels'].astype(int)

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    dataset = MemeDataset(data, tokenizer, image_dir, max_len, transform, has_labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=has_labels)

# Define the Model for XLM-RoBERTa + ViT
class MultiModalModel(nn.Module):
    def __init__(self, visual_model, text_model, num_classes):
        super(MultiModalModel, self).__init__()
        # Text Encoder (XLM-RoBERTa)
        self.text_model = text_model
        self.text_fc = nn.Linear(768, 256)  # Reduce text features to 256 dimensions
        
        # Image Encoder (ViT)
        self.visual_model = visual_model
        self.visual_fc = nn.Linear(768, 256)  # ViT output is 768
        
        # Classifier
        self.fc = nn.Sequential(
            nn.Linear(512, 128),  # Concatenated features (256 + 256)
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        # Text Features
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state.mean(dim=1)  # Average pooling
        text_features = self.text_fc(text_features)
        
        # Image Features
        image_outputs = self.visual_model(pixel_values=images)
        image_features = image_outputs.last_hidden_state.mean(dim=1)  # Average pooling
        image_features = self.visual_fc(image_features)
        
        # Concatenate and Classify
        combined_features = torch.cat((text_features, image_features), dim=1)
        return self.fc(combined_features)

# Validation Function for Training
def validate_model(model, val_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    # Calculate Metrics
    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

# Validation Function for Test Set
def validate_test_set(model, test_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Validating Test Set"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    # Calculate Metrics
    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

# Training Function
def train_model(model, train_loader, val_loader, epochs, optimizer, criterion):
    model.to(device)
    best_macro_f1 = 0
    results = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(images, input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validate and Log Metrics
        precision, recall, f1, accuracy = validate_model(model, val_loader)
        results.append({
            "Epoch": epoch + 1,
            "Loss": total_loss / len(train_loader),
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Accuracy": accuracy
        })

        if f1 > best_macro_f1:
            best_macro_f1 = f1
            torch.save(model.state_dict(), "best_model.pth")

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}, F1: {f1:.4f}")

    print("Training Complete!")
    return pd.DataFrame(results)

# Main Function
def main():
    # Paths
    train_csv = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/train/train.csv"
    train_images = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/train/images"
    dev_csv = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/dev/dev.csv"
    dev_images = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/dev/images"
    test_csv = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/test/test_with_labels.csv"
    test_images = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/test/images"

    # Tokenizer and Hyperparameters
    tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")  # XLM-RoBERTa Tokenizer
    max_len = 128
    batch_size = 16
    epochs = 5
    learning_rate = 3e-5

    # Load Data
    train_loader = load_data(train_csv, train_images, tokenizer, max_len, batch_size, has_labels=True)
    val_loader = load_data(dev_csv, dev_images, tokenizer, max_len, batch_size, has_labels=True)
    test_loader = load_data(test_csv, test_images, tokenizer, max_len, batch_size, has_labels=True)

    # Initialize Model
    visual_model = ViTModel.from_pretrained("google/vit-base-patch16-224")  # ViT Model
    text_model = XLMRobertaModel.from_pretrained("xlm-roberta-base")  # XLM-RoBERTa Model
    model = MultiModalModel(visual_model, text_model, num_classes=2)

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Train the Model
    train_model(model, train_loader, val_loader, epochs, optimizer, criterion)

    # Load Best Model
    model.load_state_dict(torch.load("best_model.pth"))

    # Validate on Test Set
    validate_test_set(model, test_loader)

if __name__ == "__main__":
    main()

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1/5: 100%|██████████| 40/40 [00:35<00:00,  1.14it/s]
Validating: 100%|██████████| 10/10 [00:05<00:00,  1.91it/s]


Validation Precision: 0.8229
Validation Recall: 0.8247
Validation F1 Score: 0.8238
Validation Accuracy: 0.8313
Epoch 1, Loss: 0.5118, F1: 0.8238


Epoch 2/5: 100%|██████████| 40/40 [00:35<00:00,  1.14it/s]
Validating: 100%|██████████| 10/10 [00:05<00:00,  1.94it/s]


Validation Precision: 0.8000
Validation Recall: 0.8124
Validation F1 Score: 0.8022
Validation Accuracy: 0.8063
Epoch 2, Loss: 0.2282, F1: 0.8022


Epoch 3/5: 100%|██████████| 40/40 [00:34<00:00,  1.15it/s]
Validating: 100%|██████████| 10/10 [00:05<00:00,  1.95it/s]


Validation Precision: 0.7889
Validation Recall: 0.8025
Validation F1 Score: 0.7854
Validation Accuracy: 0.7875
Epoch 3, Loss: 0.0590, F1: 0.7854


Epoch 4/5: 100%|██████████| 40/40 [00:35<00:00,  1.12it/s]
Validating: 100%|██████████| 10/10 [00:05<00:00,  1.88it/s]


Validation Precision: 0.8289
Validation Recall: 0.8354
Validation F1 Score: 0.8316
Validation Accuracy: 0.8375
Epoch 4, Loss: 0.0255, F1: 0.8316


Epoch 5/5: 100%|██████████| 40/40 [00:35<00:00,  1.12it/s]
Validating: 100%|██████████| 10/10 [00:05<00:00,  1.94it/s]
  model.load_state_dict(torch.load("best_model.pth"))


Validation Precision: 0.8091
Validation Recall: 0.8235
Validation F1 Score: 0.8045
Validation Accuracy: 0.8063
Epoch 5, Loss: 0.0169, F1: 0.8045
Training Complete!


Validating Test Set: 100%|██████████| 13/13 [00:05<00:00,  2.18it/s]

Test Precision: 0.8115
Test Recall: 0.8085
Test F1 Score: 0.8099
Test Accuracy: 0.8200





In [5]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models
from PIL import Image
from transformers import BertModel, BertTokenizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class MemeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, image_dir, max_len, transform, has_labels):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.image_dir = image_dir
        self.max_len = max_len
        self.transform = transform
        self.has_labels = has_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = str(self.data.loc[idx, 'image_id'])
        if not img_name.endswith(".jpg"):
            img_name += ".jpg"
        img_path = os.path.join(self.image_dir, img_name)

        image = Image.open(img_path).convert("RGB")
        caption = self.data.loc[idx, 'transcriptions']

        if self.transform:
            image = self.transform(image)

        inputs = self.tokenizer(
            caption,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        item = {
            'image': image,
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'token_type_ids': inputs['token_type_ids'].squeeze()
        }

        if self.has_labels:
            item['label'] = int(self.data.loc[idx, 'labels'])

        return item

def load_data(csv_path, image_dir, tokenizer, max_len, batch_size, has_labels=True):
    data = pd.read_csv(csv_path)

    if has_labels:
        data['labels'] = data['labels'].astype(int)

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    dataset = MemeDataset(data, tokenizer, image_dir, max_len, transform, has_labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=has_labels)

class MultiModalModel(nn.Module):
    def __init__(self, num_classes):
        super(MultiModalModel, self).__init__()
        # Text Encoder (mBERT)
        self.text_model = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.text_fc = nn.Linear(768, 256)
        
        # Image Encoder (ResNet-50)
        self.visual_model = models.resnet50(pretrained=True)
        self.visual_model = nn.Sequential(*list(self.visual_model.children())[:-1])
        self.visual_fc = nn.Linear(2048, 256)
        
        # Classifier
        self.fc = nn.Sequential(
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask, token_type_ids):
        # Text Features
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        text_features = text_outputs.last_hidden_state.mean(dim=1)
        text_features = self.text_fc(text_features)
        
        # Image Features
        image_features = self.visual_model(images)
        image_features = image_features.squeeze(-1).squeeze(-1)
        image_features = self.visual_fc(image_features)
        
        # Concatenate and Classify
        combined_features = torch.cat((text_features, image_features), dim=1)
        return self.fc(combined_features)

def validate_model(model, val_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask, token_type_ids)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

def evaluate_test_set(model, test_loader):
    model.eval()
    all_labels = []
    all_preds = []
    test_loss = 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask, token_type_ids)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            
            preds = outputs.argmax(dim=1).cpu().numpy()
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    # Calculate metrics
    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)
    
    # Calculate per-class metrics
    class_precision = precision_score(all_labels, all_preds, average=None)
    class_recall = recall_score(all_labels, all_preds, average=None)
    class_f1 = f1_score(all_labels, all_preds, average=None)

    # Print results
    print("\n=== Test Set Evaluation Results ===")
    print(f"Test Loss: {test_loss/len(test_loader):.4f}")
    print(f"\nOverall Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Macro Precision: {precision:.4f}")
    print(f"Macro Recall: {recall:.4f}")
    print(f"Macro F1 Score: {f1:.4f}")
    
    print("\nPer-Class Metrics:")
    for i in range(len(class_precision)):
        print(f"\nClass {i}:")
        print(f"Precision: {class_precision[i]:.4f}")
        print(f"Recall: {class_recall[i]:.4f}")
        print(f"F1 Score: {class_f1[i]:.4f}")

    return {
        'test_loss': test_loss/len(test_loader),
        'accuracy': accuracy,
        'macro_precision': precision,
        'macro_recall': recall,
        'macro_f1': f1,
        'class_precision': class_precision.tolist(),
        'class_recall': class_recall.tolist(),
        'class_f1': class_f1.tolist()
    }

def train_model(model, train_loader, val_loader, epochs, optimizer, criterion):
    model.to(device)
    best_macro_f1 = 0
    results = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(images, input_ids, attention_mask, token_type_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validate and Log Metrics
        precision, recall, f1, accuracy = validate_model(model, val_loader)
        results.append({
            "Epoch": epoch + 1,
            "Loss": total_loss / len(train_loader),
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Accuracy": accuracy
        })

        if f1 > best_macro_f1:
            best_macro_f1 = f1
            torch.save(model.state_dict(), "best_model.pth")

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}, F1: {f1:.4f}")

    return pd.DataFrame(results)

def main():
    # Paths
    train_csv = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/train/train.csv"
    train_images = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/train/images"
    dev_csv = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/dev/dev.csv"
    dev_images = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/dev/images"
    test_csv = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/test/test_with_labels.csv"
    test_images = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/test/images"

    # Hyperparameters
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    max_len = 128
    batch_size = 16
    epochs = 5
    learning_rate = 3e-5

    # Load Data
    print("Loading datasets...")
    train_loader = load_data(train_csv, train_images, tokenizer, max_len, batch_size)
    val_loader = load_data(dev_csv, dev_images, tokenizer, max_len, batch_size)
    test_loader = load_data(test_csv, test_images, tokenizer, max_len, batch_size)

    # Initialize Model
    print("Initializing model...")
    model = MultiModalModel(num_classes=2)

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Train the Model
    print("Starting training...")
    results_df = train_model(model, train_loader, val_loader, epochs, optimizer, criterion)
    
    # Save training results
    results_df.to_csv('training_results.csv', index=False)
    
    # Load best model and evaluate on test set
    print("\nLoading best model for testing...")
    model.load_state_dict(torch.load('best_model.pth'))
    
    # Final evaluation on test set
    print("Evaluating on test set...")
    test_metrics = evaluate_test_set(model, test_loader)
    
    # Save test metrics
    test_results_df = pd.DataFrame([test_metrics])
    test_results_df.to_csv('test_results.csv', index=False)
    
    return results_df, test_metrics

if __name__ == "__main__":
    train_results, test_metrics = main()

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Loading datasets...
Initializing model...


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 177MB/s]


Starting training...


Epoch 1/5: 100%|██████████| 40/40 [00:26<00:00,  1.50it/s]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.09it/s]


Validation Precision: 0.8789
Validation Recall: 0.7540
Validation F1 Score: 0.7680
Validation Accuracy: 0.8063
Epoch 1, Loss: 0.5862, F1: 0.7680


Epoch 2/5: 100%|██████████| 40/40 [00:26<00:00,  1.51it/s]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.17it/s]


Validation Precision: 0.8702
Validation Recall: 0.8528
Validation F1 Score: 0.8595
Validation Accuracy: 0.8688
Epoch 2, Loss: 0.3251, F1: 0.8595


Epoch 3/5: 100%|██████████| 40/40 [00:26<00:00,  1.52it/s]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.12it/s]


Validation Precision: 0.8429
Validation Recall: 0.8429
Validation F1 Score: 0.8429
Validation Accuracy: 0.8500
Epoch 3, Loss: 0.0964, F1: 0.8429


Epoch 4/5: 100%|██████████| 40/40 [00:26<00:00,  1.51it/s]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.13it/s]


Validation Precision: 0.8565
Validation Recall: 0.8397
Validation F1 Score: 0.8461
Validation Accuracy: 0.8562
Epoch 4, Loss: 0.0408, F1: 0.8461


Epoch 5/5: 100%|██████████| 40/40 [00:27<00:00,  1.48it/s]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.14it/s]
  model.load_state_dict(torch.load('best_model.pth'))


Validation Precision: 0.8594
Validation Recall: 0.8504
Validation F1 Score: 0.8543
Validation Accuracy: 0.8625
Epoch 5, Loss: 0.0393, F1: 0.8543

Loading best model for testing...
Evaluating on test set...


Testing: 100%|██████████| 13/13 [00:05<00:00,  2.46it/s]


=== Test Set Evaluation Results ===
Test Loss: 0.3787

Overall Metrics:
Accuracy: 0.8400
Macro Precision: 0.8485
Macro Recall: 0.8134
Macro F1 Score: 0.8242

Per-Class Metrics:

Class 0:
Precision: 0.8261
Recall: 0.9344
F1 Score: 0.8769

Class 1:
Precision: 0.8710
Recall: 0.6923
F1 Score: 0.7714





In [6]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class MemeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, image_dir, max_len, transform, has_labels):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.image_dir = image_dir
        self.max_len = max_len
        self.transform = transform
        self.has_labels = has_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = str(self.data.loc[idx, 'image_id'])
        if not img_name.endswith(".jpg"):
            img_name += ".jpg"
        img_path = os.path.join(self.image_dir, img_name)

        image = Image.open(img_path).convert("RGB")
        caption = self.data.loc[idx, 'transcriptions']

        if self.transform:
            image = self.transform(image)

        inputs = self.tokenizer(
            caption,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        item = {
            'image': image,
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'token_type_ids': inputs['token_type_ids'].squeeze()
        }

        if self.has_labels:
            item['label'] = int(self.data.loc[idx, 'labels'])

        return item

def load_data(csv_path, image_dir, tokenizer, max_len, batch_size, has_labels=True):
    data = pd.read_csv(csv_path)

    if has_labels:
        data['labels'] = data['labels'].astype(int)

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    dataset = MemeDataset(data, tokenizer, image_dir, max_len, transform, has_labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=has_labels)

class MultiModalModel(nn.Module):
    def __init__(self, num_classes):
        super(MultiModalModel, self).__init__()
        self.text_model = AutoModel.from_pretrained("google/muril-base-cased")
        self.text_fc = nn.Linear(768, 256)
        
        self.visual_model = models.resnet50(pretrained=True)
        self.visual_model = nn.Sequential(*list(self.visual_model.children())[:-1])
        self.visual_fc = nn.Linear(2048, 256)
        
        self.fusion = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
        )
        
        self.classifier = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask, token_type_ids):
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        text_features = text_outputs.last_hidden_state.mean(dim=1)
        text_features = self.text_fc(text_features)
        
        image_features = self.visual_model(images)
        image_features = image_features.squeeze(-1).squeeze(-1)
        image_features = self.visual_fc(image_features)
        
        combined_features = torch.cat((text_features, image_features), dim=1)
        fused_features = self.fusion(combined_features)
        output = self.classifier(fused_features)
        return output

def validate_model(model, val_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask, token_type_ids)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

def evaluate_model(model, test_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating Test Data"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask, token_type_ids)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print("\nTest Evaluation Metrics:")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

def train_model(model, train_loader, val_loader, epochs, optimizer, criterion, scheduler=None):
    model.to(device)
    best_macro_f1 = 0
    results = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")

        for batch in progress_bar:
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(images, input_ids, attention_mask, token_type_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            if scheduler is not None:
                scheduler.step()

            total_loss += loss.item()
            progress_bar.set_postfix({'training_loss': f'{loss.item():.3f}'})

        avg_loss = total_loss / len(train_loader)
        precision, recall, f1, accuracy = validate_model(model, val_loader)
        
        results.append({
            "Epoch": epoch + 1,
            "Loss": avg_loss,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Accuracy": accuracy
        })

        if f1 > best_macro_f1:
            best_macro_f1 = f1
            torch.save(model.state_dict(), "best_model.pth")
            print(f"New best model saved with F1: {f1:.4f}")

        print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}, F1: {f1:.4f}")

    return pd.DataFrame(results)

def main():
    # Paths
    train_csv = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/train/train.csv"
    train_images = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/train/images"
    dev_csv = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/dev/dev.csv"
    dev_images = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/dev/images"
    test_csv = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/test/test_with_labels.csv"
    test_images = "/kaggle/input/malayalam-labelled-dataset/Dataset with label/test/images"

    # Hyperparameters
    tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
    max_len = 128
    batch_size = 16
    epochs = 5
    learning_rate = 2e-5

    # Load Data
    print("Loading datasets...")
    train_loader = load_data(train_csv, train_images, tokenizer, max_len, batch_size)
    val_loader = load_data(dev_csv, dev_images, tokenizer, max_len, batch_size)
    test_loader = load_data(test_csv, test_images, tokenizer, max_len, batch_size)

    # Initialize Model
    print("Initializing model...")
    model = MultiModalModel(num_classes=2)

    # Optimizer and Scheduler
    optimizer = torch.optim.AdamW([
        {'params': model.text_model.parameters(), 'lr': learning_rate},
        {'params': model.visual_model.parameters(), 'lr': learning_rate * 10},
        {'params': list(model.text_fc.parameters()) + 
                  list(model.visual_fc.parameters()) + 
                  list(model.fusion.parameters()) + 
                  list(model.classifier.parameters()), 
         'lr': learning_rate * 100}
    ])

    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=[learning_rate, learning_rate * 10, learning_rate * 100],
        steps_per_epoch=len(train_loader),
        epochs=epochs
    )

    criterion = nn.CrossEntropyLoss()

    # Train Model
    print("Starting training...")
    results_df = train_model(model, train_loader, val_loader, epochs, optimizer, criterion, scheduler)
    results_df.to_csv('training_results.csv', index=False)

    # Load Best Model and Evaluate on Test Data
    print("Evaluating on test data...")
    model.load_state_dict(torch.load("best_model.pth"))
    evaluate_model(model, test_loader)

    return results_df

if __name__ == "__main__":
    results = main()


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

Loading datasets...
Initializing model...


pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]



Starting training...


Epoch 1/5: 100%|██████████| 40/40 [00:26<00:00,  1.49it/s, training_loss=0.307]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.13it/s]


Validation Precision: 0.8566
Validation Recall: 0.6905
Validation F1 Score: 0.6922
Validation Accuracy: 0.7562
New best model saved with F1: 0.6922
Epoch 1, Loss: 0.6272, F1: 0.6922


Epoch 2/5: 100%|██████████| 40/40 [00:26<00:00,  1.50it/s, training_loss=0.157]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.15it/s]


Validation Precision: 0.8196
Validation Recall: 0.8334
Validation F1 Score: 0.8216
Validation Accuracy: 0.8250
New best model saved with F1: 0.8216
Epoch 2, Loss: 0.4609, F1: 0.8216


Epoch 3/5: 100%|██████████| 40/40 [00:26<00:00,  1.50it/s, training_loss=0.367]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.10it/s]


Validation Precision: 0.8673
Validation Recall: 0.8556
Validation F1 Score: 0.8605
Validation Accuracy: 0.8688
New best model saved with F1: 0.8605
Epoch 3, Loss: 0.2170, F1: 0.8605


Epoch 4/5: 100%|██████████| 40/40 [00:26<00:00,  1.51it/s, training_loss=0.041]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.25it/s]


Validation Precision: 0.8803
Validation Recall: 0.8877
Validation F1 Score: 0.8834
Validation Accuracy: 0.8875
New best model saved with F1: 0.8834
Epoch 4, Loss: 0.0500, F1: 0.8834


Epoch 5/5: 100%|██████████| 40/40 [00:26<00:00,  1.50it/s, training_loss=0.011]
Validating: 100%|██████████| 10/10 [00:04<00:00,  2.21it/s]
  model.load_state_dict(torch.load("best_model.pth"))


Validation Precision: 0.8750
Validation Recall: 0.8770
Validation F1 Score: 0.8760
Validation Accuracy: 0.8812
Epoch 5, Loss: 0.0144, F1: 0.8760
Evaluating on test data...


Evaluating Test Data: 100%|██████████| 13/13 [00:05<00:00,  2.53it/s]


Test Evaluation Metrics:
Test Precision: 0.8565
Test Recall: 0.8662
Test F1 Score: 0.8602
Test Accuracy: 0.8650



