In [6]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Define Dataset Class
class MemeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, image_dir, max_len, transform, has_labels):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.image_dir = image_dir
        self.max_len = max_len
        self.transform = transform
        self.has_labels = has_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = str(self.data.loc[idx, 'image_id'])
        if not img_name.endswith(".jpg"):
            img_name += ".jpg"
        img_path = os.path.join(self.image_dir, img_name)

        image = Image.open(img_path).convert("RGB")
        caption = self.data.loc[idx, 'transcriptions']

        if self.transform:
            image = self.transform(image)

        inputs = self.tokenizer(
            caption, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_len
        )

        item = {
            'image': image,
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
        }

        if self.has_labels:
            item['label'] = int(self.data.loc[idx, 'labels'])

        return item

# Load Dataset Function
def load_data(csv_path, image_dir, tokenizer, max_len, batch_size, has_labels=True):
    data = pd.read_csv(csv_path)

    if has_labels:
        data['labels'] = data['labels'].astype(int)

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    dataset = MemeDataset(data, tokenizer, image_dir, max_len, transform, has_labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=has_labels)

# Define the Model
class MultiModalModel(nn.Module):
    def __init__(self, visual_model, text_model, num_classes):
        super(MultiModalModel, self).__init__()
        self.visual_model = visual_model
        self.visual_fc = nn.Linear(512, 768)
        self.text_model = text_model
        self.fc = nn.Sequential(
            nn.Linear(768 + 768, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        visual_features = self.visual_model(images).squeeze(-1).squeeze(-1)
        visual_features = self.visual_fc(visual_features)

        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state.mean(dim=1)

        combined_features = torch.cat((visual_features, text_features), dim=1)
        return self.fc(combined_features)

# Validation Function for Training
def validate_model(model, val_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    # Calculate Macro F1 Score
    macro_f1 = f1_score(all_labels, all_preds, average="macro")
    return macro_f1

# Validation Function for Test Set
def validate_test_set(model, test_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Validating Test Set"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

# Training Function
def train_model(model, train_loader, val_loader, epochs, optimizer, criterion):
    model.to(device)
    best_macro_f1 = 0
    results = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(images, input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        macro_f1 = validate_model(model, val_loader)
        results.append({"Epoch": epoch + 1, "Loss": total_loss / len(train_loader), "Macro F1": macro_f1})

        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            torch.save(model.state_dict(), "best_model.pth")

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}, Macro F1: {macro_f1:.4f}")

    print("Training Complete!")
    return pd.DataFrame(results)

# Main Function
def main():
    # Paths
    train_csv = "/kaggle/input/tamil-labelled-dataset/Dataset with label/train/train.csv"
    train_images = "/kaggle/input/tamil-labelled-dataset/Dataset with label/train/images"
    dev_csv = "/kaggle/input/tamil-labelled-dataset/Dataset with label/dev/dev.csv"
    dev_images = "/kaggle/input/tamil-labelled-dataset/Dataset with label/dev/images"
    test_csv = "/kaggle/input/tamil-labelled-dataset/Dataset with label/test/test.csv"
    test_images = "/kaggle/input/tamil-labelled-dataset/Dataset with label/test/images"

    # Tokenizer and Hyperparameters
    tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
    max_len = 128
    batch_size = 16
    epochs = 5
    learning_rate = 3e-5

    # Load Data
    train_loader = load_data(train_csv, train_images, tokenizer, max_len, batch_size, has_labels=True)
    val_loader = load_data(dev_csv, dev_images, tokenizer, max_len, batch_size, has_labels=True)
    test_loader = load_data(test_csv, test_images, tokenizer, max_len, batch_size, has_labels=True)

    # Initialize Model
    visual_model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
    visual_model = nn.Sequential(*list(visual_model.children())[:-1])
    text_model = AutoModel.from_pretrained("ai4bharat/indic-bert")
    model = MultiModalModel(visual_model, text_model, num_classes=2)

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Train the Model
    train_model(model, train_loader, val_loader, epochs, optimizer, criterion)

    # Load Best Model
    model.load_state_dict(torch.load("best_model.pth"))

    # Validate on Test Set
    validate_test_set(model, test_loader)

if __name__ == "__main__":
    main()


Using device: cuda


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0
Epoch 1/5: 100%|██████████| 71/71 [00:31<00:00,  2.28it/s]
Validating: 100%|██████████| 18/18 [00:06<00:00,  2.75it/s]


Epoch 1, Loss: 0.4988, Macro F1: 0.4799


Epoch 2/5: 100%|██████████| 71/71 [00:30<00:00,  2.31it/s]
Validating: 100%|██████████| 18/18 [00:05<00:00,  3.28it/s]


Epoch 2, Loss: 0.3378, Macro F1: 0.7033


Epoch 3/5: 100%|██████████| 71/71 [00:30<00:00,  2.31it/s]
Validating: 100%|██████████| 18/18 [00:05<00:00,  3.37it/s]


Epoch 3, Loss: 0.1425, Macro F1: 0.7183


Epoch 4/5: 100%|██████████| 71/71 [00:30<00:00,  2.32it/s]
Validating: 100%|██████████| 18/18 [00:05<00:00,  3.33it/s]


Epoch 4, Loss: 0.0523, Macro F1: 0.7335


Epoch 5/5: 100%|██████████| 71/71 [00:30<00:00,  2.31it/s]
Validating: 100%|██████████| 18/18 [00:05<00:00,  3.28it/s]
  model.load_state_dict(torch.load("best_model.pth"))


Epoch 5, Loss: 0.0348, Macro F1: 0.7496
Training Complete!


Validating Test Set: 100%|██████████| 23/23 [00:08<00:00,  2.76it/s]


Test Precision: 0.8095
Test Recall: 0.7772
Test F1 Score: 0.7909
Test Accuracy: 0.8511


In [7]:
pip install efficientnet_pytorch transformers

Note: you may need to restart the kernel to use updated packages.


In [11]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm
from efficientnet_pytorch import EfficientNet

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Define Dataset Class
class MemeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, image_dir, max_len, transform, has_labels):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.image_dir = image_dir
        self.max_len = max_len
        self.transform = transform
        self.has_labels = has_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = str(self.data.loc[idx, 'image_id'])
        if not img_name.endswith(".jpg"):
            img_name += ".jpg"
        img_path = os.path.join(self.image_dir, img_name)

        image = Image.open(img_path).convert("RGB")
        caption = self.data.loc[idx, 'transcriptions']

        if self.transform:
            image = self.transform(image)

        inputs = self.tokenizer(
            caption, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_len
        )

        item = {
            'image': image,
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
        }

        if self.has_labels:
            item['label'] = int(self.data.loc[idx, 'labels'])

        return item

# Load Dataset Function
def load_data(csv_path, image_dir, tokenizer, max_len, batch_size, has_labels=True):
    data = pd.read_csv(csv_path)

    if has_labels:
        data['labels'] = data['labels'].astype(int)

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    dataset = MemeDataset(data, tokenizer, image_dir, max_len, transform, has_labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=has_labels)

# Define the Model for LaBSE + EfficientNet-B4
class MultiModalModel(nn.Module):
    def __init__(self, visual_model, text_model, num_classes):
        super(MultiModalModel, self).__init__()
        # Text Encoder (LaBSE)
        self.text_model = text_model
        self.text_fc = nn.Linear(768, 256)  # Reduce text features to 256 dimensions
        
        # Image Encoder (EfficientNet-B4)
        self.visual_model = visual_model
        self.visual_fc = nn.Linear(1792, 256)  # EfficientNet-B4 output is 1792
        
        # Classifier
        self.fc = nn.Sequential(
            nn.Linear(512, 128),  # Concatenated features (256 + 256)
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        # Text Features
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state.mean(dim=1)  # Average pooling
        text_features = self.text_fc(text_features)
        
        # Image Features
        image_features = self.visual_model.extract_features(images)
        image_features = nn.functional.adaptive_avg_pool2d(image_features, (1, 1)).squeeze(-1).squeeze(-1)
        image_features = self.visual_fc(image_features)
        
        # Concatenate and Classify
        combined_features = torch.cat((text_features, image_features), dim=1)
        return self.fc(combined_features)

# Validation Function for Training
def validate_model(model, val_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    # Calculate Metrics
    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

# Validation Function for Test Set
def validate_test_set(model, test_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Validating Test Set"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    # Calculate Metrics
    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

# Training Function
def train_model(model, train_loader, val_loader, epochs, optimizer, criterion):
    model.to(device)
    best_macro_f1 = 0
    results = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(images, input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validate and Log Metrics
        precision, recall, f1, accuracy = validate_model(model, val_loader)
        results.append({
            "Epoch": epoch + 1,
            "Loss": total_loss / len(train_loader),
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Accuracy": accuracy
        })

        if f1 > best_macro_f1:
            best_macro_f1 = f1
            torch.save(model.state_dict(), "best_model.pth")

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}, F1: {f1:.4f}")

    print("Training Complete!")
    return pd.DataFrame(results)

# Main Function
def main():
    # Paths
    train_csv = "/kaggle/input/tamil-labelled-dataset/Dataset with label/train/train.csv"
    train_images = "/kaggle/input/tamil-labelled-dataset/Dataset with label/train/images"
    dev_csv = "/kaggle/input/tamil-labelled-dataset/Dataset with label/dev/dev.csv"
    dev_images = "/kaggle/input/tamil-labelled-dataset/Dataset with label/dev/images"
    test_csv = "/kaggle/input/tamil-labelled-dataset/Dataset with label/test/test.csv"
    test_images = "/kaggle/input/tamil-labelled-dataset/Dataset with label/test/images"

    # Tokenizer and Hyperparameters
    tokenizer = AutoTokenizer.from_pretrained("setu4993/LaBSE")  # LaBSE Tokenizer
    max_len = 128
    batch_size = 16
    epochs = 5
    learning_rate = 3e-5

    # Load Data
    train_loader = load_data(train_csv, train_images, tokenizer, max_len, batch_size, has_labels=True)
    val_loader = load_data(dev_csv, dev_images, tokenizer, max_len, batch_size, has_labels=True)
    test_loader = load_data(test_csv, test_images, tokenizer, max_len, batch_size, has_labels=True)

    # Initialize Model
    visual_model = EfficientNet.from_pretrained('efficientnet-b4')  # EfficientNet-B4
    text_model = AutoModel.from_pretrained("setu4993/LaBSE")  # LaBSE Model
    model = MultiModalModel(visual_model, text_model, num_classes=2)

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Train the Model
    train_model(model, train_loader, val_loader, epochs, optimizer, criterion)

    # Load Best Model
    model.load_state_dict(torch.load("best_model.pth"))

    # Validate on Test Set
    validate_test_set(model, test_loader)

if __name__ == "__main__":
    main()

Using device: cuda
Loaded pretrained weights for efficientnet-b4


Epoch 1/5: 100%|██████████| 71/71 [00:48<00:00,  1.46it/s]
Validating: 100%|██████████| 18/18 [00:06<00:00,  2.95it/s]


Validation Precision: 0.7283
Validation Recall: 0.5847
Validation F1 Score: 0.5863
Validation Accuracy: 0.7676
Epoch 1, Loss: 0.4990, F1: 0.5863


Epoch 2/5: 100%|██████████| 71/71 [00:48<00:00,  1.46it/s]
Validating: 100%|██████████| 18/18 [00:06<00:00,  2.86it/s]


Validation Precision: 0.6682
Validation Recall: 0.7085
Validation F1 Score: 0.6722
Validation Accuracy: 0.7113
Epoch 2, Loss: 0.3666, F1: 0.6722


Epoch 3/5: 100%|██████████| 71/71 [00:48<00:00,  1.46it/s]
Validating: 100%|██████████| 18/18 [00:06<00:00,  2.97it/s]


Validation Precision: 0.6817
Validation Recall: 0.7017
Validation F1 Score: 0.6891
Validation Accuracy: 0.7465
Epoch 3, Loss: 0.2439, F1: 0.6891


Epoch 4/5: 100%|██████████| 71/71 [00:48<00:00,  1.46it/s]
Validating: 100%|██████████| 18/18 [00:06<00:00,  2.98it/s]


Validation Precision: 0.7050
Validation Recall: 0.6638
Validation F1 Score: 0.6773
Validation Accuracy: 0.7746
Epoch 4, Loss: 0.1095, F1: 0.6773


Epoch 5/5: 100%|██████████| 71/71 [00:48<00:00,  1.47it/s]
Validating: 100%|██████████| 18/18 [00:05<00:00,  3.06it/s]


Validation Precision: 0.6944
Validation Recall: 0.6961
Validation F1 Score: 0.6952
Validation Accuracy: 0.7641
Epoch 5, Loss: 0.0501, F1: 0.6952
Training Complete!


  model.load_state_dict(torch.load("best_model.pth"))
Validating Test Set: 100%|██████████| 23/23 [00:07<00:00,  3.05it/s]


Test Precision: 0.7687
Test Recall: 0.7360
Test F1 Score: 0.7494
Test Accuracy: 0.8230


In [12]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
from transformers import XLMRobertaModel, XLMRobertaTokenizer, ViTModel, ViTFeatureExtractor
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Define Dataset Class
class MemeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, image_dir, max_len, transform, has_labels):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.image_dir = image_dir
        self.max_len = max_len
        self.transform = transform
        self.has_labels = has_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = str(self.data.loc[idx, 'image_id'])
        if not img_name.endswith(".jpg"):
            img_name += ".jpg"
        img_path = os.path.join(self.image_dir, img_name)

        image = Image.open(img_path).convert("RGB")
        caption = self.data.loc[idx, 'transcriptions']

        if self.transform:
            image = self.transform(image)

        inputs = self.tokenizer(
            caption, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_len
        )

        item = {
            'image': image,
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
        }

        if self.has_labels:
            item['label'] = int(self.data.loc[idx, 'labels'])

        return item

# Load Dataset Function
def load_data(csv_path, image_dir, tokenizer, max_len, batch_size, has_labels=True):
    data = pd.read_csv(csv_path)

    if has_labels:
        data['labels'] = data['labels'].astype(int)

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    dataset = MemeDataset(data, tokenizer, image_dir, max_len, transform, has_labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=has_labels)

# Define the Model for XLM-RoBERTa + ViT
class MultiModalModel(nn.Module):
    def __init__(self, visual_model, text_model, num_classes):
        super(MultiModalModel, self).__init__()
        # Text Encoder (XLM-RoBERTa)
        self.text_model = text_model
        self.text_fc = nn.Linear(768, 256)  # Reduce text features to 256 dimensions
        
        # Image Encoder (ViT)
        self.visual_model = visual_model
        self.visual_fc = nn.Linear(768, 256)  # ViT output is 768
        
        # Classifier
        self.fc = nn.Sequential(
            nn.Linear(512, 128),  # Concatenated features (256 + 256)
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        # Text Features
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state.mean(dim=1)  # Average pooling
        text_features = self.text_fc(text_features)
        
        # Image Features
        image_outputs = self.visual_model(pixel_values=images)
        image_features = image_outputs.last_hidden_state.mean(dim=1)  # Average pooling
        image_features = self.visual_fc(image_features)
        
        # Concatenate and Classify
        combined_features = torch.cat((text_features, image_features), dim=1)
        return self.fc(combined_features)

# Validation Function for Training
def validate_model(model, val_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    # Calculate Metrics
    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

# Validation Function for Test Set
def validate_test_set(model, test_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Validating Test Set"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    # Calculate Metrics
    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

# Training Function
def train_model(model, train_loader, val_loader, epochs, optimizer, criterion):
    model.to(device)
    best_macro_f1 = 0
    results = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(images, input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validate and Log Metrics
        precision, recall, f1, accuracy = validate_model(model, val_loader)
        results.append({
            "Epoch": epoch + 1,
            "Loss": total_loss / len(train_loader),
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Accuracy": accuracy
        })

        if f1 > best_macro_f1:
            best_macro_f1 = f1
            torch.save(model.state_dict(), "best_model.pth")

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}, F1: {f1:.4f}")

    print("Training Complete!")
    return pd.DataFrame(results)

# Main Function
def main():
    # Paths
    train_csv = "/kaggle/input/tamil-labelled-dataset/Dataset with label/train/train.csv"
    train_images = "/kaggle/input/tamil-labelled-dataset/Dataset with label/train/images"
    dev_csv = "/kaggle/input/tamil-labelled-dataset/Dataset with label/dev/dev.csv"
    dev_images = "/kaggle/input/tamil-labelled-dataset/Dataset with label/dev/images"
    test_csv = "/kaggle/input/tamil-labelled-dataset/Dataset with label/test/test.csv"
    test_images = "/kaggle/input/tamil-labelled-dataset/Dataset with label/test/images"

    # Tokenizer and Hyperparameters
    tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")  # XLM-RoBERTa Tokenizer
    max_len = 128
    batch_size = 16
    epochs = 5
    learning_rate = 3e-5

    # Load Data
    train_loader = load_data(train_csv, train_images, tokenizer, max_len, batch_size, has_labels=True)
    val_loader = load_data(dev_csv, dev_images, tokenizer, max_len, batch_size, has_labels=True)
    test_loader = load_data(test_csv, test_images, tokenizer, max_len, batch_size, has_labels=True)

    # Initialize Model
    visual_model = ViTModel.from_pretrained("google/vit-base-patch16-224")  # ViT Model
    text_model = XLMRobertaModel.from_pretrained("xlm-roberta-base")  # XLM-RoBERTa Model
    model = MultiModalModel(visual_model, text_model, num_classes=2)

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Train the Model
    train_model(model, train_loader, val_loader, epochs, optimizer, criterion)

    # Load Best Model
    model.load_state_dict(torch.load("best_model.pth"))

    # Validate on Test Set
    validate_test_set(model, test_loader)

if __name__ == "__main__":
    main()

Using device: cuda


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5: 100%|██████████| 71/71 [00:54<00:00,  1.29it/s]
Validating: 100%|██████████| 18/18 [00:07<00:00,  2.51it/s]


Validation Precision: 0.7104
Validation Recall: 0.6376
Validation F1 Score: 0.6533
Validation Accuracy: 0.7746
Epoch 1, Loss: 0.4886, F1: 0.6533


Epoch 2/5: 100%|██████████| 71/71 [00:53<00:00,  1.33it/s]
Validating: 100%|██████████| 18/18 [00:06<00:00,  2.62it/s]


Validation Precision: 0.7521
Validation Recall: 0.6272
Validation F1 Score: 0.6436
Validation Accuracy: 0.7852
Epoch 2, Loss: 0.3064, F1: 0.6436


Epoch 3/5: 100%|██████████| 71/71 [00:53<00:00,  1.33it/s]
Validating: 100%|██████████| 18/18 [00:06<00:00,  2.65it/s]


Validation Precision: 0.7248
Validation Recall: 0.6666
Validation F1 Score: 0.6835
Validation Accuracy: 0.7852
Epoch 3, Loss: 0.1503, F1: 0.6835


Epoch 4/5: 100%|██████████| 71/71 [00:53<00:00,  1.32it/s]
Validating: 100%|██████████| 18/18 [00:07<00:00,  2.52it/s]


Validation Precision: 0.8245
Validation Recall: 0.6638
Validation F1 Score: 0.6904
Validation Accuracy: 0.8134
Epoch 4, Loss: 0.0748, F1: 0.6904


Epoch 5/5: 100%|██████████| 71/71 [00:54<00:00,  1.31it/s]
Validating: 100%|██████████| 18/18 [00:07<00:00,  2.57it/s]


Validation Precision: 0.7848
Validation Recall: 0.6789
Validation F1 Score: 0.7037
Validation Accuracy: 0.8099
Epoch 5, Loss: 0.0230, F1: 0.7037
Training Complete!


  model.load_state_dict(torch.load("best_model.pth"))
Validating Test Set: 100%|██████████| 23/23 [00:08<00:00,  2.63it/s]


Test Precision: 0.8240
Test Recall: 0.7453
Test F1 Score: 0.7720
Test Accuracy: 0.8483


In [13]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models
from PIL import Image
from transformers import BertModel, BertTokenizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class MemeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, image_dir, max_len, transform, has_labels):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.image_dir = image_dir
        self.max_len = max_len
        self.transform = transform
        self.has_labels = has_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = str(self.data.loc[idx, 'image_id'])
        if not img_name.endswith(".jpg"):
            img_name += ".jpg"
        img_path = os.path.join(self.image_dir, img_name)

        image = Image.open(img_path).convert("RGB")
        caption = self.data.loc[idx, 'transcriptions']

        if self.transform:
            image = self.transform(image)

        inputs = self.tokenizer(
            caption,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        item = {
            'image': image,
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'token_type_ids': inputs['token_type_ids'].squeeze()
        }

        if self.has_labels:
            item['label'] = int(self.data.loc[idx, 'labels'])

        return item

def load_data(csv_path, image_dir, tokenizer, max_len, batch_size, has_labels=True):
    data = pd.read_csv(csv_path)

    if has_labels:
        data['labels'] = data['labels'].astype(int)

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    dataset = MemeDataset(data, tokenizer, image_dir, max_len, transform, has_labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=has_labels)

class MultiModalModel(nn.Module):
    def __init__(self, num_classes):
        super(MultiModalModel, self).__init__()
        # Text Encoder (mBERT)
        self.text_model = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.text_fc = nn.Linear(768, 256)
        
        # Image Encoder (ResNet-50)
        self.visual_model = models.resnet50(pretrained=True)
        self.visual_model = nn.Sequential(*list(self.visual_model.children())[:-1])
        self.visual_fc = nn.Linear(2048, 256)
        
        # Classifier
        self.fc = nn.Sequential(
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask, token_type_ids):
        # Text Features
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        text_features = text_outputs.last_hidden_state.mean(dim=1)
        text_features = self.text_fc(text_features)
        
        # Image Features
        image_features = self.visual_model(images)
        image_features = image_features.squeeze(-1).squeeze(-1)
        image_features = self.visual_fc(image_features)
        
        # Concatenate and Classify
        combined_features = torch.cat((text_features, image_features), dim=1)
        return self.fc(combined_features)

def validate_model(model, val_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask, token_type_ids)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

def evaluate_test_set(model, test_loader):
    model.eval()
    all_labels = []
    all_preds = []
    test_loss = 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask, token_type_ids)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            
            preds = outputs.argmax(dim=1).cpu().numpy()
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    # Calculate metrics
    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)
    
    # Calculate per-class metrics
    class_precision = precision_score(all_labels, all_preds, average=None)
    class_recall = recall_score(all_labels, all_preds, average=None)
    class_f1 = f1_score(all_labels, all_preds, average=None)

    # Print results
    print("\n=== Test Set Evaluation Results ===")
    print(f"Test Loss: {test_loss/len(test_loader):.4f}")
    print(f"\nOverall Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Macro Precision: {precision:.4f}")
    print(f"Macro Recall: {recall:.4f}")
    print(f"Macro F1 Score: {f1:.4f}")
    
    print("\nPer-Class Metrics:")
    for i in range(len(class_precision)):
        print(f"\nClass {i}:")
        print(f"Precision: {class_precision[i]:.4f}")
        print(f"Recall: {class_recall[i]:.4f}")
        print(f"F1 Score: {class_f1[i]:.4f}")

    return {
        'test_loss': test_loss/len(test_loader),
        'accuracy': accuracy,
        'macro_precision': precision,
        'macro_recall': recall,
        'macro_f1': f1,
        'class_precision': class_precision.tolist(),
        'class_recall': class_recall.tolist(),
        'class_f1': class_f1.tolist()
    }

def train_model(model, train_loader, val_loader, epochs, optimizer, criterion):
    model.to(device)
    best_macro_f1 = 0
    results = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(images, input_ids, attention_mask, token_type_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validate and Log Metrics
        precision, recall, f1, accuracy = validate_model(model, val_loader)
        results.append({
            "Epoch": epoch + 1,
            "Loss": total_loss / len(train_loader),
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Accuracy": accuracy
        })

        if f1 > best_macro_f1:
            best_macro_f1 = f1
            torch.save(model.state_dict(), "best_model.pth")

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}, F1: {f1:.4f}")

    return pd.DataFrame(results)

def main():
    # Paths
    train_csv = "/kaggle/input/tamil-labelled-dataset/Dataset with label/train/train.csv"
    train_images = "/kaggle/input/tamil-labelled-dataset/Dataset with label/train/images"
    dev_csv = "/kaggle/input/tamil-labelled-dataset/Dataset with label/dev/dev.csv"
    dev_images = "/kaggle/input/tamil-labelled-dataset/Dataset with label/dev/images"
    test_csv = "/kaggle/input/tamil-labelled-dataset/Dataset with label/test/test.csv"
    test_images = "/kaggle/input/tamil-labelled-dataset/Dataset with label/test/images"

    # Hyperparameters
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    max_len = 128
    batch_size = 16
    epochs = 5
    learning_rate = 3e-5

    # Load Data
    print("Loading datasets...")
    train_loader = load_data(train_csv, train_images, tokenizer, max_len, batch_size)
    val_loader = load_data(dev_csv, dev_images, tokenizer, max_len, batch_size)
    test_loader = load_data(test_csv, test_images, tokenizer, max_len, batch_size)

    # Initialize Model
    print("Initializing model...")
    model = MultiModalModel(num_classes=2)

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Train the Model
    print("Starting training...")
    results_df = train_model(model, train_loader, val_loader, epochs, optimizer, criterion)
    
    # Save training results
    results_df.to_csv('training_results.csv', index=False)
    
    # Load best model and evaluate on test set
    print("\nLoading best model for testing...")
    model.load_state_dict(torch.load('best_model.pth'))
    
    # Final evaluation on test set
    print("Evaluating on test set...")
    test_metrics = evaluate_test_set(model, test_loader)
    
    # Save test metrics
    test_results_df = pd.DataFrame([test_metrics])
    test_results_df.to_csv('test_results.csv', index=False)
    
    return results_df, test_metrics

if __name__ == "__main__":
    train_results, test_metrics = main()

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Loading datasets...
Initializing model...


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 157MB/s]


Starting training...


Epoch 1/5: 100%|██████████| 71/71 [00:38<00:00,  1.86it/s]
Validating: 100%|██████████| 18/18 [00:05<00:00,  3.13it/s]


Validation Precision: 0.7665
Validation Recall: 0.5604
Validation F1 Score: 0.5453
Validation Accuracy: 0.7641
Epoch 1, Loss: 0.4747, F1: 0.5453


Epoch 2/5: 100%|██████████| 71/71 [00:37<00:00,  1.89it/s]
Validating: 100%|██████████| 18/18 [00:05<00:00,  3.19it/s]


Validation Precision: 0.7966
Validation Recall: 0.6411
Validation F1 Score: 0.6620
Validation Accuracy: 0.7993
Epoch 2, Loss: 0.2679, F1: 0.6620


Epoch 3/5: 100%|██████████| 71/71 [00:37<00:00,  1.88it/s]
Validating: 100%|██████████| 18/18 [00:05<00:00,  3.12it/s]


Validation Precision: 0.7865
Validation Recall: 0.7123
Validation F1 Score: 0.7354
Validation Accuracy: 0.8204
Epoch 3, Loss: 0.0742, F1: 0.7354


Epoch 4/5: 100%|██████████| 71/71 [00:37<00:00,  1.88it/s]
Validating: 100%|██████████| 18/18 [00:05<00:00,  3.20it/s]


Validation Precision: 0.7469
Validation Recall: 0.7135
Validation F1 Score: 0.7265
Validation Accuracy: 0.8028
Epoch 4, Loss: 0.0265, F1: 0.7265


Epoch 5/5: 100%|██████████| 71/71 [00:37<00:00,  1.87it/s]
Validating: 100%|██████████| 18/18 [00:05<00:00,  3.14it/s]


Validation Precision: 0.7680
Validation Recall: 0.7318
Validation F1 Score: 0.7461
Validation Accuracy: 0.8169
Epoch 5, Loss: 0.0398, F1: 0.7461

Loading best model for testing...


  model.load_state_dict(torch.load('best_model.pth'))


Evaluating on test set...


Testing: 100%|██████████| 23/23 [00:07<00:00,  3.24it/s]


=== Test Set Evaluation Results ===
Test Loss: 0.4050

Overall Metrics:
Accuracy: 0.8624
Macro Precision: 0.8209
Macro Recall: 0.8034
Macro F1 Score: 0.8114

Per-Class Metrics:

Class 0:
Precision: 0.8978
Recall: 0.9213
F1 Score: 0.9094

Class 1:
Precision: 0.7439
Recall: 0.6854
F1 Score: 0.7135





In [16]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class MemeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, image_dir, max_len, transform, has_labels):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.image_dir = image_dir
        self.max_len = max_len
        self.transform = transform
        self.has_labels = has_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = str(self.data.loc[idx, 'image_id'])
        if not img_name.endswith(".jpg"):
            img_name += ".jpg"
        img_path = os.path.join(self.image_dir, img_name)

        image = Image.open(img_path).convert("RGB")
        caption = self.data.loc[idx, 'transcriptions']

        if self.transform:
            image = self.transform(image)

        inputs = self.tokenizer(
            caption,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        item = {
            'image': image,
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'token_type_ids': inputs['token_type_ids'].squeeze()
        }

        if self.has_labels:
            item['label'] = int(self.data.loc[idx, 'labels'])

        return item

def load_data(csv_path, image_dir, tokenizer, max_len, batch_size, has_labels=True):
    data = pd.read_csv(csv_path)

    if has_labels:
        data['labels'] = data['labels'].astype(int)

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    dataset = MemeDataset(data, tokenizer, image_dir, max_len, transform, has_labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=has_labels)

class MultiModalModel(nn.Module):
    def __init__(self, num_classes):
        super(MultiModalModel, self).__init__()
        self.text_model = AutoModel.from_pretrained("google/muril-base-cased")
        self.text_fc = nn.Linear(768, 256)
        
        self.visual_model = models.resnet50(pretrained=True)
        self.visual_model = nn.Sequential(*list(self.visual_model.children())[:-1])
        self.visual_fc = nn.Linear(2048, 256)
        
        self.fusion = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
        )
        
        self.classifier = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask, token_type_ids):
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        text_features = text_outputs.last_hidden_state.mean(dim=1)
        text_features = self.text_fc(text_features)
        
        image_features = self.visual_model(images)
        image_features = image_features.squeeze(-1).squeeze(-1)
        image_features = self.visual_fc(image_features)
        
        combined_features = torch.cat((text_features, image_features), dim=1)
        fused_features = self.fusion(combined_features)
        output = self.classifier(fused_features)
        return output

def validate_model(model, val_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask, token_type_ids)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

def evaluate_model(model, test_loader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating Test Data"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)

            outputs = model(images, input_ids, attention_mask, token_type_ids)
            preds = outputs.argmax(dim=1).cpu().numpy()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")
    accuracy = accuracy_score(all_labels, all_preds)

    print("\nTest Evaluation Metrics:")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")

    return precision, recall, f1, accuracy

def train_model(model, train_loader, val_loader, epochs, optimizer, criterion, scheduler=None):
    model.to(device)
    best_macro_f1 = 0
    results = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")

        for batch in progress_bar:
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(images, input_ids, attention_mask, token_type_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            if scheduler is not None:
                scheduler.step()

            total_loss += loss.item()
            progress_bar.set_postfix({'training_loss': f'{loss.item():.3f}'})

        avg_loss = total_loss / len(train_loader)
        precision, recall, f1, accuracy = validate_model(model, val_loader)
        
        results.append({
            "Epoch": epoch + 1,
            "Loss": avg_loss,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Accuracy": accuracy
        })

        if f1 > best_macro_f1:
            best_macro_f1 = f1
            torch.save(model.state_dict(), "best_model.pth")
            print(f"New best model saved with F1: {f1:.4f}")

        print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}, F1: {f1:.4f}")

    return pd.DataFrame(results)

def main():
    # Paths
    train_csv = "/kaggle/input/tamil-labelled-dataset/Dataset with label/train/train.csv"
    train_images = "/kaggle/input/tamil-labelled-dataset/Dataset with label/train/images"
    dev_csv = "/kaggle/input/tamil-labelled-dataset/Dataset with label/dev/dev.csv"
    dev_images = "/kaggle/input/tamil-labelled-dataset/Dataset with label/dev/images"
    test_csv = "/kaggle/input/tamil-labelled-dataset/Dataset with label/test/test.csv"
    test_images = "/kaggle/input/tamil-labelled-dataset/Dataset with label/test/images"

    # Hyperparameters
    tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
    max_len = 128
    batch_size = 16
    epochs = 5
    learning_rate = 2e-5

    # Load Data
    print("Loading datasets...")
    train_loader = load_data(train_csv, train_images, tokenizer, max_len, batch_size)
    val_loader = load_data(dev_csv, dev_images, tokenizer, max_len, batch_size)
    test_loader = load_data(test_csv, test_images, tokenizer, max_len, batch_size)

    # Initialize Model
    print("Initializing model...")
    model = MultiModalModel(num_classes=2)

    # Optimizer and Scheduler
    optimizer = torch.optim.AdamW([
        {'params': model.text_model.parameters(), 'lr': learning_rate},
        {'params': model.visual_model.parameters(), 'lr': learning_rate * 10},
        {'params': list(model.text_fc.parameters()) + 
                  list(model.visual_fc.parameters()) + 
                  list(model.fusion.parameters()) + 
                  list(model.classifier.parameters()), 
         'lr': learning_rate * 100}
    ])

    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=[learning_rate, learning_rate * 10, learning_rate * 100],
        steps_per_epoch=len(train_loader),
        epochs=epochs
    )

    criterion = nn.CrossEntropyLoss()

    # Train Model
    print("Starting training...")
    results_df = train_model(model, train_loader, val_loader, epochs, optimizer, criterion, scheduler)
    results_df.to_csv('training_results.csv', index=False)

    # Load Best Model and Evaluate on Test Data
    print("Evaluating on test data...")
    model.load_state_dict(torch.load("best_model.pth"))
    evaluate_model(model, test_loader)

    return results_df

if __name__ == "__main__":
    results = main()


Using device: cuda
Loading datasets...
Initializing model...




Starting training...


Epoch 1/5: 100%|██████████| 71/71 [00:39<00:00,  1.79it/s, training_loss=0.218]
Validating: 100%|██████████| 18/18 [00:05<00:00,  3.01it/s]


Validation Precision: 0.8551
Validation Recall: 0.5922
Validation F1 Score: 0.5936
Validation Accuracy: 0.7852
New best model saved with F1: 0.5936
Epoch 1, Loss: 0.5104, F1: 0.5936


Epoch 2/5: 100%|██████████| 71/71 [00:38<00:00,  1.86it/s, training_loss=0.498]
Validating: 100%|██████████| 18/18 [00:05<00:00,  3.19it/s]


Validation Precision: 0.7707
Validation Recall: 0.5962
Validation F1 Score: 0.6018
Validation Accuracy: 0.7782
New best model saved with F1: 0.6018
Epoch 2, Loss: 0.4363, F1: 0.6018


Epoch 3/5: 100%|██████████| 71/71 [00:37<00:00,  1.88it/s, training_loss=0.357]
Validating: 100%|██████████| 18/18 [00:05<00:00,  3.23it/s]


Validation Precision: 0.7016
Validation Recall: 0.7223
Validation F1 Score: 0.7096
Validation Accuracy: 0.7641
New best model saved with F1: 0.7096
Epoch 3, Loss: 0.3534, F1: 0.7096


Epoch 4/5: 100%|██████████| 71/71 [00:38<00:00,  1.85it/s, training_loss=0.025]
Validating: 100%|██████████| 18/18 [00:05<00:00,  3.09it/s]


Validation Precision: 0.8152
Validation Recall: 0.7373
Validation F1 Score: 0.7627
Validation Accuracy: 0.8380
New best model saved with F1: 0.7627
Epoch 4, Loss: 0.1854, F1: 0.7627


Epoch 5/5: 100%|██████████| 71/71 [00:38<00:00,  1.86it/s, training_loss=0.132]
Validating: 100%|██████████| 18/18 [00:05<00:00,  3.24it/s]
  model.load_state_dict(torch.load("best_model.pth"))


Validation Precision: 0.8112
Validation Recall: 0.7306
Validation F1 Score: 0.7561
Validation Accuracy: 0.8345
Epoch 5, Loss: 0.0585, F1: 0.7561
Evaluating on test data...


Evaluating Test Data: 100%|██████████| 23/23 [00:07<00:00,  3.21it/s]



Test Evaluation Metrics:
Test Precision: 0.7712
Test Recall: 0.7285
Test F1 Score: 0.7449
Test Accuracy: 0.8230
