In [1]:
# import libraries

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoTokenizer, AutoModel
from torchvision import transforms, models
import pandas as pd
import os
import numpy as np
import logging

In [3]:
class MultimodalDataset(Dataset):
    """Training dataset that pairs an image with its tokenized description.

    Every row of ``dataframe`` must provide the columns ``image_name``,
    ``description``, ``target`` and ``object``.

    Args:
        dataframe: pandas DataFrame with one row per sample.
        image_folder: directory containing the files listed in ``image_name``.
        tokenizer: callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, return_tensors="pt", max_length=...)`` that returns
            a mapping with ``input_ids`` and ``attention_mask`` tensors.
        transform: callable applied to the RGB PIL image.
        max_length: length every description is padded / truncated to.
    """

    def __init__(self, dataframe, image_folder, tokenizer, transform, max_length=128):
        self.dataframe = dataframe
        self.image_folder = image_folder
        self.tokenizer = tokenizer
        self.transform = transform
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (samples) in the dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx``.

        Returns:
            A tuple ``(image, input_ids, attention_mask, category, label)`` where
            ``image`` is the transformed image, ``input_ids`` / ``attention_mask``
            are the tokenizer outputs without their leading batch dimension,
            ``category`` is the raw value of the ``object`` column and ``label``
            is the ``target`` value wrapped in a tensor.
        """
        # fetch the row once instead of re-indexing the dataframe per column
        row = self.dataframe.iloc[idx]
        image_name = row['image_name']
        text = row['description']
        label = row['target']
        category = row['object']

        # load the image; the context manager closes the underlying file as soon
        # as the pixel data has been copied by convert(), so handles do not pile up
        img_path = f"{self.image_folder}/{image_name}"
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        image = self.transform(image)

        # tokenize the text to a fixed length
        tokens = self.tokenizer(text, padding='max_length', truncation=True,
                                return_tensors="pt", max_length=self.max_length)
        input_ids = tokens['input_ids'].squeeze(0)  # remove the batch dimension
        attention_mask = tokens['attention_mask'].squeeze(0)  # remove the batch dimension

        return image, input_ids, attention_mask, category, torch.tensor(label)


In [4]:
# Image preprocessing: resize to the 224x224 input size, convert to a tensor,
# then normalize each channel with the fixed mean / std below.
image_size = (224, 224)
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

preprocessing_steps = [
    transforms.Resize(image_size),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
image_transform = transforms.Compose(preprocessing_steps)

In [5]:
# RoBERTa tokenizer shared by the training and test datasets
roberta_checkpoint = 'roberta-large'
tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]



In [6]:
# Locations of the training annotations and the training images
train_csv_path = '/kaggle/input/multimodal-classification-tc-sep-2024/dataset/train.csv'
train_image_folder = '/kaggle/input/multimodal-classification-tc-sep-2024/dataset/images/train'

# Read the annotations and wrap them, together with the images, in the custom dataset
train_df = pd.read_csv(train_csv_path)
train_dataset = MultimodalDataset(
    train_df,
    image_folder=train_image_folder,
    tokenizer=tokenizer,
    transform=image_transform,
)

# Shuffled mini-batches of 32 samples for training
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [7]:
# image processing network

class ImageEncoder(nn.Module):
    """Pre-trained ConvNeXt-Large backbone that turns images into flat feature vectors.

    The last child module of the torchvision model (its classification head) is
    dropped, so ``forward`` returns the remaining modules' output flattened to
    ``(batch, features)``.
    """

    def __init__(self):
        super().__init__()
        # `pretrained=True` is deprecated in torchvision in favour of the
        # `weights=` enum; IMAGENET1K_V1 is the checkpoint it used to resolve to.
        backbone = models.convnext_large(weights=models.ConvNeXt_Large_Weights.IMAGENET1K_V1)
        # keep every child except the final classification layer
        self.model = nn.Sequential(*list(backbone.children())[:-1])

    def forward(self, images):
        """Return one flattened feature vector per image in the batch."""
        img_features = self.model(images)  # extract features
        # flatten everything after the batch dimension (also valid for
        # non-contiguous outputs, unlike .view)
        return torch.flatten(img_features, start_dim=1)

In [8]:
# text encoder network

class TextEncoder(nn.Module):
    """Wraps a pre-trained ``roberta-large`` model and exposes its pooled output."""

    def __init__(self):
        super().__init__()
        self.model = RobertaModel.from_pretrained('roberta-large')

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return the ``pooler_output`` of its result."""
        encoded = self.model(attention_mask=attention_mask, input_ids=input_ids)
        return encoded.pooler_output

In [10]:
# mapping from object category name to its one-hot index
object_classes = {"cat": 0,
                  "car": 1,
                  "person": 2,
                  "house": 3,
                  "bus": 4}

def create_one_hot_tensor(categories):
    """One-hot encode a sequence of category names.

    Args:
        categories: sized iterable of keys of ``object_classes``.

    Returns:
        A float32 tensor of shape ``(len(categories), len(object_classes))``
        with a single 1 per row at the index of that row's category.

    Raises:
        KeyError: if a category is not present in ``object_classes``.
    """
    # Build all indices first and encode them in one call. Growing a tensor with
    # torch.cat inside a loop is quadratic, and it mixed a float32 accumulator
    # with int64 rows.
    indices = torch.tensor([object_classes[name] for name in categories], dtype=torch.long)
    one_hot = torch.nn.functional.one_hot(indices, num_classes=len(object_classes))
    return one_hot.to(torch.float32)

In [11]:
class MultimodalClassifier(nn.Module):
    """Classifier that fuses image and text features with a small MLP head.

    The image encoder lives on ``image_device``; the text encoder and the
    fully connected head live on ``text_device``. Inputs are moved to the
    right device inside ``forward``, so callers may pass tensors from any device.

    Args:
        num_classes: number of output logits.
        image_device: device for the image encoder (default ``"cuda:1"``).
        text_device: device for the text encoder and the head (default ``"cuda:0"``).

    Note:
        The devices are fixed at construction time; calling ``.to()`` on the
        whole module afterwards does not update them.
    """

    def __init__(self, num_classes, image_device="cuda:1", text_device="cuda:0"):
        super().__init__()
        self.image_device = torch.device(image_device)
        self.text_device = torch.device(text_device)
        # image encoder
        self.image_encoder = ImageEncoder().to(self.image_device)
        # text encoder
        self.text_encoder = TextEncoder().to(self.text_device)
        # fully connected layers to combine image and text features;
        # 2560 is the width of the concatenated features
        # (expected: 1536 from ConvNeXt-Large + 1024 from RoBERTa-large)
        self.fc = nn.Sequential(
            nn.Linear(2560, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes),
        ).to(self.text_device)

    def forward(self, images, input_ids, attention_mask, category):
        """Return class logits for a batch.

        Args:
            images: batch of image tensors for the image encoder.
            input_ids: token ids for the text encoder.
            attention_mask: attention mask matching ``input_ids``.
            category: accepted for interface compatibility; currently unused.

        Returns:
            Logits of shape ``(batch, num_classes)`` on ``text_device``.
        """
        # extract features from images and text on their respective devices
        img_features = self.image_encoder(images.to(self.image_device))
        text_features = self.text_encoder(input_ids.to(self.text_device),
                                          attention_mask.to(self.text_device))
        # bring the image features next to the text features and concatenate
        combined_features = torch.cat((img_features.to(self.text_device), text_features), dim=1)
        # pass through the fully connected layers
        return self.fc(combined_features)


In [12]:
# Silence the transformers loggers: everything below ERROR is dropped.

# materialize a Logger object for every name currently registered
loggers = list(map(logging.getLogger, logging.root.manager.loggerDict))

# raise the threshold on each logger whose name mentions "transformers"
for transformers_logger in filter(lambda lg: "transformers" in lg.name.lower(), loggers):
    transformers_logger.setLevel(logging.ERROR)

In [None]:
# instantiate the model, loss function, and optimizer
model = MultimodalClassifier(num_classes=4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)

# training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for images, input_ids, attention_mask, category, labels in train_loader:
        optimizer.zero_grad()

        # forward pass; the model moves each input to the device of the
        # sub-network that consumes it, so no transfer is needed here
        outputs = model(images, input_ids, attention_mask, category)
        # put the labels wherever the logits ended up instead of assuming
        # the default CUDA device
        loss = criterion(outputs, labels.to(outputs.device))

        # backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # print the average loss for this epoch (guard against an empty loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/max(len(train_loader), 1)}")

Downloading: "https://download.pytorch.org/models/convnext_large-ea097f82.pth" to /root/.cache/torch/hub/checkpoints/convnext_large-ea097f82.pth
100%|██████████| 755M/755M [00:04<00:00, 186MB/s]  


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [None]:
class TestDataset(Dataset):
    """Inference dataset: like the training dataset but without a label.

    Every row of ``dataframe`` must provide the columns ``image_name``,
    ``description`` and ``object``.

    Args:
        dataframe: pandas DataFrame with one row per sample.
        image_folder: directory containing the files listed in ``image_name``.
        tokenizer: callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, return_tensors="pt", max_length=...)`` that returns
            a mapping with ``input_ids`` and ``attention_mask`` tensors.
        transform: callable applied to the RGB PIL image.
        max_length: length every description is padded / truncated to.
    """

    def __init__(self, dataframe, image_folder, tokenizer, transform, max_length=128):
        self.dataframe = dataframe
        self.image_folder = image_folder
        self.tokenizer = tokenizer
        self.transform = transform
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (samples) in the dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx``.

        Returns:
            A tuple ``(image, input_ids, attention_mask, category, image_name)``;
            ``image_name`` is passed through so predictions can be written to
            the submission file.
        """
        # fetch the row once instead of re-indexing the dataframe per column
        row = self.dataframe.iloc[idx]
        image_name = row['image_name']
        text = row['description']
        category = row['object']

        # load the image; the context manager closes the underlying file as soon
        # as the pixel data has been copied by convert()
        img_path = f"{self.image_folder}/{image_name}"
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        image = self.transform(image)

        # tokenize the text to a fixed length
        tokens = self.tokenizer(text, padding='max_length', truncation=True,
                                return_tensors="pt", max_length=self.max_length)
        input_ids = tokens['input_ids'].squeeze(0)  # remove the batch dimension
        attention_mask = tokens['attention_mask'].squeeze(0)  # remove the batch dimension

        return image, input_ids, attention_mask, category, image_name  # Return image_name for the submission file


In [None]:
# Locations of the test annotations and the test images
test_csv_path = '/kaggle/input/multimodal-classification-tc-sep-2024/dataset/test.csv'
test_image_folder = '/kaggle/input/multimodal-classification-tc-sep-2024/dataset/images/test'

# Read the annotations and wrap them, together with the images, in the test dataset
test_df = pd.read_csv(test_csv_path)
test_dataset = TestDataset(
    test_df,
    image_folder=test_image_folder,
    tokenizer=tokenizer,
    transform=image_transform,
)

# Keep the original row order so predictions line up with the test file
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# switch to evaluation mode for inference
model.eval()

predictions = []

with torch.no_grad():
    for images, input_ids, attention_mask, category, image_names in test_loader:
        # forward pass to get predictions; the model moves each input to the
        # device of the sub-network that consumes it, so calling .cuda() here
        # would only add an extra copy through the default GPU
        outputs = model(images, input_ids, attention_mask, category)

        # predicted class = index of the highest logit; tolist() copies the whole
        # batch to the host once instead of one .item() sync per sample
        predicted_labels = outputs.argmax(dim=1).tolist()

        # store the image_name and the corresponding predicted label
        predictions.extend(
            {'image_name': image_name, 'target': predicted_label}
            for image_name, predicted_label in zip(image_names, predicted_labels)
        )

In [None]:
# Turn the collected prediction records into a table and write it out as CSV
# (without the index column).
submission_path = 'submission.csv'
submission_df = pd.DataFrame(data=predictions)
submission_df.to_csv(submission_path, index=False)