# Multimodal Memotion Analysis
**Final Assignment**  
Tasks:  
A. Sentiment Classification  
B. Humor/Sarcasm/Offense Detection  
C. Motivational Classification


## Configuration & Imports
Import required libraries and set up hardware acceleration

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from PIL import Image, ImageFile
from transformers import DistilBertTokenizer, DistilBertModel, ViTImageProcessor, ViTModel
import torch.nn as nn
import os
from tqdm import tqdm
import time

# Let PIL decode image files that are cut short instead of raising
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Hardware configuration: use the GPU when CUDA is available, else the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Ask cuDNN to benchmark its algorithms and keep the fastest one
torch.backends.cudnn.benchmark = True

## Dataset: DistilBERT tokenization and ViT image preprocessing


In [8]:
class MemeDataset(Dataset):
    """Meme dataset yielding tokenized text, a preprocessed image and labels.

    The labels CSV is expected to contain the columns ``text_corrected``,
    ``image_name``, ``overall_sentiment``, ``humour``, ``sarcasm``,
    ``offensive`` and ``motivational``.

    Args:
        labels_path: Path of the CSV file, read with ``pandas.read_csv``.
        image_dir: Directory that the ``image_name`` column is relative to.
        text_max_length: Length every token sequence is padded/truncated to.
    """

    # (task key in ``label_maps``, CSV column) for the multi-class tasks.
    _LABEL_COLUMNS = (
        ('sentiment', 'overall_sentiment'),
        ('humor', 'humour'),
        ('sarcasm', 'sarcasm'),
        ('offensive', 'offensive'),
    )

    def __init__(self, labels_path, image_dir, text_max_length=128):
        self.labels_df = pd.read_csv(labels_path)
        self.image_dir = image_dir

        # Data validation: a missing caption is read as NaN, and a plain
        # ``astype(str)`` would turn it into the literal word "nan" that the
        # tokenizer then treats as real text. Use an empty caption instead.
        self.labels_df['text_corrected'] = (
            self.labels_df['text_corrected'].fillna('').astype(str)
        )

        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.text_max_length = text_max_length

        self.image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')

        # Class names per task; the position in each list is the class index.
        self.label_maps = {
            'sentiment': ['very_negative', 'negative', 'neutral', 'positive', 'very_positive'],
            'humor': ['not_funny', 'funny', 'very_funny', 'hilarious'],
            'sarcasm': ['not_sarcastic', 'general', 'twisted_meaning', 'very_twisted'],
            'offensive': ['not_offensive', 'slight', 'very_offensive', 'hateful_offensive'],
            'motivational': ['not_motivational', 'motivational']
        }

    def __len__(self):
        """Return the number of rows in the labels table."""
        return len(self.labels_df)

    def _encode_labels(self, row):
        """Map the label columns of ``row`` to a dict of ``torch.long`` scalars.

        Raises:
            ValueError: If a multi-class column holds a value that is not
                listed in ``label_maps`` for its task.
        """
        labels = {}
        for task, column in self._LABEL_COLUMNS:
            value = row[column]
            try:
                class_index = self.label_maps[task].index(value)
            except ValueError:
                # Same exception type as list.index, but say which column
                # and value were at fault.
                raise ValueError(
                    f"Unknown {task} label {value!r} in column {column!r}; "
                    f"expected one of {self.label_maps[task]}"
                ) from None
            labels[task] = torch.tensor(class_index, dtype=torch.long)

        # Binary task: anything other than 'motivational' counts as class 0.
        labels['motivational'] = torch.tensor(
            1 if row['motivational'] == 'motivational' else 0,
            dtype=torch.long,
        )
        return labels

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Returns:
            A dict with ``input_ids``, ``attention_mask``, ``pixel_values``
            and ``labels`` (itself a dict of per-task class indices).
        """
        row = self.labels_df.iloc[idx]

        # Text processing
        text = str(row['text_corrected'])
        inputs = self.tokenizer(
            text,
            max_length=self.text_max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        # Image processing: close the file as soon as the pixels are decoded
        # so handles do not pile up over an epoch.
        img_path = os.path.join(self.image_dir, row['image_name'])
        with Image.open(img_path) as img:
            image = img.convert('RGB')
        pixel_values = self.image_processor(images=image, return_tensors="pt").pixel_values

        # squeeze(0) drops only the leading per-sample batch dimension; a bare
        # squeeze() would also collapse any other dimension of size 1.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'pixel_values': pixel_values.squeeze(0),
            'labels': self._encode_labels(row)
        }

## Fusion Layer with Neural Networks

In [9]:
class MultimodalModel(nn.Module):
    """Joint text/image classifier with one linear head per task.

    A DistilBERT encoder and a ViT encoder each contribute the hidden state
    at sequence position 0; the two vectors are concatenated, passed through
    a small fusion MLP and fed to five independent classification heads.
    """

    def __init__(self):
        super().__init__()

        # Pretrained encoders: text first, then image
        self.text_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.image_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

        # Fusion MLP: concatenated (768 + 768) features -> 512
        fusion_layers = [
            nn.Linear(768*2, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
        ]
        self.fusion = nn.Sequential(*fusion_layers)

        # Number of output classes for each task head
        head_sizes = {
            'sentiment': 5,
            'humor': 4,
            'sarcasm': 4,
            'offensive': 4,
            'motivational': 2,
        }
        self.classifier = nn.ModuleDict({
            task: nn.Linear(512, n_classes)
            for task, n_classes in head_sizes.items()
        })

    def forward(self, input_ids, attention_mask, pixel_values):
        """Return a dict mapping each task name to its logits tensor."""
        text_states = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        image_states = self.image_model(pixel_values=pixel_values).last_hidden_state

        # Keep only the first position of every sequence
        text_vec = text_states[:, 0, :]
        image_vec = image_states[:, 0, :]

        joint = self.fusion(torch.cat([text_vec, image_vec], dim=1))

        logits = {}
        for task in self.classifier:
            logits[task] = self.classifier[task](joint)
        return logits

## Training loop

In [10]:
def train_model(model, dataloader, optimizer, criterion, epochs=2):
    """Train ``model`` by minimising the sum of the per-task losses.

    Args:
        model: Module called as ``model(**inputs)`` that returns a dict
            mapping task name to logits.
        dataloader: Iterable of batches; each batch is a dict whose
            ``'labels'`` entry is a dict of per-task targets and whose other
            entries are the model's keyword inputs.
        optimizer: Optimizer stepped once per batch.
        criterion: Dict mapping task name to its loss callable.
        epochs: Number of passes over ``dataloader``.

    Returns:
        The same ``model`` object, after training.
    """
    model.train()

    for epoch in range(epochs):
        total_loss = 0.0
        # Count batches while iterating instead of calling len(dataloader):
        # this avoids a ZeroDivisionError on an empty loader and also works
        # for loaders that do not define __len__.
        num_batches = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")

        for batch in progress_bar:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = {k: v.to(device) for k, v in batch['labels'].items()}

            optimizer.zero_grad()
            outputs = model(**inputs)

            # Unweighted multi-task objective: every task contributes equally
            loss = sum(criterion[task](outputs[task], labels[task]) for task in outputs)
            loss.backward()
            optimizer.step()

            # .item() forces a device-to-host copy, so read it only once
            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1
            progress_bar.set_postfix({'loss': f"{batch_loss:.4f}"})

        # With no batches the average is undefined; report NaN, don't crash
        avg_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"Epoch {epoch+1} Avg Loss: {avg_loss:.4f}")

    return model

## Loading the dataset, training the model and saving the weights

In [11]:
# Data: labelled memes served in shuffled mini-batches of 16
dataset = MemeDataset(labels_path='labels.csv', image_dir='images')
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Model on the selected device; AdamW updates every parameter of the model
model = MultimodalModel()
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# One separate cross-entropy loss object per task
criterion = {
    task: nn.CrossEntropyLoss()
    for task in ('sentiment', 'humor', 'sarcasm', 'offensive', 'motivational')
}

# Train, then write the learned weights to disk
trained_model = train_model(model, dataloader, optimizer, criterion)
torch.save(trained_model.state_dict(), 'memotion_model.pth')
print("Model saved successfully!")

Epoch 1/2: 100%|██████████| 437/437 [25:00<00:00,  3.43s/it, loss=5.6098]


Epoch 1 Avg Loss: 5.6426


Epoch 2/2: 100%|██████████| 437/437 [24:25<00:00,  3.35s/it, loss=5.6576]


Epoch 2 Avg Loss: 5.5518
Model saved successfully!
