# Necessary Imports

In [None]:
import json
import torch
import time
import os
import numpy as np
from PIL import Image
from collections import Counter
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from torchvision.models import resnet50, ResNet50_Weights
from torchvision import transforms
from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm
import sys

# 1. Setup Device
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")

Device: cuda


# Image-preprocessing + Data augmentation

In [None]:
# 2. Define Transforms
# Both pipelines resize straight to 224x224 and normalise with the ImageNet
# channel statistics so the inputs match what the pretrained ResNet50 expects.
_INPUT_SIZE = (224, 224)
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]

# Validation: deterministic preprocessing only.
val_transform = transforms.Compose([
    transforms.Resize(_INPUT_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
])

# Training: same preprocessing plus augmentation (random horizontal flip and
# colour jitter) applied before conversion to a tensor.
train_transform = transforms.Compose([
    transforms.Resize(_INPUT_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.3, saturation=0.3),
    transforms.ToTensor(),
    transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
])

# PyTorch Dataset class

In [None]:
class VQADataset(Dataset):
    """Dataset yielding one (image, tokenized question, answer label) sample per question.

    Args:
        image_dir: Directory containing the ``COCO_train2014_<image_id>.jpg`` files.
        questions_list: Sequence of question dicts; each is read for the keys
            ``"question_id"``, ``"image_id"`` and ``"question"``.
        ann_map: Mapping from question id to its annotation dict (read for
            ``"multiple_choice_answer"``).
        tokenizer: Callable invoked with the question text and the keyword
            arguments ``padding``, ``truncation``, ``max_length`` and
            ``return_tensors``; its result is indexed with ``'input_ids'`` and
            ``'attention_mask'``.
        answer_to_idx: Mapping from normalised (lower-cased, stripped) answer
            string to class index.
        transform: Callable applied to the loaded PIL image.
    """

    def __init__(self, image_dir, questions_list, ann_map, tokenizer, answer_to_idx, transform):
        self.image_dir = image_dir
        self.questions = questions_list
        self.ann_map = ann_map
        self.tokenizer = tokenizer
        self.answer_to_idx = answer_to_idx
        self.transform = transform

    def __len__(self):
        """Return the number of questions (one sample per question)."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Build the sample for question ``idx``.

        Returns:
            dict with keys ``"image"`` (output of ``transform``), ``"input_ids"``
            and ``"attention_mask"`` (tokenizer tensors with their leading
            dimension squeezed away) and ``"label"`` (long tensor; ``-1`` when
            the question has no annotation or its answer is not in
            ``answer_to_idx``).
        """
        q = self.questions[idx]
        qid = q["question_id"]

        # Image file names embed the image id zero-padded to 12 digits.
        img_name = f"COCO_train2014_{q['image_id']:012d}.jpg"
        img_path = os.path.join(self.image_dir, img_name)

        # Best effort: a missing or unreadable file becomes a black placeholder
        # so one bad image cannot abort a long run. `Exception` (rather than a
        # bare `except`) lets KeyboardInterrupt / SystemExit propagate, so the
        # loop stays interruptible.
        try:
            image = Image.open(img_path).convert("RGB")
        except Exception:
            image = Image.new("RGB", (224, 224))

        image = self.transform(image)

        # Pad / truncate every question to 32 tokens so samples can be batched.
        tokens = self.tokenizer(
            q["question"],
            padding="max_length",
            truncation=True,
            max_length=32,
            return_tensors="pt"
        )

        # Label: index of the 'multiple_choice_answer', or -1 when unavailable.
        ann = self.ann_map.get(qid)
        label = -1
        if ann:
            ans_str = ann['multiple_choice_answer'].lower().strip()
            label = self.answer_to_idx.get(ans_str, -1)

        return {
            "image": image,
            # squeeze(0) drops the leading dimension of the tokenizer tensors
            # (presumably the batch dimension added by return_tensors="pt").
            "input_ids": tokens['input_ids'].squeeze(0),
            "attention_mask": tokens['attention_mask'].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }

# VQA Model (ResNet + BERT)

In [None]:
# 5. VQA Model (ResNet + BERT)
class VQAModel(nn.Module):
    """Answer classifier fusing ResNet50 image features with BERT question features.

    Both modalities are projected to 1024 dimensions, combined by element-wise
    multiplication, and classified over ``num_answers`` candidate answers.

    Args:
        num_answers: Number of output classes (size of the answer vocabulary).
    """

    def __init__(self, num_answers):
        super().__init__()

        # Visual encoder: ImageNet-pretrained ResNet50 whose classification
        # head is replaced by an identity, so it emits the 2048-d features.
        self.cnn = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
        self.cnn.fc = nn.Identity()

        # Only parameters whose name contains "layer4" stay trainable; every
        # other ResNet parameter is frozen.
        for param_name, param in self.cnn.named_parameters():
            param.requires_grad = "layer4" in param_name

        # Text encoder: the first 8 encoder layers are frozen; all other BERT
        # parameters keep whatever requires_grad they were loaded with.
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        for frozen_param in self.bert.encoder.layer[:8].parameters():
            frozen_param.requires_grad = False

        # Fusion: project both modalities into a shared 1024-d space.
        self.img_proj = nn.Linear(2048, 1024)
        self.txt_proj = nn.Linear(768, 1024)

        self.dropout = nn.Dropout(0.5)

        # Classifier head on top of the fused representation.
        self.classifier = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_answers)
        )

    def forward(self, image, input_ids, attention_mask):
        """Return the answer logits for a batch of (image, question) pairs."""
        # Image branch: [B, 2048] backbone features -> [B, 1024].
        visual = self.img_proj(self.cnn(image))

        # Text branch: hidden state at sequence position 0, [B, 768] -> [B, 1024].
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0, :]
        textual = self.txt_proj(first_token_state)

        # Element-wise product fuses the two modalities.
        joint = self.dropout(visual * textual)
        return self.classifier(joint)

# Training and validation functions

In [None]:
def train_one_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader``.

    Samples whose label is ``-1`` are excluded from the loss and the accuracy.
    A batch with no valid label at all is skipped entirely (no forward pass,
    no optimiser step).

    Args:
        model: Module called as ``model(images, input_ids, attention_mask)``.
        loader: Iterable of batch dicts with the keys ``"image"``,
            ``"input_ids"``, ``"attention_mask"`` and ``"label"``.
        optimizer: Optimiser stepped once per processed batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(train_loss, train_acc)``: the mean of the per-batch losses over the
        batches that were actually optimised, and the accuracy over all
        validly labelled samples. Both are 0 when nothing was processed.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    processed_batches = 0  # batches that contributed a loss (skipped ones excluded)

    loop = tqdm(loader, desc="Training", leave=False)

    for batch in loop:
        images = batch["image"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # -1 marks samples without a usable answer label.
        mask = labels != -1
        if not mask.any():
            continue

        optimizer.zero_grad()
        outputs = model(images, input_ids, attention_mask)

        loss = criterion(outputs[mask], labels[mask])
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        processed_batches += 1

        # Running accuracy over the labelled samples seen so far.
        preds = outputs.argmax(dim=1)
        correct += (preds[mask] == labels[mask]).sum().item()
        total += mask.sum().item()
        current_acc = correct / total if total > 0 else 0

        # Live progress-bar update.
        loop.set_postfix(loss=batch_loss, acc=f"{current_acc:.4f}")

    train_acc = correct / total if total > 0 else 0
    # Average over processed batches only: dividing by len(loader) would
    # under-report the loss when batches were skipped and would raise
    # ZeroDivisionError for an empty loader.
    train_loss = total_loss / processed_batches if processed_batches > 0 else 0.0

    return train_loss, train_acc


# Validation fn
def validate(model, loader, device):
    """Return the accuracy of ``model`` over the labelled samples in ``loader``.

    The model is put in eval mode and run without gradient tracking. Samples
    labelled ``-1`` are ignored; a batch containing only such samples is
    skipped without a forward pass. Returns 0 when no labelled sample is seen.
    """
    model.eval()
    n_correct = 0
    n_labelled = 0

    # Progress bar for validation as well.
    progress = tqdm(loader, desc="Validating", leave=False)

    with torch.no_grad():
        for batch in progress:
            moved = {
                key: batch[key].to(device)
                for key in ("image", "input_ids", "attention_mask", "label")
            }
            targets = moved["label"]

            has_label = targets != -1
            if not has_label.any():
                continue

            logits = model(moved["image"], moved["input_ids"], moved["attention_mask"])
            predicted = logits.argmax(dim=1)

            n_correct += (predicted[has_label] == targets[has_label]).sum().item()
            n_labelled += has_label.sum().item()

    if n_labelled > 0:
        return n_correct / n_labelled
    return 0

In [None]:
#  Helper: Building Answer Vocabulary
def build_answer_vocab(annotations, top_k=1000):
    """Map the ``top_k`` most frequent answers to consecutive class indices.

    Every entry of each annotation's ``"answers"`` list is counted after
    lower-casing and stripping surrounding whitespace.

    Args:
        annotations: Iterable of annotation dicts, each with an ``"answers"``
            list of dicts carrying an ``'answer'`` string.
        top_k: Maximum number of answers to keep.

    Returns:
        dict mapping answer string to index, where index 0 is the most
        frequent answer.
    """
    frequencies = Counter(
        entry['answer'].lower().strip()
        for annotation in annotations
        for entry in annotation["answers"]
    )

    vocab = {}
    for rank, (answer, _count) in enumerate(frequencies.most_common(top_k)):
        vocab[answer] = rank
    return vocab

# Final Execution / Training

In [None]:
# --- CONFIGURATION ---
IMG_DIR = "/kaggle/input/mini-coco2014-dataset-for-image-captioning/Images"
QUEST_PATH = "/kaggle/input/vqatext/v2_Questions_Train_mscoco/v2_OpenEnded_mscoco_train2014_questions.json"
ANNOT_PATH = "/kaggle/input/vqatext/v2_Annotations_Train_mscoco/v2_mscoco_train2014_annotations.json"
TOP_K = 1000
BATCH_SIZE = 32

# 1. Load Data
print("Loading JSON data...")
with open(QUEST_PATH, 'r') as f:
    questions_json = json.load(f)
    all_questions = questions_json["questions"]

with open(ANNOT_PATH, 'r') as f:
    annotations_json = json.load(f)
    all_annotations = annotations_json["annotations"]

# Map question_id to annotation
ann_map = {ann["question_id"]: ann for ann in all_annotations}

# 2. Build Vocabulary
print("Building Vocabulary...")
answer_to_idx = build_answer_vocab(all_annotations, TOP_K)
num_classes = len(answer_to_idx)
print(f"Vocab size: {num_classes}")

# 3. Train/Test Split
# We split the list of questions, not the file paths
print("Splitting data...")
train_questions, val_questions = train_test_split(
    all_questions, 
    test_size=0.2, 
    random_state=42
)
print(f"Train size: {len(train_questions)}, Val size: {len(val_questions)}")

# 4. Create Datasets & Loaders
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_dataset = VQADataset(IMG_DIR, train_questions, ann_map, tokenizer, answer_to_idx, train_transform)
val_dataset = VQADataset(IMG_DIR, val_questions, ann_map, tokenizer, answer_to_idx, val_transform)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# 5. Initialize Model
# model = VQAModel(num_answers=num_classes).to(device)

model = VQAModel(num_answers=num_classes) # Don't send to .to(device) yet

# Check if multiple GPUs are available
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    # This wraps the model to split batches across both GPUs
    model = nn.DataParallel(model)

# Now send the model to the primary device (cuda:0)
model = model.to(device)


criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)

# 6. Run Training
num_epochs = 6 # Reduced for testing, increase later
best_val_acc = 0.0


print("Starting training...")

for epoch in range(num_epochs):
    
    # Train
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
    
    # Validate
    val_acc = validate(model, val_loader, device)
    
    # Clean print after the progress bars are done
    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    # torch.save(model.state_dict(), "vqa_last.pth")
    torch.save(model.module.state_dict(), "vqa_last.pth")
    print("saved last model")
    
    # Save checkpoints
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # torch.save(model.state_dict(), "vqa_best.pth")
        torch.save(model.module.state_dict(), "vqa_best.pth")
        print(" -> Saved best model.")
    
    !zip model.zip vqa_best.pth vqa_last.pth

Loading JSON data...
Building Vocabulary...
Vocab size: 1000
Splitting data...
Train size: 355005, Val size: 88752


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth


100%|██████████| 97.8M/97.8M [00:00<00:00, 185MB/s] 


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Using 2 GPUs!
Starting training...
Starting training...


Training:   0%|          | 0/11094 [00:00<?, ?it/s]

Validating:   0%|          | 0/2774 [00:00<?, ?it/s]

Epoch 1/6 | Train Loss: 2.5254 | Train Acc: 0.3180 | Val Acc: 0.3629
saved last model
 -> Saved best model.
  adding: vqa_best.pth (deflated 7%)
  adding: vqa_last.pth (deflated 7%)


Training:   0%|          | 0/11094 [00:00<?, ?it/s]

Validating:   0%|          | 0/2774 [00:00<?, ?it/s]

Epoch 2/6 | Train Loss: 2.0148 | Train Acc: 0.3642 | Val Acc: 0.3790
saved last model
 -> Saved best model.
updating: vqa_best.pth (deflated 7%)
updating: vqa_last.pth (deflated 7%)


Training:   0%|          | 0/11094 [00:00<?, ?it/s]

In [7]:
import json
from IPython.display import FileLink

# 1. Write the answer vocabulary to a JSON file in the kernel's working directory.
#    `answer_to_idx` must already exist in this kernel session; it is created by
#    the training cell.
with open('vocab.json', 'w') as vocab_file:
    vocab_file.write(json.dumps(answer_to_idx))

# 2. Show a download link for the file just written.
FileLink('vocab.json')

NameError: name 'answer_to_idx' is not defined