## Step 1: Import Libraries & Define Paths

In [None]:
import json
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import BlipProcessor, BlipForQuestionAnswering

## Check GPU Availability and Select the Device

In [None]:
# Query CUDA availability once and reuse the answer for reporting and device selection
cuda_available = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_available else "No GPU found"
print("CUDA Available:", cuda_available)
print("GPU Name:", gpu_name)

# Device that tensors and the model are moved to later on
if cuda_available:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Define Dataset Paths

In [None]:
# All dataset files live under one Kaggle input directory
DATASET_ROOT = "/kaggle/input/vqa-dataset"

# Question / annotation JSON files for the train and validation splits
TRAIN_Q_PATH = f"{DATASET_ROOT}/v2_Questions_Train_mscoco/v2_OpenEnded_mscoco_train2014_questions.json"
TRAIN_A_PATH = f"{DATASET_ROOT}/v2_Annotations_Train_mscoco/v2_mscoco_train2014_annotations.json"
VAL_Q_PATH = f"{DATASET_ROOT}/v2_Questions_Val_mscoco/v2_OpenEnded_mscoco_val2014_questions.json"
VAL_A_PATH = f"{DATASET_ROOT}/v2_Annotations_Val_mscoco/v2_mscoco_val2014_annotations.json"

# Image folders (trailing slash kept on purpose)
TRAIN_IMAGE_FOLDER = f"{DATASET_ROOT}/train2014_2/train2014/"
VAL_IMAGE_FOLDER = f"{DATASET_ROOT}/val2014/val2014/"

### Load JSON Files

In [None]:
def load_json(filepath):
    """Read a JSON file and return the parsed Python object.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Pin the encoding: without it, decoding depends on the platform locale
    # and non-ASCII question text can be garbled or fail to decode.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
def json_to_dataframe(questions, annotations):
    """Join question records with their annotation records.

    Args:
        questions: Mapping whose ``'questions'`` entry is a list of records.
        annotations: Mapping whose ``'annotations'`` entry is a list of records.

    Returns:
        A DataFrame holding the inner join of both record lists on
        ``question_id``. Columns present on both sides (other than the key)
        receive pandas' default ``_x`` / ``_y`` suffixes.
    """
    left = pd.DataFrame(questions['questions'])
    right = pd.DataFrame(annotations['annotations'])
    return left.merge(right, on='question_id', how='inner')

def get_image_path(image_id, dataset_type="train"):
    """Build the path of a COCO 2014 image file.

    Args:
        image_id: Integer image id; zero-padded to 12 digits in the file name.
        dataset_type: Split name embedded in the file name. ``"train"`` selects
            ``TRAIN_IMAGE_FOLDER``; any other value selects ``VAL_IMAGE_FOLDER``.

    Returns:
        The folder joined with ``COCO_<dataset_type>2014_<image_id>.jpg``.
    """
    if dataset_type == "train":
        base_dir = TRAIN_IMAGE_FOLDER
    else:
        base_dir = VAL_IMAGE_FOLDER
    return os.path.join(base_dir, f"COCO_{dataset_type}2014_{image_id:012d}.jpg")

## Load and Preprocess Data

In [None]:
# Parse the question and annotation files of each split (train first, then validation)
train_questions, train_annotations = load_json(TRAIN_Q_PATH), load_json(TRAIN_A_PATH)
val_questions, val_annotations = load_json(VAL_Q_PATH), load_json(VAL_A_PATH)


In [None]:
# Per split: merge questions with annotations, restore the plain `image_id`
# column name (the merge suffixes it to `image_id_x`), keep only the fields
# the dataset needs, drop incomplete rows and lowercase the question text.
train_df = (
    json_to_dataframe(train_questions, train_annotations)
    .rename(columns={'image_id_x': 'image_id'})
    .loc[:, ['image_id', 'question', 'multiple_choice_answer']]
    .dropna()
    .assign(question=lambda frame: frame['question'].str.lower())
)

val_df = (
    json_to_dataframe(val_questions, val_annotations)
    .rename(columns={'image_id_x': 'image_id'})
    .loc[:, ['image_id', 'question', 'multiple_choice_answer']]
    .dropna()
    .assign(question=lambda frame: frame['question'].str.lower())
)

In [None]:
# Sanity check: show the image path derived from the first row of each split
print(
    "Sample Train Image Path:",
    get_image_path(train_df['image_id'].iloc[0], dataset_type="train"),
)
print(
    "Sample Validation Image Path:",
    get_image_path(val_df['image_id'].iloc[0], dataset_type="val"),
)

## Define Dataset and DataLoader

In [None]:
class VQADataset(Dataset):
    """Dataset yielding processor-encoded (image, question) pairs with their answer text.

    Each row of ``dataframe`` must provide ``image_id``, ``question`` and
    ``multiple_choice_answer``. Encoding relies on the module-level
    ``processor``, which therefore has to exist before items are fetched.
    """

    def __init__(self, dataframe, dataset_type="train"):
        """Store the rows and the split name used to locate image files.

        Args:
            dataframe: Table with one question/answer pair per row.
            dataset_type: Split name forwarded to ``get_image_path``.
        """
        self.dataframe = dataframe
        self.dataset_type = dataset_type
        # Built once instead of on every __getitem__ call.
        # ToTensor converts a PIL image to a tensor scaled to [0, 1].
        self.to_tensor = transforms.ToTensor()

    def __len__(self):
        """Return the number of question/answer rows."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load and encode the sample at position ``idx``.

        Returns:
            Dict with ``"inputs"`` (the processor output for the image and
            question) and ``"labels"`` (the raw answer value from the row).
        """
        row = self.dataframe.iloc[idx]
        image_id = row['image_id']
        question = row['question']
        answer = row['multiple_choice_answer']

        image_path = get_image_path(image_id, dataset_type=self.dataset_type)
        # Close the file handle deterministically; convert() returns a new,
        # fully loaded image, so it stays valid after the file is closed.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        # Normalize image to [0, 1] range
        image = self.to_tensor(image)

        # Process image and question using the processor
        inputs = processor(images=image, text=question, return_tensors="pt", padding=True)
        return {
            "inputs": inputs,
            "labels": answer
        }

In [None]:
def custom_collate_fn(batch):
    """Merge a list of dataset samples into one padded batch.

    Args:
        batch: List of dicts, each with ``"inputs"`` (holding ``pixel_values``
            and ``input_ids`` tensors with a leading dimension of size 1) and
            ``"labels"`` (the answer text).

    Returns:
        Dict with stacked ``pixel_values``, padded question ``input_ids`` and
        their ``attention_mask``, and ``labels`` containing the padded token
        ids of the answers. Tokenization uses the module-level ``processor``.
    """
    encodings = []
    answer_texts = []
    for sample in batch:
        encodings.append(sample["inputs"])
        answer_texts.append(sample["labels"])

    # Drop the per-sample leading dimension, then stack images into one tensor
    pixel_batch = torch.stack([enc["pixel_values"].squeeze(0) for enc in encodings])

    # Questions differ in length, so let the tokenizer pad them to a common size
    question_ids = [enc["input_ids"].squeeze(0) for enc in encodings]
    question_batch = processor.tokenizer.pad(
        {"input_ids": question_ids},
        padding=True,
        return_tensors="pt",
    )

    # Answers arrive as raw text and are tokenized here, per batch
    answer_batch = processor.tokenizer(
        answer_texts,
        return_tensors="pt",
        padding=True,
    )

    collated = {
        "pixel_values": pixel_batch,
        "input_ids": question_batch["input_ids"],
        "attention_mask": question_batch["attention_mask"],
        "labels": answer_batch["input_ids"],
    }
    return collated

In [None]:
# Wrap each split's DataFrame in a dataset
train_dataset = VQADataset(dataframe=train_df, dataset_type="train")
val_dataset = VQADataset(dataframe=val_df, dataset_type="val")

# Batch with the custom collate function; only the training split is shuffled
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=custom_collate_fn,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=8,
    shuffle=False,
    collate_fn=custom_collate_fn,
)

## Load BLIP Model

In [None]:
# Pretrained BLIP VQA processor; rescaling is disabled because the dataset
# already converts images to tensors in the [0, 1] range.
processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-vqa-base",
    do_rescale=False,
)

# Pretrained BLIP VQA model, placed on the selected device
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
model = model.to(device)

## Training and Validation Function

In [None]:
def _count_exact_matches(predictions, references):
    """Count prediction/reference pairs equal after stripping and lowercasing."""
    return sum(
        1
        for predicted, expected in zip(predictions, references)
        if predicted.strip().lower() == expected.strip().lower()
    )


def _run_vqa_batch(model, batch, device):
    """Run one collated batch; return (loss, decoded predictions, decoded answers)."""
    pixel_values = batch["pixel_values"].to(device)
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

    outputs = model(
        pixel_values=pixel_values,
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels
    )

    # Generation is only used to score accuracy, so it never needs gradients.
    # The attention mask is passed so padded question tokens are ignored.
    with torch.no_grad():
        generated_ids = model.generate(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask
        )
    preds = processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    true_answers = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)
    return outputs.loss, preds, true_answers


def _plot_history(train_losses, val_losses, train_accuracies, val_accuracies):
    """Plot per-epoch loss and accuracy curves side by side and show the figure."""
    plt.figure(figsize=(12, 6))

    # Plot Loss
    plt.subplot(1, 2, 1)
    plt.plot(range(1, len(train_losses) + 1), train_losses, label="Train Loss", marker="o")
    plt.plot(range(1, len(val_losses) + 1), val_losses, label="Validation Loss", marker="o")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title("Training and Validation Loss")
    plt.legend()
    plt.grid(True)

    # Plot Accuracy
    plt.subplot(1, 2, 2)
    plt.plot(range(1, len(train_accuracies) + 1), train_accuracies, label="Train Accuracy", marker="o")
    plt.plot(range(1, len(val_accuracies) + 1), val_accuracies, label="Validation Accuracy", marker="o")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.title("Training and Validation Accuracy")
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()


def train_and_validate(train_loader, val_loader, model, device, num_epochs=10, patience=5,
                       lr=1e-4, log_every=10, log_pause=1.0, checkpoint_path="best_model.pth"):
    """Fine-tune ``model`` with early stopping and plot the training history.

    Every epoch runs one pass over ``train_loader`` (with Adam updates) and one
    pass over ``val_loader`` (without gradients). Accuracy is the fraction of
    generated answers that equal the decoded label text, ignoring case and
    surrounding whitespace. Decoding uses the module-level ``processor``.

    Args:
        train_loader: Iterable of collated training batches with
            ``pixel_values``, ``input_ids``, ``attention_mask`` and ``labels``.
        val_loader: Iterable of collated validation batches, same layout.
        model: Model exposing ``forward`` (returning ``.loss``) and ``generate``.
        device: Device the batch tensors are moved to.
        num_epochs: Maximum number of epochs.
        patience: Number of consecutive epochs without a new best validation
            loss after which training stops.
        lr: Learning rate of the Adam optimizer.
        log_every: Print progress every this many batches; a falsy value
            disables progress printing.
        log_pause: Seconds to sleep after each progress print; a falsy value
            disables the pause.
        checkpoint_path: File that receives the state dict of the best model.

    Returns:
        None. Side effects: prints progress, writes ``checkpoint_path`` and
        shows a matplotlib figure.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    best_val_loss = float("inf")
    patience_counter = 0

    # Store loss and accuracy for visualization
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        correct_train = 0
        total_train = 0

        for step, batch in enumerate(train_loader):
            loss, preds, true_answers = _run_vqa_batch(model, batch, device)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            correct_train += _count_exact_matches(preds, true_answers)
            total_train += len(true_answers)

            # Keep session alive - print every `log_every` batches
            if log_every and step % log_every == 0:
                print(f"Training Epoch [{epoch+1}/{num_epochs}], Step [{step+1}/{len(train_loader)}], Loss: {loss.item():.4f}, Acc: {correct_train / max(total_train, 1):.4f}")
                if log_pause:
                    time.sleep(log_pause)  # Small delay to avoid flooding logs

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError
        avg_train_loss = total_train_loss / max(len(train_loader), 1)
        train_acc = correct_train / max(total_train, 1)
        train_losses.append(avg_train_loss)
        train_accuracies.append(train_acc)
        print(f"Epoch [{epoch+1}/{num_epochs}] | Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.4f}")

        # Validation
        model.eval()
        total_val_loss = 0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            for val_step, batch in enumerate(val_loader):
                loss, preds, true_answers = _run_vqa_batch(model, batch, device)

                total_val_loss += loss.item()
                correct_val += _count_exact_matches(preds, true_answers)
                total_val += len(true_answers)

                # Progress is reported from the validation counters and the
                # validation step index, not from the finished training loop.
                if log_every and val_step % log_every == 0:
                    print(f"Validation Epoch [{epoch+1}/{num_epochs}], Step [{val_step+1}/{len(val_loader)}], Loss: {loss.item():.4f}, Acc: {correct_val / max(total_val, 1):.4f}")
                    if log_pause:
                        time.sleep(log_pause)  # Small delay to avoid flooding logs

        avg_val_loss = total_val_loss / max(len(val_loader), 1)
        val_acc = correct_val / max(total_val, 1)
        val_losses.append(avg_val_loss)
        val_accuracies.append(val_acc)
        print(f"Epoch [{epoch+1}/{num_epochs}] | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

        # Early Stopping: checkpoint on improvement, otherwise count towards patience
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), checkpoint_path)
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    print("Training complete!")

    # Plot Training & Validation Loss and Accuracy
    _plot_history(train_losses, val_losses, train_accuracies, val_accuracies)

# Start fine-tuning: at most 100 epochs, stopping after 10 epochs without improvement
train_and_validate(
    train_loader=train_loader,
    val_loader=val_loader,
    model=model,
    device=device,
    num_epochs=100,
    patience=10,
)


## Run Training