<a href="https://colab.research.google.com/github/Mr1-Robot/machine-learning-alternative-assessment/blob/main/alternative_assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Alternative Assessment - WOA7015
#####MUAAMAR MOHAMMED ABDULLAH AL-GHRAIRI - 24084470

In [1]:
# Mount google drive to access the dataset
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Core libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data processing
import pandas as pd
import numpy as np
from PIL import Image

# Utils
from tqdm import tqdm
import json
import pickle
import os
from datetime import datetime

print("-"*30)
print("SETUP")
print("-"*30)
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
  print(f"GPU: {torch.cuda.get_device_name(0)}")
print("-"*30)


------------------------------
SETUP
------------------------------
PyTorch Version: 2.9.0+cu126
CUDA available: True
GPU: Tesla T4
------------------------------


Load & Explore **Dataset**

In [3]:
# Define paths
DATA_DIR = '/content/drive/MyDrive/alternative-assessment-materials/'
JSON_PATH = f"{DATA_DIR}VQA_RAD_Dataset.json"
IMAGE_DIR = f"{DATA_DIR}VQA_RAD_Images"

print("-"*30)
print("Loading VQA-RAD Dataser")
print("-"*30)
print(f"JSON file: {JSON_PATH}")
print(f"Images folder: {IMAGE_DIR}")
print("-"*30)

# Load the JSON file into a pandas Dataframe
vqa_data = pd.read_json(JSON_PATH)

print(f"\nDataset loaded successfully!")
print(f"Total Q&A pairs: {len(vqa_data)}")
print("\nDataFrame columns:")
print(vqa_data.columns.tolist())
print("\nFirst 3 rows:")
print(vqa_data.head(3))

print("-"*30)
print("DATASET STATISTICS")
print("-"*30)

# Answer types distribution
answer_types = vqa_data['answer_type'].value_counts()
print("\nAnswer Type Distribution:")
for answer_type, count in answer_types.items():
    percentage = (count / len(vqa_data)) * 100
    print(f"  {answer_type}: {count} ({percentage:.1f}%)")

# For closed-ended questions (yes/no)
closed_data = vqa_data[vqa_data['answer_type'] == 'CLOSED']
print(f"\nClosed-ended questions: {len(closed_data)}")

# Yes/No distribution
answer_dist = closed_data['answer'].value_counts()
print("\nYes/No Distribution:")
for answer, count in answer_dist.items():
    percentage = (count / len(closed_data)) * 100
    print(f"  {answer}: {count} ({percentage:.1f}%)")

# Organ distribution
if 'phrase_type' in vqa_data.columns:
    organ_dist = vqa_data['phrase_type'].value_counts()
    print("\nOrgan Distribution:")
    for organ, count in organ_dist.items():
        percentage = (count / len(vqa_data)) * 100
        print(f"  {organ}: {count} ({percentage:.1f}%)")

print("-"*30)

------------------------------
Loading VQA-RAD Dataser
------------------------------
JSON file: /content/drive/MyDrive/alternative-assessment-materials/VQA_RAD_Dataset.json
Images folder: /content/drive/MyDrive/alternative-assessment-materials/VQA_RAD_Images
------------------------------

Dataset loaded successfully!
Total Q&A pairs: 2248

DataFrame columns:
['qid', 'phrase_type', 'qid_linked_id', 'image_case_url', 'image_name', 'image_organ', 'evaluation', 'question', 'question_rephrase', 'question_relation', 'question_frame', 'question_type', 'answer', 'answer_type']

First 3 rows:
   qid phrase_type                         qid_linked_id  \
0    0    freeform  03f451ca-de62-4617-9679-e836026a7642   
1    1    freeform  06e26b2c-04b9-42bc-8e98-1de30a0f7682   
2    2    freeform  0d0e8b6b-7753-4788-9b6d-dc7f25250c3f   

                                      image_case_url       image_name  \
0  https://medpix.nlm.nih.gov/case?id=48e1dd0e-85...  synpic54610.jpg   
1  https://medpix.nl

###Create TRAIN/VAL/TEST splits

In [4]:
from sklearn.model_selection import train_test_split

# Filter closed-ended questions only
closed_data = vqa_data[vqa_data['answer_type'] == 'CLOSED'].copy()

# Convery yes/no to binary labels (0=no, 1=yes)
closed_data['label'] = closed_data['answer'].apply(lambda x: 1 if x.lower() == 'yes' else 0)

print("-"*30)
print("Creating TEST/VAL/TEST splits")
print("-"*30)
print(f"Total closed-ended samples: {len(closed_data)}")
print(f"Yes: {closed_data['label'].sum()}, No: {len(closed_data) - closed_data['label'].sum()}")

# Set random seed for reproducibility
np.random.seed(42)

# Split: 70% train, 15% val, 15% test (stratified by label)
train_val, test = train_test_split(
    closed_data,
    test_size=0.15,          # 15% for test
    random_state=42,
    stratify=closed_data['label']  # Maintain yes/no ratio
)

train, val = train_test_split(
    train_val,
    test_size=0.176,         # 15% of total (0.15/0.85 ≈ 0.176)
    random_state=42,
    stratify=train_val['label']
)

# Reset indices
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)

print(f"\nSplit sizes:")
print(f"  Train: {len(train)} samples ({len(train)/len(closed_data)*100:.1f}%)")
print(f"  Val:   {len(val)} samples ({len(val)/len(closed_data)*100:.1f}%)")
print(f"  Test:  {len(test)} samples ({len(test)/len(closed_data)*100:.1f}%)")

print(f"\nClass distribution:")
print(f"  Train - Yes: {train['label'].sum()}, No: {len(train)-train['label'].sum()}")
print(f"  Val   - Yes: {val['label'].sum()}, No: {len(val)-val['label'].sum()}")
print(f"  Test  - Yes: {test['label'].sum()}, No: {len(test)-test['label'].sum()}")
print("-"*30)

# Save split indices for reproducibility
SAVE_DIR = '/content/drive/MyDrive/alternative-assessment-materials/VQA_RAD_LOCKED_RESULTS/'
os.makedirs(SAVE_DIR, exist_ok=True)

split_indices = {
    'train_indices': train.index.tolist(),
    'val_indices': val.index.tolist(),
    'test_indices': test.index.tolist()
}

with open(f'{SAVE_DIR}/split_indices.json', 'w') as f:
    json.dump(split_indices, f)

print(f"Split indices saved to: {SAVE_DIR}split_indices.json")

------------------------------
Creating TEST/VAL/TEST splits
------------------------------
Total closed-ended samples: 1297
Yes: 586, No: 711

Split sizes:
  Train: 908 samples (70.0%)
  Val:   194 samples (15.0%)
  Test:  195 samples (15.0%)

Class distribution:
  Train - Yes: 410, No: 498
  Val   - Yes: 88, No: 106
  Test  - Yes: 88, No: 107
------------------------------
Split indices saved to: /content/drive/MyDrive/alternative-assessment-materials/VQA_RAD_LOCKED_RESULTS/split_indices.json


Build **Vocabulary** from questions

In [5]:
from collections import Counter

print("-"*30)
print("Building Vocabulary")
print("-"*30)

# Tokenize all training questions
all_words = []
for question in train['question']:
    # Simple tokenization: lowercase and split by spaces
    words = question.lower().split()
    all_words.extend(words)

# Count word frequencies
word_counts = Counter(all_words)
print(f"Total words in training set: {len(all_words)}")
print(f"Unique words: {len(word_counts)}")

# Create vocabulary: word -> index mapping
# Reserve indices: 0=PAD, 1=UNK (unknown words)
vocab = {'<PAD>': 0, '<UNK>': 1}
for word, count in word_counts.most_common():
    if count >= 2:  # Only include words that appear at least twice
        vocab[word] = len(vocab)

print(f"Vocabulary size (including PAD and UNK): {len(vocab)}")
print(f"\nMost common words:")
for word, count in word_counts.most_common(20):
    print(f"  '{word}': {count} times")

# Create reverse mapping: index -> word
idx_to_word = {idx: word for word, idx in vocab.items()}

print("-"*30)

# Test encoding a sample question
sample_question = train.iloc[0]['question']
sample_encoded = [vocab.get(word.lower(), vocab['<UNK>']) for word in sample_question.split()]

print(f"\nSample encoding:")
print(f"  Question: {sample_question}")
print(f"  Encoded: {sample_encoded}")
print(f"  Decoded: {' '.join([idx_to_word[idx] for idx in sample_encoded])}")
print("-"*30)

------------------------------
Building Vocabulary
------------------------------
Total words in training set: 5842
Unique words: 923
Vocabulary size (including PAD and UNK): 465

Most common words:
  'is': 627 times
  'the': 620 times
  'there': 294 times
  'this': 236 times
  'in': 179 times
  'a': 168 times
  'of': 146 times
  'are': 137 times
  'image?': 78 times
  'or': 66 times
  'image': 63 times
  'does': 55 times
  'left': 47 times
  'mass': 47 times
  'present?': 47 times
  'an': 43 times
  'air': 42 times
  'on': 41 times
  'evidence': 40 times
  'normal?': 38 times
------------------------------

Sample encoding:
  Question: Is the heart size abnormal?
  Encoded: [2, 3, 23, 121, 42]
  Decoded: is the heart size abnormal?
------------------------------


###**Create PyTorch Dataset**

In [6]:
from torchvision import transforms

class VQADataset(Dataset):
    """PyTorch Dataset for VQA-RAD"""

    def __init__(self, dataframe, vocab, image_dir, max_seq_length=20, transform=None):
        """
        Args:
            dataframe: DataFrame with columns [image_name, question, label]
            vocab: Dictionary mapping words to indices
            image_dir: Path to image folder
            max_seq_length: Maximum question length (pad/truncate)
            transform: Image transformations
        """
        self.data = dataframe
        self.vocab = vocab
        self.image_dir = image_dir
        self.max_seq_length = max_seq_length
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Get data for this index
        row = self.data.iloc[idx]
        image_name = row['image_name']
        question = row['question']
        label = row['label']

        # Load and transform image
        image_path = os.path.join(self.image_dir, image_name)
        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        # Encode question to indices
        words = question.lower().split()
        question_encoded = [self.vocab.get(word, self.vocab['<UNK>']) for word in words]

        # Pad or truncate to max_seq_length
        if len(question_encoded) < self.max_seq_length:
            question_encoded += [self.vocab['<PAD>']] * (self.max_seq_length - len(question_encoded))
        else:
            question_encoded = question_encoded[:self.max_seq_length]

        # Convert to tensors
        question_tensor = torch.tensor(question_encoded, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.long)

        return image, question_tensor, label_tensor

# Define image transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),           # Resize to 224x224 (ResNet input size)
    transforms.ToTensor(),                   # Convert to tensor [0, 1]
    transforms.Normalize(                    # Normalize with ImageNet stats
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

# Create datasets
train_dataset = VQADataset(train, vocab, IMAGE_DIR, transform=transform)
val_dataset = VQADataset(val, vocab, IMAGE_DIR, transform=transform)
test_dataset = VQADataset(test, vocab, IMAGE_DIR, transform=transform)

print("-"*30)
print("Datasets Created")
print("-"*30)
print(f"Train: {len(train_dataset)} samples")
print(f"Val:   {len(val_dataset)} samples")
print(f"Test:  {len(test_dataset)} samples")

# Test loading one sample
sample_image, sample_question, sample_label = train_dataset[0]
print(f"\nSample data shapes:")
print(f"  Image: {sample_image.shape}")        # Should be [3, 224, 224]
print(f"  Question: {sample_question.shape}")  # Should be [20]
print(f"  Label: {sample_label.item()}")       # Should be 0 or 1
print("-"*30)

------------------------------
Datasets Created
------------------------------
Train: 908 samples
Val:   194 samples
Test:  195 samples

Sample data shapes:
  Image: torch.Size([3, 224, 224])
  Question: torch.Size([20])
  Label: 0
------------------------------


**Create Dataloaders**

In [7]:
# Batch size for training
BATCH_SIZE = 32

# Create DataLoaders
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,        # Shuffle training data each epoch
    num_workers=2,       # Parallel data loading
    pin_memory=True      # Faster data transfer to GPU
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,       # Don't shuffle validation
    num_workers=2,
    pin_memory=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,       # Don't shuffle test
    num_workers=2,
    pin_memory=True
)

print("-"*30)
print("Dataloaders Created")
print("-"*30)
print(f"Train batches: {len(train_loader)} (batch size: {BATCH_SIZE})")
print(f"Val batches:   {len(val_loader)}")
print(f"Test batches:  {len(test_loader)}")

# Test loading one batch
sample_batch = next(iter(train_loader))
batch_images, batch_questions, batch_labels = sample_batch

print(f"\nSample batch shapes:")
print(f"  Images: {batch_images.shape}")        # [32, 3, 224, 224]
print(f"  Questions: {batch_questions.shape}")  # [32, 20]
print(f"  Labels: {batch_labels.shape}")        # [32]
print("-"*30)

------------------------------
Dataloaders Created
------------------------------
Train batches: 29 (batch size: 32)
Val batches:   7
Test batches:  7

Sample batch shapes:
  Images: torch.Size([32, 3, 224, 224])
  Questions: torch.Size([32, 20])
  Labels: torch.Size([32])
------------------------------


###**CNN Model Architecture (ResNet18 + BiLSTM)**

In [9]:
from torchvision import models

class CNNBaselineVQA(nn.Module):
    """CNN Baseline: ResNet18 image encoder + BiLSTM question encoder"""

    def __init__(self, vocab_size, embed_dim=256, lstm_hidden=512, img_feature_dim=512):
        super(CNNBaselineVQA, self).__init__()

        # Image encoder: Pre-trained ResNet18 (remove final FC layer)
        resnet = models.resnet18(pretrained=True)
        self.image_encoder = nn.Sequential(*list(resnet.children())[:-1])  # Output: 512-dim

        # Question encoder: Embedding + BiLSTM
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embed_dim,
            lstm_hidden,
            batch_first=True,
            bidirectional=True  # BiLSTM: 2 * lstm_hidden output
        )

        # Fusion and classification
        fusion_dim = img_feature_dim + (lstm_hidden * 2)  # 512 + 1024 = 1536
        self.classifier = nn.Sequential(
            nn.Linear(fusion_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 2)  # Binary classification: yes/no
        )

    def forward(self, images, questions):
        # Extract image features
        img_features = self.image_encoder(images)  # [batch, 512, 1, 1]
        img_features = img_features.view(img_features.size(0), -1)  # [batch, 512]

        # Extract question features
        embedded = self.embedding(questions)  # [batch, seq_len, embed_dim]
        lstm_out, (hidden, cell) = self.lstm(embedded)
        # Concatenate forward and backward hidden states
        question_features = torch.cat([hidden[0], hidden[1]], dim=1)  # [batch, 1024]

        # Fuse image and question features
        combined = torch.cat([img_features, question_features], dim=1)  # [batch, 1536]

        # Classify
        output = self.classifier(combined)  # [batch, 2]
        return output

# Create model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CNNBaselineVQA(vocab_size=len(vocab)).to(device)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print("-"*30)
print("Modal Created")
print("-"*30)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Device: {device}")
print("-"*30)

# Test forward pass
model.eval()
with torch.no_grad():
    test_images = batch_images.to(device)
    test_questions = batch_questions.to(device)
    test_output = model(test_images, test_questions)
    print(f"\nTest forward pass:")
    print(f"  Input images: {test_images.shape}")
    print(f"  Input questions: {test_questions.shape}")
    print(f"  Output logits: {test_output.shape}")  # [32, 2]
print("-"*30)



Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 191MB/s]


------------------------------
Modal Created
------------------------------
Total parameters: 15,237,442
Trainable parameters: 15,237,442
Device: cuda
------------------------------

Test forward pass:
  Input images: torch.Size([32, 3, 224, 224])
  Input questions: torch.Size([32, 20])
  Output logits: torch.Size([32, 2])
------------------------------


**Loading the Best Model**

In [11]:
checkpoint = torch.load(f'{SAVE_DIR}/cnn_best_model.pth')
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()
print(f"Loaded trained model: {checkpoint['val_acc']:.2f}% val accuracy")

Loaded trained model: 67.53% val accuracy


####**Training & Evaulation functions**

In [16]:
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    """Train for one epoch"""
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for images, questions, labels in tqdm(dataloader, desc="Training"):
        images = images.to(device)
        questions = questions.to(device)
        labels = labels.to(device)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(images, questions)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        # Statistics
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = 100 * correct / total
    return epoch_loss, epoch_acc

def evaluate(model, dataloader, criterion, device):
    """Evaluate on validation/test set"""
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, questions, labels in tqdm(dataloader, desc="Evaluating"):
            images = images.to(device)
            questions = questions.to(device)
            labels = labels.to(device)

            outputs = model(images, questions)
            loss = criterion(outputs, labels)

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = 100 * correct / total
    return epoch_loss, epoch_acc

print("-"*30)
print("TRAINING FUNCTIONS DEFINED")
print("-"*30)

------------------------------
TRAINING FUNCTIONS DEFINED
------------------------------


**Train CNN Baseline Model**

In [19]:
# Training configuration
NUM_EPOCHS = 5
LEARNING_RATE = 0.0001

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Track training history
history = {
    'train_loss': [],
    'train_acc': [],
    'val_loss': [],
    'val_acc': []
}

# Track best model
best_val_acc = 0.0
best_epoch = 0

print("-"*30)
print("Training CNN Baseline Model")
print("-"*30)
print(f"Epochs: {NUM_EPOCHS}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Optimizer: Adam")
print("-"*30)

for epoch in range(1, NUM_EPOCHS + 1):
    print(f"\nEpoch {epoch}/{NUM_EPOCHS}")
    print("-" * 30)

    # Train
    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)

    # Validate
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_epoch = epoch
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_acc': val_acc,
        }, f'{SAVE_DIR}/cnn_best_model.pth')
        print(f"*** Best model saved! (Val Acc: {val_acc:.2f}%) ***")

print("-"*30)
print("TRAINING COMPLETE")
print("-"*30)
print(f"Best validation accuracy: {best_val_acc:.2f}% (Epoch {best_epoch})")
print("-"*30)

------------------------------
Training CNN Baseline Model
------------------------------
Epochs: 5
Learning rate: 0.0001
Optimizer: Adam
------------------------------

Epoch 1/5
------------------------------


Training: 100%|██████████| 29/29 [00:15<00:00,  1.85it/s]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  2.27it/s]


Train Loss: 0.6185 | Train Acc: 64.32%
Val Loss: 0.6150 | Val Acc: 63.40%
*** Best model saved! (Val Acc: 63.40%) ***

Epoch 2/5
------------------------------


Training: 100%|██████████| 29/29 [00:11<00:00,  2.60it/s]
Evaluating: 100%|██████████| 7/7 [00:02<00:00,  3.11it/s]


Train Loss: 0.4993 | Train Acc: 74.56%
Val Loss: 0.6454 | Val Acc: 64.43%
*** Best model saved! (Val Acc: 64.43%) ***

Epoch 3/5
------------------------------


Training: 100%|██████████| 29/29 [00:16<00:00,  1.78it/s]
Evaluating: 100%|██████████| 7/7 [00:02<00:00,  3.23it/s]


Train Loss: 0.4580 | Train Acc: 77.86%
Val Loss: 0.6311 | Val Acc: 65.98%
*** Best model saved! (Val Acc: 65.98%) ***

Epoch 4/5
------------------------------


Training: 100%|██████████| 29/29 [00:15<00:00,  1.82it/s]
Evaluating: 100%|██████████| 7/7 [00:03<00:00,  2.30it/s]


Train Loss: 0.4271 | Train Acc: 78.85%
Val Loss: 0.7068 | Val Acc: 67.01%
*** Best model saved! (Val Acc: 67.01%) ***

Epoch 5/5
------------------------------


Training: 100%|██████████| 29/29 [00:15<00:00,  1.87it/s]
Evaluating: 100%|██████████| 7/7 [00:02<00:00,  3.15it/s]


Train Loss: 0.3980 | Train Acc: 80.84%
Val Loss: 0.6796 | Val Acc: 67.53%
*** Best model saved! (Val Acc: 67.53%) ***
------------------------------
TRAINING COMPLETE
------------------------------
Best validation accuracy: 67.53% (Epoch 5)
------------------------------


**Evaulate on TEST set**

In [26]:
from sklearn.metrics import confusion_matrix, classification_report

# Load best model
checkpoint = torch.load(f'{SAVE_DIR}/cnn_best_model.pth')
model.load_state_dict(checkpoint['model_state_dict'])
print(f"Loaded best model from epoch {checkpoint['epoch']} (Val Acc: {checkpoint['val_acc']:.2f}%)")

# Evaluate on test set
test_loss, test_acc = evaluate(model, test_loader, criterion, device)

print("\n")
print("-"*30)
print("Test Set Evaluation")
print("-"*30)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.2f}%")
print("-"*30)

# Get predictions for confusion matrix
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for images, questions, labels in test_loader:
        images = images.to(device)
        questions = questions.to(device)

        outputs = model(images, questions)
        _, predicted = torch.max(outputs, 1)

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.numpy())

# Confusion matrix
cm = confusion_matrix(all_labels, all_preds)
print("\nConfusion Matrix")
print("-"*30)
print("              Predicted")
print("              No    Yes")
print(f"Actual No     {cm[0][0]:<5} {cm[0][1]:<5}")
print(f"Actual Yes    {cm[1][0]:<5} {cm[1][1]:<5}")
print("-"*30)

# Classification report
report = classification_report(all_labels, all_preds, target_names=['No', 'Yes'], output_dict=True)
print("\nCalssification Report")
print("-"*30)
print(f"Class 'No':  Precision={report['No']['precision']:.2f}, "
      f"Recall={report['No']['recall']:.2f}, F1={report['No']['f1-score']:.2f}")
print(f"Class 'Yes': Precision={report['Yes']['precision']:.2f}, "
      f"Recall={report['Yes']['recall']:.2f}, F1={report['Yes']['f1-score']:.2f}")
print(f"\nAccuracy: {report['accuracy']:.2f}")
print(f"Support No: {int(report['No']['support'])}, Support Yes: {int(report['Yes']['support'])}")
print("-"*30)

Loaded best model from epoch 5 (Val Acc: 67.53%)


Evaluating: 100%|██████████| 7/7 [00:03<00:00,  2.13it/s]



------------------------------
Test Set Evaluation
------------------------------
Test Loss: 0.8236
Test Accuracy: 72.82%
------------------------------






Confusion Matrix
------------------------------
              Predicted
              No    Yes
Actual No     80    27   
Actual Yes    26    62   
------------------------------

Calssification Report
------------------------------
Class 'No':  Precision=0.75, Recall=0.75, F1=0.75
Class 'Yes': Precision=0.70, Recall=0.70, F1=0.70

Accuracy: 0.73
Support No: 107, Support Yes: 88
------------------------------


####**Saving CNN Baseline Results**

In [29]:
# Save complete results
CNN_RESULTS = {
    'model_name': 'CNN Baseline (ResNet18 + BiLSTM)',
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),

    'architecture': {
        'image_encoder': 'ResNet18 (pre-trained on ImageNet)',
        'question_encoder': 'Embedding + BiLSTM',
        'vocab_size': len(vocab),
        'embedding_dim': 256,
        'lstm_hidden': 512,
        'total_params': total_params,
        'trainable_params': trainable_params
    },

    'training': {
        'epochs': NUM_EPOCHS,
        'batch_size': BATCH_SIZE,
        'learning_rate': LEARNING_RATE,
        'optimizer': 'Adam',
        'best_epoch': best_epoch,
        'best_val_acc': round(best_val_acc, 2)
    },

    'dataset': {
        'train_samples': len(train),
        'val_samples': len(val),
        'test_samples': len(test)
    },

    'test_results': {
        'accuracy': round(test_acc, 2),
        'loss': round(test_loss, 4),
        'confusion_matrix': {
            'true_no_pred_no': int(cm[0][0]),
            'true_no_pred_yes': int(cm[0][1]),
            'true_yes_pred_no': int(cm[1][0]),
            'true_yes_pred_yes': int(cm[1][1])
        },
        'classification_report': {
            'no_precision': round(report['No']['precision'], 2),
            'no_recall': round(report['No']['recall'], 2),
            'no_f1': round(report['No']['f1-score'], 2),
            'yes_precision': round(report['Yes']['precision'], 2),
            'yes_recall': round(report['Yes']['recall'], 2),
            'yes_f1': round(report['Yes']['f1-score'], 2),
            'support_no': int(report['No']['support']),
            'support_yes': int(report['Yes']['support'])
        }
    }
}

# Save results JSON
with open(f'{SAVE_DIR}/CNN_BASELINE_RESULTS.json', 'w') as f:
    json.dump(CNN_RESULTS, f, indent=2)

# Save vocabulary
with open(f'{SAVE_DIR}/vocabulary.pkl', 'wb') as f:
    pickle.dump(vocab, f)

# Save predictions
with open(f'{SAVE_DIR}/cnn_test_predictions.pkl', 'wb') as f:
    pickle.dump({
        'predictions': all_preds,
        'ground_truth': all_labels
    }, f)

print("-"*30)
print("CNN Baseline Results Saved")
print("-"*30)
print(f"Location: {SAVE_DIR}")
print("Files saved:")
print("  - CNN_BASELINE_RESULTS.json")
print("  - cnn_best_model.pth")
print("  - vocabulary.pkl")
print("  - cnn_test_predictions.pkl")
print("-"*30)
print("\nCNN Baseline Locked:")
print(f"  Test Accuracy: {test_acc:.2f}%")
print(f"  Validation Accuracy: {best_val_acc:.2f}%")
print(f"  Test Samples: {len(test)}")
print("-"*30)

------------------------------
CNN Baseline Results Saved
------------------------------
Location: /content/drive/MyDrive/alternative-assessment-materials/VQA_RAD_LOCKED_RESULTS/
Files saved:
  - CNN_BASELINE_RESULTS.json
  - cnn_best_model.pth
  - vocabulary.pkl
  - cnn_test_predictions.pkl
------------------------------

CNN Baseline Locked:
  Test Accuracy: 72.82%
  Validation Accuracy: 67.53%
  Test Samples: 195
------------------------------


**Install BLIP-2 Dependencies**



In [12]:
!pip install transformers==4.36.2 --break-system-packages --quiet
!pip install accelerate --break-system-packages --quiet
!pip install bitsandbytes --break-system-packages --quiet

print("-"*30)
print("BLIP-2 Dependencies Installed")
print("-"*30)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m90.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 5.1.2 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.36.2 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h------------------------------
BLIP-2 Dependencies Installed
------------------------------


**Loading BLIP-2 Model**

In [13]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image

print("-"*30)
print("Loading BLIP-2 Model")
print("-"*30)
print("Model: Salesforce/blip2-opt-2.7b")
print("-"*30)

# Load processor (handles image + text preprocessing)
processor = Blip2Processor.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    use_fast=False  # Avoid tokenizer serialization bug
)

# Load model
blip2_model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16,  # FP16 for memory efficiency
    device_map="auto"            # Auto device placement
)

print("-"*30)
print("BLIP-2 Model Loaded")
print("-"*30)
print(f"Model: BLIP-2 OPT-2.7B")
print(f"Parameters: 2.7B (frozen)")
print(f"Device: {blip2_model.device}")
print("-"*30)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


------------------------------
Loading BLIP-2 Model
------------------------------
Model: Salesforce/blip2-opt-2.7b
------------------------------


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

------------------------------
BLIP-2 Model Loaded
------------------------------
Model: BLIP-2 OPT-2.7B
Parameters: 2.7B (frozen)
Device: cuda:0
------------------------------


**BLIP-2 VQA Prediction Function**

In [14]:
def blip2_vqa_predict(image, question, model, processor):
    """
    Zero-shot VQA using BLIP-2

    Args:
        image: PIL Image
        question: str
        model: BLIP-2 model
        processor: BLIP-2 processor

    Returns:
        str: Predicted answer
    """
    # Format prompt for VQA
    prompt = f"Question: {question} Answer:"

    # Preprocess inputs
    inputs = processor(
        images=image,
        text=prompt,
        return_tensors="pt"
    ).to(model.device, torch.float16)

    # Generate answer
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=10,   # Short answers for yes/no
        num_beams=1,         # Greedy decoding
        do_sample=False      # Deterministic
    )

    # Decode answer
    generated_text = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True
    )[0].strip()

    return generated_text

print("-"*30)
print("BLIP-2 VQA Function Defined")
print("-"*30)

# Test on one sample
sample_row = test.iloc[0]
sample_image_path = os.path.join(IMAGE_DIR, sample_row['image_name'])
sample_image = Image.open(sample_image_path).convert('RGB')
sample_question = sample_row['question']
sample_answer = sample_row['answer']

test_prediction = blip2_vqa_predict(sample_image, sample_question, blip2_model, processor)

print(f"\nTest prediction:")
print(f"  Question: {sample_question}")
print(f"  Ground Truth: {sample_answer}")
print(f"  BLIP-2 Prediction: {test_prediction}")
print("-"*30)

------------------------------
BLIP-2 VQA Function Defined
------------------------------

Test prediction:
  Question: Are adrenal glands present in this image?
  Ground Truth: no
  BLIP-2 Prediction: Yes, adrenal glands are present in this image
------------------------------


**Evaluate BLIP-2 Zero-Shot on Test Set**

In [16]:
print("-"*30)
print("Evaluating BLIP-2 ZERO-SHOT on Test Set")
print("-"*30)
print(f"Test samples: {len(test)}")
print("-"*30)

blip2_predictions = []
blip2_ground_truth = []
blip2_raw_outputs = []

correct = 0
total = 0

for idx, row in tqdm(test.iterrows(), total=len(test), desc="Evaluating BLIP-2"):
    # Load image
    image_path = os.path.join(IMAGE_DIR, row['image_name'])
    try:
        image = Image.open(image_path).convert('RGB')
    except:
        print(f"Warning: Could not load {row['image_name']}")
        continue

    # Get question and ground truth
    question = row['question']
    ground_truth = row['answer'].lower().strip()

    # Get BLIP-2 prediction
    prediction = blip2_vqa_predict(image, question, blip2_model, processor)
    prediction_lower = prediction.lower().strip()

    # Store results
    blip2_predictions.append(prediction_lower)
    blip2_ground_truth.append(ground_truth)
    blip2_raw_outputs.append(prediction)

    # Check if correct (flexible matching for yes/no)
    is_correct = False
    if ground_truth == 'yes':
        is_correct = 'yes' in prediction_lower
    elif ground_truth == 'no':
        is_correct = 'no' in prediction_lower and 'yes' not in prediction_lower

    if is_correct:
        correct += 1

    total += 1

# Calculate accuracy
blip2_accuracy = (correct / total) * 100

print("-"*30)
print("BLIP-2 ZERO-SHOT Evaluation Complete")
print("-"*30)
print(f"Total samples: {total}")
print(f"Correct predictions: {correct}")
print(f"Accuracy: {blip2_accuracy:.2f}%")
print("-"*30)

------------------------------
Evaluating BLIP-2 ZERO-SHOT on Test Set
------------------------------
Test samples: 195
------------------------------


Evaluating BLIP-2: 100%|██████████| 195/195 [02:00<00:00,  1.62it/s]

------------------------------
BLIP-2 ZERO-SHOT Evaluation Complete
------------------------------
Total samples: 195
Correct predictions: 93
Accuracy: 47.69%
------------------------------





**Caclculate BLIP-2 Detailed Metrics**

In [17]:
from sklearn.metrics import confusion_matrix, classification_report

print("-"*30)
print("Calculating BLIP-2 Metrics")
print("-"*30)

# Convert to binary labels (0=no, 1=yes)
blip2_pred_binary = []
blip2_true_binary = []

for pred, true in zip(blip2_predictions, blip2_ground_truth):
    # Ground truth
    true_label = 1 if true == 'yes' else 0
    blip2_true_binary.append(true_label)

    # Prediction (flexible matching)
    if 'yes' in pred and 'no' not in pred:
        pred_label = 1
    elif 'no' in pred:
        pred_label = 0
    else:
        pred_label = 0  # Default to 'no' if unclear

    blip2_pred_binary.append(pred_label)

# Confusion matrix
cm_blip2 = confusion_matrix(blip2_true_binary, blip2_pred_binary)

print("\nConfusion Matrix - BLIP-2 ZERO-SHOT")
print("-"*30)
print("              Predicted")
print("              No    Yes")
print(f"Actual No     {cm_blip2[0][0]:<5} {cm_blip2[0][1]:<5}")
print(f"Actual Yes    {cm_blip2[1][0]:<5} {cm_blip2[1][1]:<5}")
print("-"*30)

# Classification report
report_blip2 = classification_report(
    blip2_true_binary,
    blip2_pred_binary,
    target_names=['No', 'Yes'],
    output_dict=True
)

print("\nClassification Report")
print("-"*30)
print(f"Class 'No':  Precision={report_blip2['No']['precision']:.2f}, "
      f"Recall={report_blip2['No']['recall']:.2f}, F1={report_blip2['No']['f1-score']:.2f}")
print(f"Class 'Yes': Precision={report_blip2['Yes']['precision']:.2f}, "
      f"Recall={report_blip2['Yes']['recall']:.2f}, F1={report_blip2['Yes']['f1-score']:.2f}")
print(f"\nAccuracy: {report_blip2['accuracy']:.2f}")
print(f"Support No: {int(report_blip2['No']['support'])}, Support Yes: {int(report_blip2['Yes']['support'])}")
print("-"*30)

------------------------------
Calculating BLIP-2 Metrics
------------------------------

Confusion Matrix - BLIP-2 ZERO-SHOT
------------------------------
              Predicted
              No    Yes
Actual No     45    62   
Actual Yes    17    71   
------------------------------

Classification Report
------------------------------
Class 'No':  Precision=0.73, Recall=0.42, F1=0.53
Class 'Yes': Precision=0.53, Recall=0.81, F1=0.64

Accuracy: 0.59
Support No: 107, Support Yes: 88
------------------------------


**Save BLIP-2 Results & Comparison**

In [20]:
blip2_accuracy_corrected = report_blip2['accuracy'] * 100

# Save BLIP-2 results
BLIP2_RESULTS = {
    'model_name': 'BLIP-2 Zero-Shot (OPT-2.7B)',
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),

    'architecture': {
        'vision_encoder': 'EVA-CLIP (ViT-g/14)',
        'language_model': 'OPT-2.7B',
        'total_params': '2.7B',
        'approach': 'zero-shot (no training)'
    },

    'dataset': {
        'test_samples': total
    },

    'test_results': {
        'accuracy': round(blip2_accuracy_corrected, 2),
        'confusion_matrix': {
            'true_no_pred_no': int(cm_blip2[0][0]),
            'true_no_pred_yes': int(cm_blip2[0][1]),
            'true_yes_pred_no': int(cm_blip2[1][0]),
            'true_yes_pred_yes': int(cm_blip2[1][1])
        },
        'classification_report': {
            'no_precision': round(report_blip2['No']['precision'], 2),
            'no_recall': round(report_blip2['No']['recall'], 2),
            'no_f1': round(report_blip2['No']['f1-score'], 2),
            'yes_precision': round(report_blip2['Yes']['precision'], 2),
            'yes_recall': round(report_blip2['Yes']['recall'], 2),
            'yes_f1': round(report_blip2['Yes']['f1-score'], 2),
            'support_no': int(report_blip2['No']['support']),
            'support_yes': int(report_blip2['Yes']['support'])
        }
    }
}

# Save BLIP-2 results
with open(f'{SAVE_DIR}/BLIP2_BASELINE_RESULTS.json', 'w') as f:
    json.dump(BLIP2_RESULTS, f, indent=2)

with open(f'{SAVE_DIR}/blip2_test_predictions.pkl', 'wb') as f:
    pickle.dump({
        'predictions': blip2_predictions,
        'predictions_binary': blip2_pred_binary,
        'ground_truth': blip2_ground_truth,
        'ground_truth_binary': blip2_true_binary,
        'raw_outputs': blip2_raw_outputs
    }, f)

# Load CNN results for comparison
with open(f'{SAVE_DIR}/CNN_BASELINE_RESULTS.json', 'r') as f:
    cnn_results = json.load(f)

print("-"*70)
print("FINAL COMPARISON: CNN vs BLIP-2")
print("-"*70)
print(f"{'Metric':<30} {'CNN Baseline':<20} {'BLIP-2 Zero-Shot':<20}")
print("-"*70)
print(f"{'Approach':<30} {'Trained (908 samples)':<20} {'Zero-shot':<20}")
print(f"{'Parameters':<30} {'15.2M total':<20} {'2.7B frozen':<20}")
print("-"*70)
print(f"{'Test Accuracy':<30} {cnn_results['test_results']['accuracy']:<20.2f} {blip2_accuracy_corrected:<20.2f}")
print(f"{'Difference':<30} {'-':<20} {blip2_accuracy_corrected - cnn_results['test_results']['accuracy']:+.2f}")
print("-"*70)
print(f"{'Precision (No)':<30} {cnn_results['test_results']['classification_report']['no_precision']:<20.2f} {report_blip2['No']['precision']:<20.2f}")
print(f"{'Precision (Yes)':<30} {cnn_results['test_results']['classification_report']['yes_precision']:<20.2f} {report_blip2['Yes']['precision']:<20.2f}")
print("-"*70)
print(f"{'Recall (No)':<30} {cnn_results['test_results']['classification_report']['no_recall']:<20.2f} {report_blip2['No']['recall']:<20.2f}")
print(f"{'Recall (Yes)':<30} {cnn_results['test_results']['classification_report']['yes_recall']:<20.2f} {report_blip2['Yes']['recall']:<20.2f}")
print("-"*70)

# Save comparison
comparison = {
    'cnn_baseline': cnn_results['test_results'],
    'blip2_zero_shot': BLIP2_RESULTS['test_results'],
    'key_findings': {
        'accuracy_difference': round(blip2_accuracy_corrected - cnn_results['test_results']['accuracy'], 2),
        'winner': 'CNN' if cnn_results['test_results']['accuracy'] > blip2_accuracy_corrected else 'BLIP-2',
        'cnn_bias': 'Balanced predictions',
        'blip2_bias': 'Biased toward Yes (81% recall)'
    }
}

with open(f'{SAVE_DIR}/BASELINE_COMPARISON.json', 'w') as f:
    json.dump(comparison, f, indent=2)

print("\nALL RESULTS SAVED:")
print(f"  {SAVE_DIR}BLIP2_BASELINE_RESULTS.json")
print(f"  {SAVE_DIR}BASELINE_COMPARISON.json")
print("-"*30)

----------------------------------------------------------------------
FINAL COMPARISON: CNN vs BLIP-2
----------------------------------------------------------------------
Metric                         CNN Baseline         BLIP-2 Zero-Shot    
----------------------------------------------------------------------
Approach                       Trained (908 samples) Zero-shot           
Parameters                     15.2M total          2.7B frozen         
----------------------------------------------------------------------
Test Accuracy                  72.82                59.49               
Difference                     -                    -13.33
----------------------------------------------------------------------
Precision (No)                 0.75                 0.73                
Precision (Yes)                0.70                 0.53                
----------------------------------------------------------------------
Recall (No)                    0.75         