<a href="https://colab.research.google.com/github/Mr1-Robot/machine-learning-alternative-assessment/blob/main/alternative_assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Alternative Assessment - WOA7015
##### MUAAMAR MOHAMMED ABDULLAH AL-GHRAIRI - 24084470

In [None]:
import random
import numpy as np
import torch
import os

def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (``torch.manual_seed`` plus ``torch.cuda.manual_seed``), stores the seed in
    the ``PYTHONHASHSEED`` environment variable, and configures cuDNN for
    deterministic behaviour with benchmarking turned off.

    Args:
        seed: Integer seed shared by all generators (default 42).
    """
    # Written to the process environment as a string
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Each library keeps its own generator, so each one is seeded explicitly
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    print(f"Random Seed set to {seed}. Results are now reproducible.")


seed_everything(42)

In [None]:
# Mount Google Drive so the dataset stored there can be read from Colab
from google.colab import drive

drive.mount('/content/drive')

# Dataset locations on the mounted drive (DATA_ROOT already ends with a slash,
# so the sub-paths are appended directly)
DATA_ROOT = "/content/drive/MyDrive/alternative-assessment-materials/"
IMAGE_DIR = DATA_ROOT + "VQA_RAD_Images"
DATASET = DATA_ROOT + "VQA_RAD_Dataset.json"

In [None]:
# Core libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Data processing
import pandas as pd
import numpy as np
from PIL import Image
import json
import os

# Utilities
from tqdm import tqdm
import matplotlib.pyplot as plt

# Report the runtime environment and choose the compute device once
_RULE = "-" * 30
_cuda_ready = torch.cuda.is_available()

print(_RULE)
print("SETUP")
print(_RULE)
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA available: {_cuda_ready}")
if _cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if _cuda_ready else torch.device("cpu")
print(f"Using device: {device}")
print(_RULE)

Load & Explore **Dataset**

In [None]:
# Load dataset
vqa_data = pd.read_json(DATASET)
# Normalize 'answer_type': the raw file contains both 'CLOSED' and 'CLOSED ' (trailing space)
vqa_data['answer_type'] = vqa_data['answer_type'].str.strip().str.upper()

print(f"Total Q&A pairs: {len(vqa_data)}")
print(f"Columns: {vqa_data.columns.tolist()}")

# Answer type distribution (share of all Q&A pairs)
answer_types = vqa_data['answer_type'].value_counts()
print("\nAnswer Type Distribution:")
for key, value in answer_types.items():
  print(f"{key}: {value} ({value/len(vqa_data)*100:.1f}%)")

# Closed-ended questions
closed_ended_data = vqa_data[vqa_data['answer_type'] == 'CLOSED'].copy()
print(f"\nClosed-Ended Questions: {len(closed_ended_data)}\n")

# Normalize answers into an extra column 'answer_norm' (lower-cased, stripped)
closed_ended_data['answer_norm'] = closed_ended_data['answer'].str.lower().str.strip()
print(f"Normalized answers: {closed_ended_data['answer_norm']}")

# Closed answers distribution (share of the closed-ended subset)
closed_answers_dist = closed_ended_data['answer_norm'].value_counts()
print(f"\nClosed Answers Distribution: {closed_answers_dist}")
for answer, count in closed_answers_dist.items():
    print(f"{answer}: {count} ({count/len(closed_ended_data)*100:.1f}%)")

# Organ distribution
organ_dist = closed_ended_data['image_organ'].value_counts()
print("\nImage Organ Distribution:")
for organ, count in organ_dist.items():
    # The counts come from the closed-ended subset, so that subset is the
    # denominator; dividing by len(vqa_data) made the shares sum to < 100%.
    print(f"{organ}: {count} ({count/len(closed_ended_data)*100:.1f}%)")

**Constructing Binary (Yes/No) Dataset**

In [None]:
from sklearn.model_selection import train_test_split

_BANNER = "-" * 30
print(_BANNER)
print("Constructing Binary (Yes/No) Dataset")
print(_BANNER)

# Keep only the closed-ended rows whose normalized answer is exactly yes or no
_is_yes_no = closed_ended_data['answer_norm'].isin(['yes', 'no'])
binary_data = closed_ended_data[_is_yes_no].copy()

# Encode the target as a float column: yes -> 1.0, no -> 0.0
binary_data['label'] = binary_data['answer_norm'].eq('yes').astype(float)

_n_binary = len(binary_data)
print(f"Binary Dataset Size: {_n_binary}")

# Label balance, ordered by label value
print("\nLabel Distrubution:")
label_dist = binary_data['label'].value_counts().sort_index()
for label, count in label_dist.items():
  answer = ('no', 'yes')[label == 1]
  print(f"  {label} ({answer}): {count} ({count/_n_binary*100:.1f}%)")

# Organ and question-type breakdowns share one print format
organ_dist = binary_data['image_organ'].value_counts()
question_types = binary_data['question_type'].value_counts()
for _heading, _dist in (
    ("\nOrgan Distribution in Binary Dataset:", organ_dist),
    ("\nQuestion Type Distribution:", question_types),
):
    print(_heading)
    for _name, count in _dist.items():
        print(f"  {_name}: {count} ({count/_n_binary*100:.1f}%)")

# Resolve every image name to a full path and count the ones missing on disk
binary_data['image_path'] = [
    os.path.join(IMAGE_DIR, image_name) for image_name in binary_data['image_name']
]
_on_disk = binary_data['image_path'].map(os.path.exists)
missing_images = binary_data[~_on_disk]
print(f"\nMissing images: {len(missing_images)}")

print(f"\nFinal Binary Dataset: {len(binary_data)} examples")


### **Create PyTorch Dataset**

In [None]:
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image
import os
import re

# Dataset Class
class VQADataset(Dataset):
    """Binary yes/no VQA samples served as tensors.

    Every item is the tuple ``(image, question_indices, question_len, label)``.
    """

    def __init__(self, dataframe, image_dir, vocab, transform=None, max_seq_len=20):
        """Store the split and the encoding settings.

        Args:
            dataframe: DataFrame with 'image_name', 'question' and 'label' columns.
            image_dir: Directory that contains the image files.
            vocab: Mapping from word to integer index, e.g. {'word': idx}.
            transform: Optional callable applied to the loaded PIL image.
            max_seq_len: Fixed question length after truncation/padding (default 20).
        """
        # Positional indexing in __getitem__ relies on a fresh 0..n-1 index
        self.data = dataframe.reset_index(drop=True)
        self.image_dir = image_dir
        self.vocab = vocab
        self.transform = transform
        self.max_seq_len = max_seq_len

    def tokenize(self, text):
        """Return the lower-cased word tokens of ``text`` with punctuation dropped."""
        return re.findall(r'\b\w+\b', str(text).lower())

    def __len__(self):
        """Number of question/answer rows in this split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            Tuple of (image, LongTensor of ``max_seq_len`` token ids,
            LongTensor scalar with the unpadded length, float32 label tensor).
        """
        sample = self.data.iloc[idx]

        # Image: always decoded as 3-channel RGB, then optionally transformed
        image = Image.open(os.path.join(self.image_dir, sample['image_name'])).convert('RGB')
        if self.transform:
            image = self.transform(image)

        # Question: a question with no word tokens (e.g. "?") becomes a single <UNK>
        words = self.tokenize(sample['question']) or ['<UNK>']
        unk_id = self.vocab.get('<UNK>', 1)
        token_ids = [self.vocab.get(word, unk_id) for word in words][:self.max_seq_len]

        # Length is recorded after truncation and before padding
        actual_length = len(token_ids)
        token_ids.extend([self.vocab.get('<PAD>', 0)] * (self.max_seq_len - actual_length))

        return (
            image,
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(actual_length, dtype=torch.long),
            torch.tensor(sample['label'], dtype=torch.float32),
        )


# Image pipelines. Both resize to 224x224 and normalize with the ImageNet
# channel statistics; only the training pipeline adds random augmentation.
_IMAGE_SIZE = (224, 224)
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]

train_transform = transforms.Compose([
    transforms.Resize(_IMAGE_SIZE),
    # Augmentation: small rotations and mild brightness/contrast jitter
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
])

# Validation/test images get no augmentation
val_transform = transforms.Compose([
    transforms.Resize(_IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
])

print("VQADataset defined (with Tokenization & Padding logic)")
print("Transforms defined")

In [None]:
# Group questions by image to see the structure.
# The groups are iterated directly instead of going through groupby().apply():
# apply() with a row-building lambda triggers pandas' grouping-column
# deprecation warning on recent versions, while the resulting dict is the same
# (keys are the image names in sorted order).
image_qa_mapping = {
    image_name: [
        {
            'question': row['question'],
            'answer': row['answer_norm'],
            'label': row['label']
        }
        for _, row in group.iterrows()
    ]
    for image_name, group in binary_data.groupby('image_name')
}

print(f"Total unique images: {len(image_qa_mapping)}")
print(f"Total Q&A pairs: {len(binary_data)}")
print(f"Avg questions per image: {len(binary_data)/len(image_qa_mapping):.2f}")

# Show distribution of questions per image
qa_per_image = binary_data.groupby('image_name').size()
print(f"\nQuestions per image distribution:")
print(f"  Min: {qa_per_image.min()}")
print(f"  Max: {qa_per_image.max()}")
print(f"  Median: {qa_per_image.median():.0f}")
print(f"  Mean: {qa_per_image.mean():.2f}")

# Sample a few images to inspect
print("\n" + "-"*60)
print("SAMPLE IMAGE-QUESTION MAPPINGS")
print("-"*60)

sample_images = list(image_qa_mapping.keys())[:5]
for img_name in sample_images:
    qa_pairs = image_qa_mapping[img_name]
    print(f"\n{img_name}")
    # The organ is read from the first row belonging to this image
    print(f"   Organ: {binary_data[binary_data['image_name']==img_name]['image_organ'].iloc[0]}")
    print(f"   Questions ({len(qa_pairs)}):")
    for i, qa in enumerate(qa_pairs, 1):
        print(f"   {i}. Q: {qa['question']}")
        print(f"      A: {qa['answer']} (label={qa['label']})")

**Train/Val/Test Split (Group-Aware by Image)**

In [None]:
import numpy as np

# Get unique images
unique_images = binary_data['image_name'].unique()
# NOTE(review): the per-image mean label is computed here but never passed to
# train_test_split below, so the image split is random, not stratified.
labels_per_image = binary_data.groupby('image_name')['label'].mean()

# Split IMAGES (80/10/10): 20% is held out first, then halved into val and test
train_imgs, temp_imgs = train_test_split(unique_images, test_size=0.2, random_state=42)
val_imgs, test_imgs = train_test_split(temp_imgs, test_size=0.5, random_state=42)

# Assign DataFrame rows based on the image split, so all questions about one
# image end up in the same split
train_df = binary_data[binary_data['image_name'].isin(train_imgs)].copy()
val_df = binary_data[binary_data['image_name'].isin(val_imgs)].copy()
test_df = binary_data[binary_data['image_name'].isin(test_imgs)].copy()

print("Group-Aware Split (No image leakage):")
print(f"  Train: {len(train_df)} questions ({len(train_imgs)} images)")
print(f"  Val:   {len(val_df)} questions ({len(val_imgs)} images)")
print(f"  Test:  {len(test_df)} questions ({len(test_imgs)} images)")

# Verify no image is shared between ANY two splits (validation included,
# because it drives checkpoint selection)
_train_names, _val_names, _test_names = (
    set(split['image_name']) for split in (train_df, val_df, test_df)
)
assert not (_train_names & _test_names), "FATAL: Train/Test overlap detected!"
assert not (_train_names & _val_names), "FATAL: Train/Val overlap detected!"
assert not (_val_names & _test_names), "FATAL: Val/Test overlap detected!"

# Every question must land in exactly one split
assert len(train_df) + len(val_df) + len(test_df) == len(binary_data), \
    "FATAL: Split sizes do not add up to the binary dataset size!"

**Build Vocabulary**

In [None]:
from collections import Counter
import re

def tokenize(text):
  """Split ``text`` into lower-cased word tokens.

  Tokens are the maximal runs of word characters; punctuation and whitespace
  act as separators and are dropped.
  """
  return re.findall(r'\b\w+\b', text.lower())


# Build vocabulary from training questions only
all_tokens = [token for question in train_df['question'] for token in tokenize(question)]

token_counts = Counter(all_tokens)
print(f"Total tokens in training: {len(all_tokens)}")
print(f"Unique tokens: {len(token_counts)}")

# Special tokens take the first two indices
vocab = {'<PAD>': 0, '<UNK>': 1}

# Tokens seen at least min_freq times get the next free indices, in the order
# they were first encountered
min_freq = 2
_frequent_tokens = [token for token, count in token_counts.items() if count >= min_freq]
vocab.update(zip(_frequent_tokens, range(len(vocab), len(vocab) + len(_frequent_tokens))))

print(f"Vocabulary size (min_freq={min_freq}): {len(vocab)}")
print(f"\nMost common tokens:")
for token, count in token_counts.most_common(20):
    print(f"  '{token}': {count}")

# Reverse mapping: index -> token
idx_to_token = dict(zip(vocab.values(), vocab.keys()))

**Dataset Class**

In [None]:
class VQABinaryDataset(Dataset):
    """Yes/no VQA samples: image, padded question ids, question length, label.

    Args:
        dataframe: DataFrame with 'image_name', 'question' and 'label' columns.
        image_dir: Directory that contains the image files.
        vocab: Word -> index mapping; must contain '<PAD>' and '<UNK>'.
        transform: Optional callable applied to the loaded PIL image.
        max_seq_len: Fixed question length after truncation/padding (default 30).
    """

    def __init__(self, dataframe, image_dir, vocab, transform=None, max_seq_len=30):
        # Positional indexing in __getitem__ relies on a fresh 0..n-1 index
        self.data = dataframe.reset_index(drop=True)
        self.image_dir = image_dir
        self.vocab = vocab
        self.transform = transform
        self.max_seq_len = max_seq_len

    def tokenize(self, text):
        """Lower-case ``text`` and split it into word tokens (punctuation dropped).

        The input is coerced with ``str`` first, matching
        ``VQADataset.tokenize``, so a non-string question value cannot raise
        ``AttributeError`` inside a DataLoader worker.
        """
        return re.findall(r'\b\w+\b', str(text).lower())

    def encode_question(self, question):
        """Turn a question into a fixed-length id tensor.

        Returns:
            Tuple ``(ids, length)``: a LongTensor of ``max_seq_len`` token ids
            and the int number of real (unpadded) tokens, which is at least 1.
        """
        tokens = self.tokenize(question)

        # Handle empty questions (e.g. "?") by inserting UNK, so the length is
        # never zero
        if len(tokens) == 0:
            tokens = ['<UNK>']

        # Out-of-vocabulary words map to <UNK>
        unk_id = self.vocab['<UNK>']
        indices = [self.vocab.get(token, unk_id) for token in tokens]

        # Truncate, then record the length before padding is appended
        indices = indices[:self.max_seq_len]
        actual_length = len(indices)

        # Pad up to max_seq_len
        indices += [self.vocab['<PAD>']] * (self.max_seq_len - actual_length)

        return torch.tensor(indices, dtype=torch.long), actual_length

    def __len__(self):
        """Number of question/answer rows in this split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, question_ids, question_length, label)`` for row ``idx``."""
        row = self.data.iloc[idx]
        image_path = os.path.join(self.image_dir, row['image_name'])

        # The context manager closes the underlying file as soon as the RGB
        # copy has been made, instead of leaving that to garbage collection
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        question_encoded, question_length = self.encode_question(row['question'])
        label = torch.tensor(row['label'], dtype=torch.float32)

        return image, question_encoded, question_length, label

# Build one dataset per split; only the training split uses the augmenting transform
train_dataset, val_dataset, test_dataset = (
    VQABinaryDataset(split_frame, IMAGE_DIR, vocab, transform=split_transform)
    for split_frame, split_transform in (
        (train_df, train_transform),
        (val_df, val_transform),
        (test_df, val_transform),
    )
)

# Matching loaders; only the training loader shuffles
_quick_loader_options = {'batch_size': 32, 'num_workers': 2}
train_loader = DataLoader(train_dataset, shuffle=True, **_quick_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_quick_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **_quick_loader_options)

print("Datasets patched for zero-length sequences.")

**Dataloaders**

In [None]:
# DataLoader configuration
BATCH_SIZE = 32
NUM_WORKERS = 2

# Options shared by all three loaders; pinned memory is requested only when a
# CUDA device is available
_shared_loader_options = {
    'batch_size': BATCH_SIZE,
    'num_workers': NUM_WORKERS,
    'pin_memory': torch.cuda.is_available(),
}

# Only the training loader shuffles
train_loader = DataLoader(train_dataset, shuffle=True, **_shared_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_shared_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **_shared_loader_options)

print("DataLoaders created:")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Train batches: {len(train_loader)}")
print(f"  Val batches:   {len(val_loader)}")
print(f"  Test batches:  {len(test_loader)}")

# Pull one training batch to confirm the tensor shapes
sample_batch = next(iter(train_loader))
images, questions, lengths, labels = sample_batch
print("\nSample batch:")
for _caption, _tensor in (
    ("Images", images),
    ("Questions", questions),
    ("Lengths", lengths),
    ("Labels", labels),
):
    print(f"  {_caption} shape: {_tensor.shape}")

### **CNN Model Architecture (ResNet18 + LSTM)**

In [None]:
import torchvision.models as models
from torchvision.models import ResNet18_Weights

class MultimodalVQAModel(nn.Module):
    """Late-fusion yes/no VQA model: ResNet-18 image features + BiLSTM question features.

    Args:
        vocab_size: Number of rows in the question embedding table.
        embedding_dim: Size of each word embedding (default 300).
        lstm_hidden_dim: Hidden size per LSTM direction (default 64).
        dropout: Dropout probability inside the classifier head (default 0.5).
    """

    def __init__(self, vocab_size, embedding_dim=300, lstm_hidden_dim=64, dropout=0.5):
        super().__init__()

        # Vision branch: ImageNet-pretrained ResNet-18 without its final child
        # module; every parameter starts frozen
        backbone = models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
        backbone_layers = list(backbone.children())[:-1]
        self.vision_encoder = nn.Sequential(*backbone_layers)
        for weight in self.vision_encoder.parameters():
            weight.requires_grad = False

        # Text branch: embeddings (index 0 is padding) feeding a bidirectional LSTM
        self.question_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.question_lstm = nn.LSTM(
            embedding_dim,
            lstm_hidden_dim,
            batch_first=True,
            bidirectional=True,
        )

        # Fusion width: 512 vision features + one hidden state per LSTM direction
        self.fusion_dim = 512 + (lstm_hidden_dim * 2)

        # Classifier head; the last layer emits a raw logit (no sigmoid)
        self.classifier = nn.Sequential(
            nn.Linear(self.fusion_dim, 32),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(32, 1),
        )

    def forward(self, images, questions, question_lengths):
        """Compute one logit per sample.

        Args:
            images: Image batch passed to the vision encoder.
            questions: Padded token-id batch (batch first).
            question_lengths: Unpadded length of each question; moved to CPU
                before packing.

        Returns:
            Tensor of raw logits with the trailing singleton dimension removed.
        """
        n_samples = images.size(0)

        # Vision: flatten the encoder output to (batch, features)
        encoded_images = self.vision_encoder(images)
        vision_features = encoded_images.view(n_samples, -1)

        # Language: pack the padded sequences so the LSTM skips padding steps
        embedded = self.question_embedding(questions)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded,
            question_lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )
        _, (hidden, _cell) = self.question_lstm(packed)
        # Concatenate the final hidden states of the two directions
        first_direction, second_direction = hidden[0], hidden[1]
        question_features = torch.cat([first_direction, second_direction], dim=1)

        # Fusion followed by the classifier head
        fused_features = torch.cat([vision_features, question_features], dim=1)
        logits = self.classifier(fused_features)
        return logits.squeeze(1)

# Build the model with the small configuration and move it to the chosen device
_model_config = dict(
    vocab_size=len(vocab),
    embedding_dim=300,
    lstm_hidden_dim=64,
    dropout=0.5,
)
model = MultimodalVQAModel(**_model_config).to(device)

print("Model re-initialized.")

# Smoke test: a single forward pass on one training batch, in eval mode and
# without gradient tracking
model.eval()
with torch.no_grad():
    _smoke_batch = next(iter(train_loader))
    test_images, test_questions, test_lengths, test_labels = _smoke_batch
    test_images = test_images.to(device)
    test_questions = test_questions.to(device)

    # The lengths are passed as loaded; the model moves them to CPU itself
    test_outputs = model(test_images, test_questions, test_lengths)
    print(f"\nTest forward pass successful!")
    print(f"Output shape: {test_outputs.shape}")

In [None]:
import torch.optim as optim

# Calculate Class Weights
# pos_weight = (#negatives / #positives) in the training split; it scales the
# loss contribution of the positive ("yes") examples.
num_pos = train_df['label'].sum()
num_neg = len(train_df) - num_pos
if num_pos == 0:
    # Without this guard the division yields inf and every loss value is unusable
    raise ValueError("Training split contains no positive ('yes') examples; cannot compute pos_weight.")

# Explicit float32 so the weight matches the dtype of the logits and labels,
# rather than relying on how a NumPy scalar is inferred
pos_weight = torch.tensor([num_neg / num_pos], dtype=torch.float32, device=device)

print(f"Class Balance: Yes(1)={num_pos}, No(0)={num_neg}")
print(f"Pos Weight: {pos_weight.item():.2f}")

# Loss & Optimizer
# BCEWithLogitsLoss takes raw logits (the model has no final sigmoid) and
# accepts pos_weight directly
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

optimizer = optim.Adam(model.parameters(), lr=0.001)

# Metrics Helper
def calculate_accuracy(preds, labels):
    """Fraction of samples whose thresholded prediction equals the label.

    Args:
        preds: Raw logits from the model; a sigmoid is applied here.
        labels: Ground-truth labels (0.0 or 1.0).

    Returns:
        A scalar tensor: correct predictions divided by ``len(labels)``.
    """
    # Logit -> probability -> hard 0/1 decision at a 0.5 threshold
    hard_predictions = torch.sigmoid(preds).gt(0.5).float()
    n_correct = hard_predictions.eq(labels).float().sum()
    return n_correct / len(labels)


print("Training setup complete.")

**Training Configuration**

In [None]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score
import time

# CONFIGURATION
# Two-phase schedule: the vision backbone stays frozen for WARMUP_EPOCHS epochs,
# then it is unfrozen and trained with its own, smaller learning rate.
NUM_EPOCHS = 20
WARMUP_EPOCHS = 3  # Freeze backbone for first 3 epochs
LEARNING_RATE_HEAD = 1e-3
LEARNING_RATE_BACKBONE = 5e-5

# Re-initialize Model fresh
# (MultimodalVQAModel.__init__ sets requires_grad=False on the vision encoder)
model = MultimodalVQAModel(
    vocab_size=len(vocab),
    embedding_dim=300,
    lstm_hidden_dim=64,
    dropout=0.5
).to(device)

# Initial Optimizer (Only training the head/LSTM params)
# We filter for parameters that require grad
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE_HEAD)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

print(f"Training Strategy: {WARMUP_EPOCHS} Warmup Epochs -> Fine-Tuning")

# MAIN LOOP
# history collects one training-loss value and one validation-accuracy value per epoch
history = {'train_loss': [], 'val_acc': []}
best_val_acc = 0.0

for epoch in range(NUM_EPOCHS):

    # AUTOMATIC UNFREEZING LOGIC
    # Runs exactly once, at the start of the first epoch after warm-up
    if epoch == WARMUP_EPOCHS:
        print("\nUNFREEZING BACKBONE (Fine-Tuning Mode Started)")

        # Unfreeze Vision Encoder
        for param in model.vision_encoder.parameters():
            param.requires_grad = True

        # Update Optimizer to include ALL parameters
        #    Use different learning rates: small for ResNet, normal for Head
        # A brand-new Adam instance is created here, so it does not carry over
        # the state of the warm-up optimizer.
        optimizer = optim.Adam([
            {'params': model.vision_encoder.parameters(), 'lr': LEARNING_RATE_BACKBONE},
            {'params': model.classifier.parameters(), 'lr': LEARNING_RATE_HEAD},
            {'params': model.question_lstm.parameters(), 'lr': LEARNING_RATE_HEAD},
            {'params': model.question_embedding.parameters(), 'lr': LEARNING_RATE_HEAD}
        ])

    # TRAIN
    # NOTE(review): model.train() also switches the frozen vision encoder to
    # training mode; if it contains normalization layers with running
    # statistics, those would still update during warm-up — confirm this is intended.
    model.train()
    running_loss = 0.0
    for images, questions, lengths, labels in train_loader:
        # lengths is not moved to the device; the model calls .cpu() on it before packing
        images, questions, labels = images.to(device), questions.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images, questions, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Weight the batch loss by the batch size; dividing by the dataset size
        # below turns the sum into a per-sample average
        running_loss += loss.item() * images.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)

    # VAL
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for images, questions, lengths, labels in val_loader:
            images, questions, labels = images.to(device), questions.to(device), labels.to(device)
            outputs = model(images, questions, lengths)
            # Logits -> probabilities -> hard 0/1 predictions at a 0.5 threshold
            preds = (torch.sigmoid(outputs) > 0.5).float()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_acc = accuracy_score(all_labels, all_preds)

    # LOGGING
    history['train_loss'].append(epoch_loss)
    history['val_acc'].append(val_acc)

    print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Loss: {epoch_loss:.4f} | Val Acc: {val_acc:.4f}")

    # Checkpoint only on a strict improvement of validation accuracy
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model.pth")

print(f"\nBest Accuracy: {best_val_acc:.4f}")

**Plotting the Curves**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report
import pandas as pd
import numpy as np

# COMPREHENSIVE VISUALIZATION

print("-" * 40)
print("GENERATING FINAL REPORT CHARTS")
print("-" * 40)

# Setup Data
# Report test metrics for the checkpoint that scored best on validation, not
# for whatever weights the final training epoch left in memory. The training
# loop saves that checkpoint as "best_model.pth"; if the file is absent the
# in-memory weights are used and a warning is printed.
BEST_CHECKPOINT_PATH = "best_model.pth"
if os.path.exists(BEST_CHECKPOINT_PATH):
    model.load_state_dict(torch.load(BEST_CHECKPOINT_PATH, map_location=device))
    print(f"Loaded best validation checkpoint: {BEST_CHECKPOINT_PATH}")
else:
    print(f"Warning: {BEST_CHECKPOINT_PATH} not found; evaluating the current in-memory weights.")

# Collect labels, probabilities and hard predictions over the whole test set
model.eval()
y_true = []
y_scores = []
y_preds = []

with torch.no_grad():
    for images, questions, lengths, labels in test_loader:
        images, questions = images.to(device), questions.to(device)

        # Forward: the model returns logits, sigmoid turns them into P(yes)
        outputs = model(images, questions, lengths)
        probs = torch.sigmoid(outputs)

        y_scores.extend(probs.cpu().numpy())
        y_preds.extend((probs > 0.5).float().cpu().numpy())
        y_true.extend(labels.cpu().numpy())

y_true = np.array(y_true)
y_scores = np.array(y_scores)
y_preds = np.array(y_preds)

# Create Canvas (2x2 Grid)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
plt.subplots_adjust(hspace=0.3, wspace=0.3)

# Learning Curves (Top Left)
# Only the series actually present in `history` are drawn
ax1 = axes[0, 0]
if 'train_loss' in history:
    ax1.plot(history['train_loss'], label='Train Loss', color='blue', linewidth=2)
if 'val_loss' in history:
    ax1.plot(history['val_loss'], label='Val Loss', color='orange', linestyle='--', linewidth=2)

# Gather legend entries from the loss axis first ...
legend_handles, legend_labels = ax1.get_legend_handles_labels()
if 'val_acc' in history:
    ax1_twin = ax1.twinx()
    ax1_twin.plot(history['val_acc'], label='Val Acc', color='green', linestyle=':', linewidth=2)
    ax1_twin.set_ylabel('Accuracy', color='green')
    # ... then add the twin-axis line: ax1.legend() on its own only sees
    # artists that belong to ax1, so 'Val Acc' would be missing from the legend
    twin_handles, twin_labels = ax1_twin.get_legend_handles_labels()
    legend_handles = legend_handles + twin_handles
    legend_labels = legend_labels + twin_labels

ax1.set_title('Training Dynamics', fontsize=14, fontweight='bold')
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Loss')
ax1.legend(legend_handles, legend_labels, loc='upper left')
ax1.grid(True, alpha=0.3)


# Confusion Matrix (Top Right)
ax2 = axes[0, 1]
cm = confusion_matrix(y_true, y_preds)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'],
            ax=ax2, annot_kws={"size": 16})
ax2.set_title('Confusion Matrix', fontsize=14, fontweight='bold')
ax2.set_xlabel('Predicted Label')
ax2.set_ylabel('True Label')


# ROC Curve (Bottom Left)
ax3 = axes[1, 0]
fpr, tpr, _ = roc_curve(y_true, y_scores)
roc_auc = auc(fpr, tpr)

ax3.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line
ax3.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax3.set_xlim([0.0, 1.0])
ax3.set_ylim([0.0, 1.05])
ax3.set_xlabel('False Positive Rate')
ax3.set_ylabel('True Positive Rate')
ax3.set_title('ROC Curve', fontsize=14, fontweight='bold')
ax3.legend(loc="lower right")
ax3.grid(True, alpha=0.3)


# Detailed Metrics (Bottom Right)
ax4 = axes[1, 1]
report = classification_report(y_true, y_preds, target_names=['No', 'Yes'], output_dict=True)
metrics_df = pd.DataFrame(report).transpose().iloc[:2] # Get only Yes/No rows

# Plot Bar Chart
metrics_df[['precision', 'recall', 'f1-score']].plot(kind='bar', ax=ax4, color=['#a8dab5', '#4dbbd5', '#e64b35'])
ax4.set_title('Per-Class Performance', fontsize=14, fontweight='bold')
ax4.set_ylim(0, 1.0)
ax4.legend(loc='lower center', ncol=3)
ax4.grid(axis='y', alpha=0.3)
ax4.tick_params(axis='x', rotation=0)

# Show final plot
plt.show()

# Print Numeric Summary
print("\n" + "-"*30)
print("FINAL NUMERIC RESULTS")
print("-"*30)
print(classification_report(y_true, y_preds, target_names=['No', 'Yes']))
print(f"Global Test Accuracy: {np.mean(y_true == y_preds)*100:.2f}%")
print(f"AUC Score: {roc_auc:.4f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report

# Ensure Drive folder exists
DRIVE_FOLDER = "/content/drive/MyDrive/VQA_Assignment_Results"
os.makedirs(DRIVE_FOLDER, exist_ok=True)
print(f"Saving individual charts to: {DRIVE_FOLDER}")

# SAVE QUANTITATIVE CHARTS INDIVIDUALLY
# Each chart is drawn on its own figure, written to DRIVE_FOLDER at 300 dpi and
# closed again so no figure is left open.

# Learning Curves
plt.figure(figsize=(8, 6))
if 'train_loss' in history:
    plt.plot(history['train_loss'], label='Train Loss', color='blue', linewidth=2)
if 'val_loss' in history:
    plt.plot(history['val_loss'], label='Val Loss', color='orange', linestyle='--', linewidth=2)
plt.title('Training Loss Curve')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(DRIVE_FOLDER, "chart_1_learning_curve.png"), dpi=300)
plt.close()
print("  Saved chart_1_learning_curve.png")

# Confusion Matrix
plt.figure(figsize=(6, 5))
cm = confusion_matrix(y_true, y_preds)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'], annot_kws={"size": 14})
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig(os.path.join(DRIVE_FOLDER, "chart_2_confusion_matrix.png"), dpi=300)
plt.close()
print("  Saved chart_2_confusion_matrix.png")

# ROC Curve
plt.figure(figsize=(8, 6))
fpr, tpr, _ = roc_curve(y_true, y_scores)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(DRIVE_FOLDER, "chart_3_roc_curve.png"), dpi=300)
plt.close()
print("  Saved chart_3_roc_curve.png")

# Metrics Bar Chart
# DataFrame.plot() opens its OWN figure unless it is given an axis. Passing
# `ax` keeps the bars on the 8x6 figure created here, so the saved image has
# the intended size and no empty figure is left behind unclosed.
metrics_fig, metrics_ax = plt.subplots(figsize=(8, 6))
report = classification_report(y_true, y_preds, target_names=['No', 'Yes'], output_dict=True)
metrics_df = pd.DataFrame(report).transpose().iloc[:2]  # keep only the 'No' / 'Yes' rows
metrics_df[['precision', 'recall', 'f1-score']].plot(
    kind='bar', ax=metrics_ax, color=['#a8dab5', '#4dbbd5', '#e64b35']
)
metrics_ax.set_title('Precision, Recall & F1 Score')
metrics_ax.set_ylim(0, 1.0)
metrics_ax.legend(loc='lower center', ncol=3)
metrics_ax.grid(axis='y', alpha=0.3)
metrics_ax.tick_params(axis='x', rotation=0)
metrics_fig.savefig(os.path.join(DRIVE_FOLDER, "chart_4_metrics_bar.png"), dpi=300)
plt.close(metrics_fig)
print("  Saved chart_4_metrics_bar.png")

# QUALITATIVE ANALYSIS (EXAMPLES)

def get_text_from_tensor(question_tensor, vocab_map=None):
    """Decode a padded tensor of token ids back into a space-separated string.

    Args:
        question_tensor: 1-D tensor of token ids. Decoding stops at the first
            0, which is treated as padding.
        vocab_map: Optional word -> index dictionary. When omitted, the
            notebook-level ``vocab`` is used.

    Returns:
        The decoded words joined by single spaces. Ids that are not present in
        the vocabulary are skipped.
    """
    word_to_idx = vocab if vocab_map is None else vocab_map

    # Invert the vocabulary once instead of scanning it for every token
    idx_to_word = {}
    for word, vocab_idx in word_to_idx.items():
        # setdefault keeps the first word should an index ever be duplicated
        idx_to_word.setdefault(vocab_idx, word)

    words = []
    for idx in question_tensor.tolist():
        if idx == 0:  # Padding
            break
        word = idx_to_word.get(idx)
        if word is not None:
            words.append(word)
    return " ".join(words)

# Run the model on the first test batch
model.eval()
_first_test_batch = next(iter(test_loader))
images, questions, lengths, labels = _first_test_batch
images = images.to(device)
questions = questions.to(device)
labels = labels.to(device)

with torch.no_grad():
    outputs = model(images, questions, lengths)
    # Hard 0/1 decisions at a 0.5 probability threshold
    preds = torch.sigmoid(outputs).gt(0.5).float()

# Positions inside the batch where the prediction matches / misses the label
_is_correct = preds == labels
correct_indices = _is_correct.nonzero(as_tuple=True)[0].cpu().numpy()
incorrect_indices = (preds != labels).nonzero(as_tuple=True)[0].cpu().numpy()

# Plotting Helper
def plot_examples(indices, title, filename, color):
    """Save a 1x3 panel of example predictions from the current batch.

    Reads the notebook-level ``images``, ``questions``, ``preds`` and
    ``labels`` and writes the figure into ``DRIVE_FOLDER``.

    Args:
        indices: Batch positions to choose from; only the first three are drawn.
        title: Figure title, also used in the "not enough" message.
        filename: Output file name inside ``DRIVE_FOLDER``.
        color: Colour of each panel's caption.
    """
    # Nothing is drawn unless a full row of three examples is available
    if len(indices) < 3:
        print(f"Not enough {title} examples to plot.")
        return

    channel_std = [0.229, 0.224, 0.225]
    channel_mean = [0.485, 0.456, 0.406]

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    for ax, idx in zip(axes, indices[:3]):
        # Channels-last layout for imshow, then undo the ImageNet normalization
        pixels = images[idx].cpu().permute(1, 2, 0).numpy()
        pixels = np.clip(pixels * channel_std + channel_mean, 0, 1)

        ax.imshow(pixels)
        ax.axis('off')

        # Caption: decoded question plus true and predicted answer
        q_text = get_text_from_tensor(questions[idx])
        true_lbl = "Yes" if labels[idx] == 1 else "No"
        pred_lbl = "Yes" if preds[idx] == 1 else "No"

        ax.set_title(f"Q: {q_text}?\nTrue: {true_lbl} | Pred: {pred_lbl}",
                     color=color, fontsize=11, fontweight='bold')

    plt.suptitle(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(os.path.join(DRIVE_FOLDER, filename), dpi=300)
    plt.close()
    print(f"  Saved {filename}")


for _example_indices, _panel_title, _panel_file, _panel_color in (
    (correct_indices, "Correct Predictions (Success Cases)", "chart_5_success_cases.png", "green"),
    (incorrect_indices, "Incorrect Predictions (Failure Analysis)", "chart_6_failure_cases.png", "red"),
):
    plot_examples(_example_indices, _panel_title, _panel_file, _panel_color)

## **BLIP Model Setup**

In [None]:

# BLIP MODEL SETUP (Generative VQA)

print("Installing Hugging Face Transformers...")
!pip install -q transformers

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image

# Load BLIP Processor & Model
# We use the 'base' model fine-tuned for VQA
MODEL_ID = "Salesforce/blip-vqa-base"
processor = BlipProcessor.from_pretrained(MODEL_ID)
model = BlipForQuestionAnswering.from_pretrained(MODEL_ID).to(device)

print(f"Loaded: {MODEL_ID}")

# Define the BLIP Dataset
class VQABlipDataset(Dataset):
    """Question/answer rows encoded by a BLIP processor.

    Args:
        df: DataFrame with 'image_name', 'question' and 'answer' columns.
        processor: Callable processor used to encode the image/question pair
            and the answer text.
        image_dir: Directory that contains the image files.
    """

    def __init__(self, df, processor, image_dir):
        self.df, self.processor, self.image_dir = df, processor, image_dir

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Encode row ``idx`` into a dict with pixel_values, input_ids and labels."""
        record = self.df.iloc[idx]

        # Image: always decoded as 3-channel RGB
        image_file = "{}/{}".format(self.image_dir, record['image_name'])
        image = Image.open(image_file).convert('RGB')

        # Inputs: image and question are encoded together
        encoded_inputs = self.processor(
            images=image,
            text=record['question'],
            return_tensors="pt",
        )

        # Targets: the answer text itself (the actual word "yes" or "no") is tokenized
        encoded_answer = self.processor(
            text=record['answer'],
            return_tensors="pt",
        )

        # squeeze() removes the size-1 dimensions introduced by return_tensors="pt"
        return {
            "pixel_values": encoded_inputs.pixel_values.squeeze(),
            "input_ids": encoded_inputs.input_ids.squeeze(),
            "labels": encoded_answer.input_ids.squeeze(),
        }


print("VQABlipDataset class defined.")

In [None]:
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

# PHASE 3: BLIP TRAINING

# Custom Collate Function (Handles text padding)
def blip_collate_fn(batch, pad_token_id=None):
    """Collate un-padded dataset items into one padded batch.

    Args:
        batch: List of dicts, each with ``pixel_values``, ``input_ids`` and
            ``labels`` tensors (the sequences are 1-D and may differ in length).
        pad_token_id: Id used to pad the questions. Defaults to the pad token
            of the module-level ``processor``'s tokenizer.

    Returns:
        Dict with ``pixel_values`` (stacked), ``input_ids`` (right-padded),
        ``attention_mask`` (1 for real question tokens, 0 for padding) and
        ``labels`` (right-padded with -100).
    """
    if pad_token_id is None:
        pad_token_id = processor.tokenizer.pad_token_id

    pixel_values = torch.stack([item['pixel_values'] for item in batch])

    # Pad questions to the longest in the batch
    question_ids = [item['input_ids'] for item in batch]
    input_ids = pad_sequence(question_ids, batch_first=True, padding_value=pad_token_id)

    # Build the mask from the true lengths (not from `input_ids != pad`), so the
    # model can ignore padded positions instead of attending to them.
    attention_mask = pad_sequence(
        [torch.ones_like(ids) for ids in question_ids],
        batch_first=True,
        padding_value=0,
    )

    # Pad answers (labels) - use -100 to ignore padding in loss calculation
    # NOTE(review): some transformers versions reuse `labels` as the decoder
    # input ids; -100 is not a valid token id there, so this only stays safe
    # while all answers in a batch tokenize to the same length - confirm.
    labels = [item['labels'] for item in batch]
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)

    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

# Datasets: one BLIP-specific dataset per split, all sharing the same
# processor and image directory.
train_ds_blip, val_ds_blip, test_ds_blip = (
    VQABlipDataset(split_df, processor, IMAGE_DIR)
    for split_df in (train_df, val_df, test_df)
)

# DataLoaders: identical batch size and collate function for every split;
# only the training split is shuffled.
_blip_loader_kwargs = dict(batch_size=8, collate_fn=blip_collate_fn)
train_loader_blip = DataLoader(train_ds_blip, shuffle=True, **_blip_loader_kwargs)
val_loader_blip = DataLoader(val_ds_blip, shuffle=False, **_blip_loader_kwargs)
test_loader_blip = DataLoader(test_ds_blip, shuffle=False, **_blip_loader_kwargs)

# Optimizer: AdamW over all BLIP parameters.
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# Evaluation Helper (Generative)
def evaluate_blip(model, dataloader):
    """Compute exact-match accuracy of generated answers over a dataloader.

    The model is switched to eval mode, an answer is generated for every
    question, and a prediction counts as correct when it equals the decoded
    ground-truth answer after lower-casing and stripping whitespace.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        dataloader: Iterable of batches with ``pixel_values``, ``input_ids``,
            ``labels`` and optionally ``attention_mask``.

    Returns:
        Fraction of exactly matching answers, or 0.0 when the dataloader
        yields no samples (instead of raising ZeroDivisionError).
    """
    model.eval()
    correct = 0
    total = 0

    print("  Validating (Generating answers)...")
    with torch.no_grad():
        for batch in dataloader:
            pixel_values = batch['pixel_values'].to(device)
            input_ids = batch['input_ids'].to(device)

            # Mask out question padding; if the collate function did not
            # provide a mask, derive it from the pad token id.
            attention_mask = batch.get('attention_mask')
            if attention_mask is None:
                attention_mask = (input_ids != processor.tokenizer.pad_token_id).long()
            attention_mask = attention_mask.to(device)

            # Use BLIP to generate text answers
            generated_ids = model.generate(
                pixel_values=pixel_values,
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=10,
            )
            generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

            # Get Ground Truth (we need to decode the labels back to text)
            label_ids = batch['labels'].cpu().numpy()
            ground_truth_texts = []
            for ids in label_ids:
                # Filter out -100 (label padding, not a real token id)
                ids = ids[ids != -100]
                text = processor.decode(ids, skip_special_tokens=True)
                ground_truth_texts.append(text)

            # Compare
            for pred, true in zip(generated_texts, ground_truth_texts):
                # Simple normalization (lowercase, strip)
                if pred.lower().strip() == true.lower().strip():
                    correct += 1
                total += 1

    # An empty dataloader has no defined accuracy; report 0.0 rather than crash.
    if total == 0:
        return 0.0
    return correct / total

# Main Training Loop
# Fine-tunes BLIP for NUM_EPOCHS, validates after every epoch with
# evaluate_blip, and keeps the weights of the best-validating epoch on disk.
NUM_EPOCHS = 10
best_acc = 0.0

print("\n" + "-"*40)
print("STARTING BLIP FINE-TUNING")
print("-"*40)

# Per-epoch mean training loss and validation accuracy (used for plotting later)
blip_history = {'loss': [], 'val_acc': []}

for epoch in range(NUM_EPOCHS):
    # evaluate_blip() leaves the model in eval mode, so re-enable training mode
    model.train()
    running_loss = 0.0

    # Train
    for batch in tqdm(train_loader_blip, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}"):
        pixel_values = batch['pixel_values'].to(device)
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        # Mask out question padding; if the collate function did not provide
        # a mask, derive it from the pad token id.
        attention_mask = batch.get('attention_mask')
        if attention_mask is None:
            attention_mask = (input_ids != processor.tokenizer.pad_token_id).long()
        attention_mask = attention_mask.to(device)

        optimizer.zero_grad()

        # Forward (BLIP computes loss automatically if labels are provided)
        outputs = model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader_blip)

    # Validate
    val_acc = evaluate_blip(model, val_loader_blip)

    blip_history['loss'].append(avg_loss)
    blip_history['val_acc'].append(val_acc)

    print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f} | Val Acc = {val_acc*100:.2f}%")

    # Save Best. The first epoch is always saved so that a checkpoint exists
    # even if validation accuracy never rises above 0.0; later cells load
    # "best_blip_model.pth" unconditionally.
    if epoch == 0 or val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), "best_blip_model.pth")
        print("  New Best BLIP Model Saved!")

print(f"\nTraining Complete. Best Accuracy: {best_acc*100:.2f}%")

In [None]:
# INSPECT BLIP PREDICTIONS
# Prints prediction vs. ground truth for the first validation batch.

model.eval()
print("Checking specific examples from Validation Set...\n")

# Get a small batch
batch = next(iter(val_loader_blip))
pixel_values = batch['pixel_values'].to(device)
input_ids = batch['input_ids'].to(device)
labels = batch['labels']

# Mask out question padding so padded positions are not attended to; derive
# the mask from the pad token id if the collate function did not provide one.
attention_mask = batch.get('attention_mask')
if attention_mask is None:
    attention_mask = (input_ids != processor.tokenizer.pad_token_id).long()
attention_mask = attention_mask.to(device)

# Generate
with torch.no_grad():
    generated_ids = model.generate(
        pixel_values=pixel_values,
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=5,
    )
    preds = processor.batch_decode(generated_ids, skip_special_tokens=True)

# Decode Truth
ground_truths = []
for ids in labels:
    ids = ids[ids != -100] # Remove padding
    ground_truths.append(processor.decode(ids, skip_special_tokens=True))

# Print Comparison
print(f"{'PREDICTION':<20} | {'TRUTH':<20} | {'STATUS'}")
print("-" * 60)

for p, t in zip(preds, ground_truths):
    # Compare case- and whitespace-insensitively, but print the raw strings
    p_clean = p.strip().lower()
    t_clean = t.strip().lower()
    status = "Correct" if p_clean == t_clean else "Wrong"
    print(f"{p:<20} | {t:<20} | {status}")

In [None]:
# FINAL EVALUATION: BLIP MODEL
# Reloads the best checkpoint, generates answers for the test set, maps every
# answer to a binary yes(1)/no(0) label and reports accuracy, a classification
# report and a confusion matrix.
import re


def _answer_to_binary(text):
    """Return 1 if `text` contains "yes" as a whole word (case-insensitive), else 0.

    A plain substring test would also fire on words such as "eyes"; the word
    boundary match avoids that. Anything that is not a "yes" defaults to 0.
    """
    return 1 if re.search(r"\byes\b", text.lower()) else 0


# Load the best weights from training (map_location keeps this working when the
# checkpoint was written on a different device than the current one)
model.load_state_dict(torch.load("best_blip_model.pth", map_location=device))
model.eval()

print("Running Final Inference on TEST SET...")

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader_blip):
        pixel_values = batch['pixel_values'].to(device)
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels']

        # Mask out question padding; derive the mask from the pad token id if
        # the collate function did not provide one.
        attention_mask = batch.get('attention_mask')
        if attention_mask is None:
            attention_mask = (input_ids != processor.tokenizer.pad_token_id).long()
        attention_mask = attention_mask.to(device)

        # Generate
        generated_ids = model.generate(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=5,
        )
        preds_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

        # Store Preds
        # Convert text "yes"/"no" back to 1/0 for metrics (default to No if unsure)
        all_preds.extend(_answer_to_binary(p) for p in preds_text)

        # Store Truth
        for ids in labels:
            ids = ids[ids != -100]  # drop label padding before decoding
            truth_text = processor.decode(ids, skip_special_tokens=True)
            all_labels.append(_answer_to_binary(truth_text))

# Calculate Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

blip_acc = accuracy_score(all_labels, all_preds)
print(f"\nFINAL BLIP TEST ACCURACY: {blip_acc*100:.2f}%")
print("(Compare this to CNN: 56.93%)")

print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=['No', 'Yes']))

# Plot Confusion Matrix
cm = confusion_matrix(all_labels, all_preds)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.title('Confusion Matrix (BLIP Transformer)')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import shutil

# Define Results
from sklearn.metrics import accuracy_score, f1_score

models = ['CNN Baseline', 'BLIP (VLP)']
# CNN figures are transcribed from the CNN classification report. BLIP figures
# are computed from this run's test predictions (all_labels / all_preds) so the
# chart cannot drift out of sync with the evaluation above.
accuracies = [56.93, round(float(accuracy_score(all_labels, all_preds)) * 100, 2)]
f1_scores_yes = [0.56, round(float(f1_score(all_labels, all_preds, pos_label=1)), 2)]

# Plot Side-by-Side Comparison
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

# Accuracy Chart
bars1 = ax[0].bar(models, accuracies, color=['#95a5a6', '#2ecc71'], width=0.5)
ax[0].set_ylim(0, 100)
ax[0].set_ylabel('Accuracy (%)')
ax[0].set_title('Test Set Accuracy Comparison', fontsize=14, fontweight='bold')
ax[0].grid(axis='y', alpha=0.3)

# Add numbers on top (fixed two decimals so computed values render cleanly)
for bar in bars1:
    yval = bar.get_height()
    ax[0].text(bar.get_x() + bar.get_width()/2, yval + 1, f"{yval:.2f}%", ha='center', fontweight='bold')

# F1-Score Chart (The "Medical" Metric)
bars2 = ax[1].bar(models, f1_scores_yes, color=['#95a5a6', '#3498db'], width=0.5)
ax[1].set_ylim(0, 1.0)
ax[1].set_ylabel('F1-Score (Class: Yes)')
ax[1].set_title('Ability to Detect Pathology ("Yes" Class)', fontsize=14, fontweight='bold')
ax[1].grid(axis='y', alpha=0.3)

for bar in bars2:
    yval = bar.get_height()
    ax[1].text(bar.get_x() + bar.get_width()/2, yval + 0.02, f"{yval:.2f}", ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

# Save to Drive (exist_ok avoids the separate exists() check)
DRIVE_FOLDER = "/content/drive/MyDrive/VQA_Assignment_Results"
os.makedirs(DRIVE_FOLDER, exist_ok=True)

# Save Plot
plot_path = os.path.join(DRIVE_FOLDER, "final_comparison_chart.png")
fig.savefig(plot_path, dpi=300)
print(f"Saved Comparison Chart to: {plot_path}")

# Save BLIP Confusion Matrix (Regenerating to save)
import seaborn as sns
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(all_labels, all_preds)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'], annot_kws={"size": 14})
plt.title('Confusion Matrix (BLIP)')
plt.savefig(os.path.join(DRIVE_FOLDER, "blip_confusion_matrix.png"), dpi=300)
# Close without showing: this figure is only written to disk
plt.close()
print("Saved BLIP Confusion Matrix")

# Save Best BLIP Model
print("Backing up BLIP model weights (this might take 30s)...")
shutil.copy("best_blip_model.pth", os.path.join(DRIVE_FOLDER, "blip_best_model.pth"))
print("Saved BLIP Model Weights to Drive")

print("\nASSIGNMENT COMPLETE. You have all models, charts, and metrics secured.")

**Visualizations**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from sklearn.metrics import confusion_matrix, classification_report

# Setup Drive Path
DRIVE_FOLDER = "/content/drive/MyDrive/VQA_Assignment_Results"
os.makedirs(DRIVE_FOLDER, exist_ok=True)
print(f"Saving BLIP individual charts to: {DRIVE_FOLDER}")

# BLIP LEARNING CURVES (Fixed Legend)
fig, ax1 = plt.subplots(figsize=(10, 6))

# Number epochs from 1 so the x-axis matches the "Epoch N" training log
epoch_numbers = range(1, len(blip_history['loss']) + 1)

# Loss (Left Axis)
#  plot explicitly on ax1
line1 = ax1.plot(epoch_numbers, blip_history['loss'], label='Training Loss', color='#2ecc71', linewidth=2, marker='o', markersize=4)
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Loss', color='#2ecc71', fontweight='bold')
ax1.tick_params(axis='y', labelcolor='#2ecc71')
ax1.grid(True, alpha=0.3)

# Accuracy (Right Axis)
#  create a twin axis sharing the same x-axis
ax2 = ax1.twinx()
line2 = ax2.plot(epoch_numbers, blip_history['val_acc'], label='Val Accuracy', color='#e67e22', linestyle='--', linewidth=2, marker='s', markersize=4)
ax2.set_ylabel('Accuracy', color='#e67e22', fontweight='bold')
ax2.set_ylim(0, 1.0)
ax2.tick_params(axis='y', labelcolor='#e67e22')

# Title
plt.title('BLIP Training Dynamics (Loss vs Accuracy)', fontsize=14, fontweight='bold')

# Combine Legends from both axes
# Add the lines together so they appear in one box. Dedicated names are used so
# the global `labels` from the evaluation cells is not overwritten.
legend_lines = line1 + line2
legend_labels = [line.get_label() for line in legend_lines]
ax1.legend(legend_lines, legend_labels, loc='center right')

plt.tight_layout()
plt.savefig(os.path.join(DRIVE_FOLDER, "blip_learning_curve.png"), dpi=300)
plt.show()
print("Saved blip_learning_curve.png")

# BLIP CONFUSION MATRIX
cm = confusion_matrix(all_labels, all_preds)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
            xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'],
            annot_kws={"size": 16})
plt.title('Confusion Matrix (BLIP)', fontsize=14, fontweight='bold')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.tight_layout()
plt.savefig(os.path.join(DRIVE_FOLDER, "blip_confusion_matrix.png"), dpi=300)
plt.show()
print("Saved blip_confusion_matrix.png")

# BLIP METRICS BAR CHART
report = classification_report(all_labels, all_preds, target_names=['No', 'Yes'], output_dict=True)
# Keep only the first two rows of the transposed report (the per-class rows)
metrics_df = pd.DataFrame(report).transpose().iloc[:2]

# Draw on an explicit axes: DataFrame.plot() without `ax` opens a figure of its
# own, which would leave a separately created plt.figure() behind as an empty plot.
fig_metrics, ax_metrics = plt.subplots(figsize=(8, 6))
metrics_df[['precision', 'recall', 'f1-score']].plot(kind='bar', color=['#a8dab5', '#4dbbd5', '#e64b35'], ax=ax_metrics)
ax_metrics.set_title('BLIP Detailed Metrics', fontsize=14, fontweight='bold')
ax_metrics.set_ylim(0, 1.0)
ax_metrics.legend(loc='lower center', ncol=3)
ax_metrics.grid(axis='y', alpha=0.3)
ax_metrics.tick_params(axis='x', rotation=0)
fig_metrics.tight_layout()
fig_metrics.savefig(os.path.join(DRIVE_FOLDER, "blip_metrics_bar.png"), dpi=300)
plt.show()
print("Saved blip_metrics_bar.png")

# **Final Comparison Visualization**

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import shutil

# Setup Drive Path
DRIVE_FOLDER = "/content/drive/MyDrive/VQA_Assignment_Results"
os.makedirs(DRIVE_FOLDER, exist_ok=True)

# FINAL COMPARISON (CNN vs BLIP)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Define Data
# CNN (transcribed from its classification report): Prec=0.67, Rec=0.47, F1=0.56 | Acc=56.93
# BLIP: computed from this run's test predictions (all_labels / all_preds), so the
# table and charts always agree with the evaluation instead of hand-copied values.

metrics_labels = ['Precision', 'Recall', 'F1-Score']
cnn_metrics = [0.67, 0.47, 0.56]
blip_metrics = [
    round(float(precision_score(all_labels, all_preds, pos_label=1)), 2),
    round(float(recall_score(all_labels, all_preds, pos_label=1)), 2),
    round(float(f1_score(all_labels, all_preds, pos_label=1)), 2),
]

accuracies = [56.93, round(float(accuracy_score(all_labels, all_preds)) * 100, 2)]
model_names = ['CNN Baseline', 'BLIP (VLP)']

# Create Detailed Comparison Table
# Every cell is derived from the lists above (single source of truth), including
# the signed improvement of BLIP over the CNN baseline.
cnn_acc_pct, blip_acc_pct = accuracies
comparison_df = pd.DataFrame({
    'Metric': ['Test Accuracy'] + [f"{name} (Yes)" for name in metrics_labels],
    'CNN Baseline': [f"{cnn_acc_pct:.2f}%"] + [f"{value:.2f}" for value in cnn_metrics],
    'BLIP (VLP)':   [f"{blip_acc_pct:.2f}%"] + [f"{value:.2f}" for value in blip_metrics],
    'Improvement':  [f"{blip_acc_pct - cnn_acc_pct:+.2f}%"]
                    + [f"{blip_value - cnn_value:+.2f}" for cnn_value, blip_value in zip(cnn_metrics, blip_metrics)]
})

print("\nFINAL PERFORMANCE COMPARISON TABLE")
print("-"*55)
print(comparison_df.to_string(index=False))
print("-"*55)

# Save Table to CSV
comparison_df.to_csv(os.path.join(DRIVE_FOLDER, "final_comparison_table.csv"), index=False)


# Create Comparison Plots
fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# Overall Accuracy
bars1 = ax[0].bar(model_names, accuracies, color=['#95a5a6', '#2ecc71'], width=0.5)
ax[0].set_ylim(0, 85)
ax[0].set_ylabel('Accuracy (%)')
ax[0].set_title('Overall Test Accuracy', fontsize=14, fontweight='bold')
ax[0].grid(axis='y', alpha=0.3)

# Add labels (fixed two decimals so computed values render cleanly)
for bar in bars1:
    yval = bar.get_height()
    ax[0].text(bar.get_x() + bar.get_width()/2, yval + 1, f"{yval:.2f}%", ha='center', fontweight='bold', fontsize=12)


# Detailed Metrics (Grouped Bar Chart)
x = np.arange(len(metrics_labels))  # Label locations
width = 0.35  # Width of the bars

# CNN bars sit left of each tick, BLIP bars right of it
rects1 = ax[1].bar(x - width/2, cnn_metrics, width, label='CNN Baseline', color='#95a5a6')
rects2 = ax[1].bar(x + width/2, blip_metrics, width, label='BLIP (VLP)', color='#3498db')

ax[1].set_ylabel('Score (0-1)')
ax[1].set_title('Detailed Metrics for "Yes" Class (Pathology)', fontsize=14, fontweight='bold')
ax[1].set_xticks(x)
ax[1].set_xticklabels(metrics_labels, fontsize=11)
ax[1].set_ylim(0, 1.0)
ax[1].legend()
ax[1].grid(axis='y', alpha=0.3)

# Add labels on top of bars
def autolabel(rects, target_ax=None):
    """Write each bar's height (two decimals) just above the bar.

    Args:
        rects: Iterable of bar objects exposing ``get_height()``, ``get_x()``
            and ``get_width()``.
        target_ax: Axes-like object whose ``annotate`` method draws the text.
            Defaults to the module-level ``ax[1]``, i.e. the previous behaviour.
    """
    if target_ax is None:
        target_ax = ax[1]
    for rect in rects:
        height = rect.get_height()
        target_ax.annotate(f'{height:.2f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),  # top centre of the bar
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom', fontweight='bold')

# Annotate both bar groups (CNN first, then BLIP) with their values
for bar_group in (rects1, rects2):
    autolabel(bar_group)

# Write the detailed comparison figure to Drive before displaying it
plt.tight_layout()
detailed_chart_path = os.path.join(DRIVE_FOLDER, "final_comparison_chart_detailed.png")
plt.savefig(detailed_chart_path, dpi=300)
plt.show()

# Final Backup: copy the local best checkpoint into the Drive results folder
print("Backing up BLIP model weights...")
weights_backup_path = os.path.join(DRIVE_FOLDER, "blip_best_model.pth")
shutil.copy("best_blip_model.pth", weights_backup_path)
print("Saved BLIP Model Weights and Final Charts to Drive.")