# ELEC 475 Lab 4: Model Evaluation - Recall@K Comparison

**Evaluate trained models and compare Recall@K metrics**

---

## ⚠️ Before Running:

1. **Upload your model checkpoints** as Kaggle datasets:
   - Create dataset: `your-username/clip-baseline-model` with `best_model.pth`
   - Create dataset: `your-username/clip-batchnorm-model` with `best_model.pth`
   - Create dataset: `your-username/clip-dropout-model` with `best_model.pth` (if trained)

2. **Add datasets**:
   - `jeffaudi/coco-2014-dataset-for-yolov3`
   - `jacobbadali2/elec-475-lab4`
   - Your model checkpoint datasets

3. **Enable GPU**: T4 or P100
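4. **Enable Internet**: ON (needed to `pip install` the dependencies, clone the lab repository, and download the pretrained CLIP weights)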

---

## 1. Environment Check

In [None]:
import os
import torch

# Report whether this is a Kaggle kernel and which accelerator is visible.
rule = "=" * 80
on_kaggle = 'KAGGLE_KERNEL_RUN_TYPE' in os.environ
has_cuda = torch.cuda.is_available()

print(rule)
print("ENVIRONMENT CHECK")
print(rule)
print(f"Kaggle: {on_kaggle}")
print(f"CUDA: {has_cuda}")
if has_cuda:
    # Device 0 is the only GPU queried.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1e9:.2f} GB")
print(rule)

## 2. Install Dependencies

In [None]:
%%time
# Quiet (-q) install of the third-party packages used by this notebook and by
# the dataset.py it generates (torchvision, pillow).
# NOTE: `!` shell commands do not raise on failure, so the message below is
# printed even if pip exits with an error.
!pip install -q transformers torch torchvision tqdm pillow matplotlib
print("✓ Dependencies installed")

## 3. Clone Repository & Setup

In [None]:
%%time
import shutil

if os.path.exists('475_ML-CV_Labs'):
    shutil.rmtree('475_ML-CV_Labs')
    print("✓ Removed old repo")

!git clone https://github.com/Jcub05/475_ML-CV_Labs.git
os.chdir('475_ML-CV_Labs/Lab4')
print(f"✓ Fresh clone complete\nDirectory: {os.getcwd()}")

In [None]:
# Use Kaggle-optimized files
import shutil

# Back up the repository's loader only once: on a re-run dataset.py already
# holds the generated version, and copying it again would overwrite the
# original backup with it.
if not os.path.exists('dataset_original.py'):
    shutil.copy('dataset.py', 'dataset_original.py')

# Write fixed dataset.py directly to ensure subset logic is present
dataset_code = r'''"""
Simplified COCO dataset loader for Kaggle that uses cached text embeddings.
Compatible with embeddings cached by cache_text_embeddings.py
Uses ALL 5 captions per image for 5x more training data.
"""
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from pathlib import Path
import os


class COCOCachedDataset(Dataset):
    """Image / cached-text-embedding pairs, one sample per cached caption.

    The embeddings file is a dict keyed by "<image_id>_<caption_idx>"
    (e.g. "391895_0"). Every entry whose image file exists under images_dir
    becomes one sample, so an image with several cached captions appears
    several times in the dataset.

    Args:
        images_dir: Directory holding COCO_<split>2014_<12-digit id>.jpg
            files. <split> is "train" when the path contains "train",
            otherwise "val".
        embeddings_file: Path of the cached embeddings dict (torch.save format).
        transform: Optional callable applied to each RGB PIL image.
    """

    def __init__(self, images_dir, embeddings_file, transform=None):
        self.images_dir = Path(images_dir)
        self.transform = transform

        print(f"Loading embeddings from {embeddings_file}...")
        # Force CPU storage: a cache saved from GPU tensors would otherwise
        # fail to load on a CPU-only host, or end up on the GPU and be indexed
        # from DataLoader worker processes.
        embeddings_cache = torch.load(embeddings_file, map_location='cpu')

        split_name = 'train' if 'train' in str(images_dir) else 'val'

        self.image_ids = []
        self.image_paths = []
        self.caption_indices = []
        kept_embeddings = []
        missing_count = 0

        # image_id -> Path when the file exists, None when it is missing.
        # Captions of the same image share one filesystem check.
        path_by_id = {}

        for key, embedding in embeddings_cache.items():
            # Split on the LAST underscore: "391895_0" -> ("391895", "0").
            image_id_str, caption_idx = key.rsplit('_', 1)
            image_id = int(image_id_str)
            caption_idx = int(caption_idx)

            if image_id not in path_by_id:
                candidate = self.images_dir / f"COCO_{split_name}2014_{image_id:012d}.jpg"
                path_by_id[image_id] = candidate if candidate.exists() else None

            img_path = path_by_id[image_id]
            if img_path is None:
                # Counted per caption, i.e. per skipped sample.
                missing_count += 1
                continue

            self.image_ids.append(image_id)
            self.image_paths.append(img_path)
            self.caption_indices.append(caption_idx)
            kept_embeddings.append(embedding)

        # torch.stack rejects an empty list, so fall back to an empty tensor.
        self.embeddings = torch.stack(kept_embeddings) if kept_embeddings else torch.empty(0)

        print(f"✓ Loaded {len(self)} samples with embeddings")
        if missing_count > 0:
            print(f"  ⚠ Skipped {missing_count} samples with missing images")

    def __len__(self):
        """Return the number of (image, caption) samples."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Return a dict with keys 'image', 'text_embedding' and 'image_id'."""
        # The context manager closes the file as soon as the pixels have been
        # converted; convert() returns an independent image.
        with Image.open(self.image_paths[idx]) as img:
            image = img.convert('RGB')

        if self.transform:
            image = self.transform(image)

        return {
            'image': image,
            'text_embedding': self.embeddings[idx],
            'image_id': self.image_ids[idx]
        }


def create_dataloaders(config=None, data_root=None, train_images_dir=None,
                       val_images_dir=None, cache_dir=None, batch_size=64,
                       num_workers=2, pin_memory=True, **kwargs):
    """
    Build the train and validation DataLoaders from cached text embeddings.

    Settings are read from config when it exposes train_images_path (the
    Config object); otherwise they come from the individual arguments.

    Args:
        config: Optional Config-like object. When used, its batch_size,
            num_workers and pin_memory override the keyword arguments.
        data_root: Dataset root; required when no usable config is given.
        train_images_dir: Train image directory relative to data_root
            (default "images/train2014").
        val_images_dir: Validation image directory relative to data_root
            (default "images/val2014").
        cache_dir: Directory containing text_embeddings_train.pt and
            text_embeddings_val.pt (default "/kaggle/input/elec-475-lab4").
        batch_size: DataLoader batch size.
        num_workers: DataLoader worker count.
        pin_memory: DataLoader pin_memory flag.
        **kwargs: Recognised keys are use_subset (bool), subset_size (int)
            and subset_seed (int, optional). Other keys are accepted and
            ignored so existing call sites keep working.

    Returns:
        (train_loader, val_loader)

    Raises:
        TypeError: If neither a usable config nor data_root is supplied.
    """

    # Handle Config object or individual arguments
    if config is not None and hasattr(config, 'train_images_path'):
        train_images_path = config.train_images_path
        val_images_path = config.val_images_path
        cache_path = config.cache_path
        batch_size = config.batch_size
        num_workers = config.num_workers
        pin_memory = config.pin_memory
        image_size = config.image_size
        clip_mean = config.clip_mean
        clip_std = config.clip_std
    else:
        if data_root is None:
            # Path(None) would fail with an opaque message; say what is missing.
            raise TypeError(
                "create_dataloaders() needs data_root when no Config object "
                "with train_images_path is supplied"
            )
        train_images_path = Path(data_root) / (train_images_dir or "images/train2014")
        val_images_path = Path(data_root) / (val_images_dir or "images/val2014")
        cache_path = Path(cache_dir) if cache_dir else Path("/kaggle/input/elec-475-lab4")
        image_size = 224
        clip_mean = (0.48145466, 0.4578275, 0.40821073)
        clip_std = (0.26862954, 0.26130258, 0.27577711)

    # CLIP preprocessing
    transform = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=clip_mean, std=clip_std)
    ])

    # Create datasets
    train_dataset = COCOCachedDataset(
        images_dir=train_images_path,
        embeddings_file=cache_path / "text_embeddings_train.pt",
        transform=transform
    )

    val_dataset = COCOCachedDataset(
        images_dir=val_images_path,
        embeddings_file=cache_path / "text_embeddings_val.pt",
        transform=transform
    )

    # Subsetting is enabled when either the keyword or the config asks for it.
    use_subset = bool(kwargs.get('use_subset', False) or getattr(config, 'use_subset', False))

    # Precedence: explicit keyword, then config.subset_size, then 10000.
    # A truthy default inside kwargs.get() would hide the config value.
    subset_size = kwargs.get('subset_size')
    if subset_size is None:
        subset_size = getattr(config, 'subset_size', None)
    if subset_size is None:
        subset_size = 10000

    # Only the validation set is subsetted; the training set is left whole.
    if use_subset:
        print(f"Subsetting validation set to {subset_size} samples...")
        if len(val_dataset) > subset_size:
            # An optional subset_seed selects the same samples on every call;
            # without it the global torch RNG is used.
            subset_seed = kwargs.get('subset_seed')
            generator = None
            if subset_seed is not None:
                generator = torch.Generator().manual_seed(subset_seed)
            permutation = torch.randperm(len(val_dataset), generator=generator)
            # Plain ints, so the dataset is indexed with ints rather than tensors.
            indices = permutation[:subset_size].tolist()
            val_dataset = torch.utils.data.Subset(val_dataset, indices)
            print(f"✓ Validation subset created: {len(val_dataset)} samples")

    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=pin_memory
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=pin_memory
    )

    return train_loader, val_loader
'''

# The generated source contains non-ASCII characters (✓, ⚠), so write it as
# UTF-8 explicitly rather than relying on the platform's default encoding.
with open('dataset.py', 'w', encoding='utf-8') as f:
    f.write(dataset_code)

print("✓ Using Kaggle-compatible dataset loader (with subset fix)")

# Keep the first backup: on a re-run metrics.py is already the Kaggle version,
# and copying it again would overwrite the original.
if not os.path.exists('metrics_original.py'):
    shutil.copy('metrics.py', 'metrics_original.py')
shutil.copy('metrics_kaggle.py', 'metrics.py')
print("✓ Using GPU-optimized metrics")


## 4. Configure for Kaggle

In [None]:
# Create Kaggle config
from pathlib import Path

# Remove any config.py already in the working directory (for example one
# that came with the clone) before the Kaggle version is written.
stale_config = 'config.py'
if os.path.exists(stale_config):
    os.remove(stale_config)

config_code = '''
import os
from dataclasses import dataclass
from pathlib import Path
import torch

@dataclass
class Config:
    """Settings for the Kaggle run; derived Path attributes are added in __post_init__."""

    # --- Input locations ---
    is_kaggle: bool = True
    data_root: str = "/kaggle/input/coco-2014-dataset-for-yolov3/coco2014"
    text_embeddings_path: str = "/kaggle/input/elec-475-lab4"
    train_images_dir: str = "images/train2014"
    val_images_dir: str = "images/val2014"
    train_captions_file: str = "annotations/instances_train2014.json"
    val_captions_file: str = "annotations/instances_val2014.json"
    # --- Output locations (relative to /kaggle/working) ---
    checkpoint_dir: str = "checkpoints"
    results_dir: str = "results"
    # --- Model and preprocessing ---
    embed_dim: int = 512
    image_size: int = 224
    pretrained_resnet: bool = True
    clip_mean: tuple = (0.48145466, 0.4578275, 0.40821073)
    clip_std: tuple = (0.26862954, 0.26130258, 0.27577711)
    clip_model_name: str = "openai/clip-vit-base-patch32"
    # --- Optimisation ---
    batch_size: int = 32
    num_epochs: int = 10
    learning_rate: float = 5e-5
    weight_decay: float = 0.05
    temperature: float = 0.07
    use_scheduler: bool = True
    scheduler_type: str = "cosine"
    optimizer_type: str = "adamw"
    beta1: float = 0.9
    beta2: float = 0.999
    eps: float = 1e-8
    max_grad_norm: float = 1.0
    # --- Runtime ---
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    num_workers: int = 2
    pin_memory: bool = True
    use_amp: bool = True
    use_subset: bool = False
    subset_size: int = 10000
    use_cached_embeddings: bool = True
    # --- Evaluation and logging ---
    eval_every_n_epochs: int = 1
    save_best_only: bool = False
    recall_k_values: list = None  # None is replaced by [1, 5, 10] in __post_init__
    num_visualization_samples: int = 10
    save_visualizations: bool = True
    log_interval: int = 100
    verbose: bool = True

    def __post_init__(self):
        """Fill the default K values and derive the Path attributes."""
        if self.recall_k_values is None:
            # A fresh list per instance, so instances never share one object.
            self.recall_k_values = [1, 5, 10]

        root = Path(self.data_root)
        self.data_root = root
        self.train_images_path = root / self.train_images_dir
        self.val_images_path = root / self.val_images_dir
        self.train_captions_path = root / self.train_captions_file
        self.val_captions_path = root / self.val_captions_file
        self.cache_path = Path(self.text_embeddings_path)

        working_dir = Path("/kaggle/working")
        self.checkpoint_path = working_dir / self.checkpoint_dir
        self.results_path = working_dir / self.results_dir

    def create_directories(self):
        """Create the checkpoint and results directories if they are absent."""
        for output_dir in (self.checkpoint_path, self.results_path):
            os.makedirs(output_dir, exist_ok=True)

    def validate_paths(self):
        """Return True when both image directories and both embedding files exist.

        Raises:
            FileNotFoundError: Listing every required path that is missing.
        """
        missing = []
        for candidate in (
            self.train_images_path,
            self.val_images_path,
            self.cache_path / "text_embeddings_train.pt",
            self.cache_path / "text_embeddings_val.pt",
        ):
            if not candidate.exists():
                missing.append(str(candidate))
        if missing:
            raise FileNotFoundError(f"Missing: {missing}")
        return True

    def __repr__(self):
        """One-line summary of the device and main training hyper-parameters."""
        parts = [
            "Kaggle",
            f"{self.device}",
            f"Batch:{self.batch_size}",
            f"Epochs:{self.num_epochs}",
            f"LR:{self.learning_rate}",
            f"WD:{self.weight_decay}",
        ]
        return "Config(" + " | ".join(parts) + ")"

def get_config(**kwargs):
    """Build a Config from keyword overrides and create its output directories."""
    cfg = Config(**kwargs)
    cfg.create_directories()
    return cfg
'''

# Persist the generated source so the next cell can import it as `config`.
Path('config.py').write_text(config_code)

print("✓ Kaggle config created")

In [None]:
# Validate setup
from config import get_config

# get_config() also creates the checkpoint and results directories.
config = get_config()
divider = "=" * 80

print("\n" + divider)
print("CONFIGURATION")
print(divider)
print(config)
print(
    f"\nCheckpoints: {config.checkpoint_path}\n"
    f"Results: {config.results_path}\n"
    f"Text embeddings: {config.cache_path}"
)

# Raises FileNotFoundError (listing the missing paths) if an input is absent.
print("\nValidating paths...")
config.validate_paths()
print("✓ All paths valid!")
print(divider)

## 5. Define Model Checkpoints

**Update these paths** to match your uploaded checkpoint datasets (each dataset is mounted at `/kaggle/input/<dataset-name>/`):

In [None]:
# Checkpoint file to evaluate for each model variant.
# UPDATE THESE PATHS to match your Kaggle dataset names!
models_to_evaluate = {
    'Baseline': '/kaggle/input/clip-baseline-model/best_model.pth',
    'BatchNorm': '/kaggle/input/clip-batchnorm-model/best_model.pth',
    # 'Dropout': '/kaggle/input/YOUR-DROPOUT-DATASET/best_model.pth',  # Uncomment when ready
}

# Report which checkpoints are present; a missing one is only flagged here.
print("Checking model checkpoints:")
for variant, ckpt in models_to_evaluate.items():
    if not os.path.exists(ckpt):
        print(f"  ❌ {variant}: NOT FOUND - {ckpt}")
        print("     Please update the path or add the dataset!")
        continue
    print(f"  ✓ {variant}: {ckpt}")

## 6. Create Evaluation Script

In [None]:
import json
import torch
import torch.nn as nn
from tqdm import tqdm
from pathlib import Path

from dataset import create_dataloaders
from model import CLIPFineTuneModel
from model_modified import create_modified_model
from transformers import CLIPTextModel, CLIPTokenizer
from metrics import compute_retrieval_metrics

@torch.no_grad()
def evaluate_model(model, dataloader, device, config, max_samples=20000):
    """
    Embed at most max_samples validation pairs and compute Recall@K.

    Args:
        model: Called as model(images, text_embeddings=...); must return
            (image_embeds, text_embeds) or a 3-tuple whose third item is ignored.
        dataloader: Yields dict batches with 'image' and 'text_embedding'.
        device: Device used for the forward passes and the metric computation.
        config: Provides recall_k_values.
        max_samples: Hard cap on the number of pairs that are scored.

    Returns:
        Whatever compute_retrieval_metrics returns for the collected embeddings.

    Raises:
        RuntimeError: If no samples were collected (empty dataloader or
            max_samples <= 0).
    """
    model.eval()

    # Embeddings are accumulated on the CPU, one chunk per batch.
    all_image_embeds = []
    all_text_embeds = []

    print(f"Collecting embeddings (max {max_samples} samples)...")
    pbar = tqdm(dataloader, desc="Evaluation")

    samples_collected = 0
    for batch in pbar:
        remaining = max_samples - samples_collected
        if remaining <= 0:
            break

        images = batch['image'].to(device)
        text_embeddings = batch['text_embedding'].to(device)

        # Forward pass - handle both 2 and 3 return values
        outputs = model(images, text_embeddings=text_embeddings)
        if len(outputs) == 3:
            image_embeds, text_embeds, _ = outputs
        else:
            image_embeds, text_embeds = outputs

        # Trim the last batch so the cap is exact. Recall@K depends on the
        # gallery size, so overshooting by part of a batch would make runs
        # with different batch sizes incomparable.
        # (assumes dim 0 of both outputs is the batch dimension, as torch.cat
        # below already does)
        image_embeds = image_embeds[:remaining]
        text_embeds = text_embeds[:remaining]

        # Store embeddings (move to CPU to save GPU memory)
        all_image_embeds.append(image_embeds.cpu())
        all_text_embeds.append(text_embeds.cpu())

        samples_collected += image_embeds.size(0)

        # Clear GPU cache periodically
        if len(all_image_embeds) % 5 == 0:
            torch.cuda.empty_cache()

    if not all_image_embeds:
        # torch.cat on an empty list fails with a message that hides the cause.
        raise RuntimeError(
            "No samples were collected: the dataloader is empty or max_samples <= 0"
        )

    # Concatenate all embeddings
    print(f"Computing Recall@K metrics on {samples_collected} samples...")
    all_image_embeds = torch.cat(all_image_embeds, dim=0)
    all_text_embeds = torch.cat(all_text_embeds, dim=0)

    print(f"Embedding shapes: Images={all_image_embeds.shape}, Text={all_text_embeds.shape}")

    # NOTE(review): the Kaggle dataset loader written earlier in this notebook
    # yields one sample per caption, so the same image can appear several
    # times here. Confirm that compute_retrieval_metrics accounts for
    # duplicate images when it decides what counts as a hit.
    # Compute Recall@K metrics (move back to GPU for fast computation)
    recall_metrics = compute_retrieval_metrics(
        all_image_embeds.to(device),
        all_text_embeds.to(device),
        k_values=config.recall_k_values
    )

    # Clear memory
    del all_image_embeds, all_text_embeds
    torch.cuda.empty_cache()

    return recall_metrics

print("✓ Evaluation function defined (memory-efficient)")

## 7. Evaluate All Models

In [None]:
%%time

# Create validation dataloader with SUBSET to avoid OOM
print("Creating validation dataloader (20K subset)...")
_, val_loader = create_dataloaders(
    data_root=config.data_root,
    batch_size=config.batch_size,
    num_workers=2,
    pin_memory=config.pin_memory,
    use_cached_embeddings=config.use_cached_embeddings,
    use_subset=True,  # Use subset to avoid OOM
    subset_size=20000
)
print(f"✓ Validation set: {len(val_loader.dataset)} samples\n")

# Load text encoder ONCE and reuse for all models
print("Loading shared CLIP text encoder...")
text_encoder = CLIPTextModel.from_pretrained(config.clip_model_name)
tokenizer = CLIPTokenizer.from_pretrained(config.clip_model_name)
device = config.device
text_encoder = text_encoder.to(device)
text_encoder.eval()
print("✓ Text encoder loaded\n")

# Store all results
all_results = {}

# Evaluate each model
for model_name, checkpoint_path in models_to_evaluate.items():
    print("\n" + "=" * 80)
    print(f"EVALUATING: {model_name}")
    print("=" * 80)
    
    if not os.path.exists(checkpoint_path):
        print(f"❌ Checkpoint not found: {checkpoint_path}")
        print("   Skipping...")
        continue
    
    # Load checkpoint
    print(f"Loading checkpoint: {checkpoint_path}")
    checkpoint = torch.load(checkpoint_path, map_location=device)
    
    # Determine model type and create model
    if 'batchnorm' in model_name.lower():
        # BatchNorm model
        model = create_modified_model(
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            embed_dim=config.embed_dim,
            use_batchnorm=True,
            use_dropout=False,
            deeper_projection=False,
            learnable_temperature=False
        ).to(device)
    elif 'dropout' in model_name.lower():
        # Dropout model
        model = create_modified_model(
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            embed_dim=config.embed_dim,
            use_batchnorm=False,
            use_dropout=True,
            dropout_rate=0.1,
            deeper_projection=False,
            learnable_temperature=False
        ).to(device)
    else:
        # Baseline model
        model = CLIPFineTuneModel(
            embed_dim=config.embed_dim,
            pretrained_resnet=config.pretrained_resnet,
            clip_model_name=config.clip_model_name,
            freeze_text_encoder=True
        ).to(device)
    
    # Load weights
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"✓ Loaded model from epoch {checkpoint.get('epoch', 'unknown')}")
    
    # Evaluate (will use all samples from the 20K subset)
    metrics = evaluate_model(model, val_loader, device, config, max_samples=20000)
    all_results[model_name] = metrics
    
    # Print results
    print(f"\nResults for {model_name}:")
    print(f"  Image→Text R@1:  {metrics['img2txt_r1']:.2f}%")
    print(f"  Image→Text R@5:  {metrics['img2txt_r5']:.2f}%")
    print(f"  Image→Text R@10: {metrics['img2txt_r10']:.2f}%")
    print(f"  Text→Image R@1:  {metrics['txt2img_r1']:.2f}%")
    print(f"  Text→Image R@5:  {metrics['txt2img_r5']:.2f}%")
    print(f"  Text→Image R@10: {metrics['txt2img_r10']:.2f}%")
    print(f"  Average Recall:  {metrics['avg_recall']:.2f}%")
    
    # Clean up model but keep text encoder
    del model, checkpoint
    torch.cuda.empty_cache()

# Clean up text encoder
del text_encoder, tokenizer
torch.cuda.empty_cache()

print("\n" + "=" * 80)
print("ALL EVALUATIONS COMPLETE!")
print("=" * 80)

## 8. Compare Results

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Banner for the comparison table
header_rule = "=" * 80
print(f"\n{header_rule}")
print("RECALL@K COMPARISON")
print(f"{header_rule}\n")

# Table column -> key in each model's metrics dict, in display order.
metric_columns = (
    ('I→T R@1', 'img2txt_r1'),
    ('I→T R@5', 'img2txt_r5'),
    ('I→T R@10', 'img2txt_r10'),
    ('T→I R@1', 'txt2img_r1'),
    ('T→I R@5', 'txt2img_r5'),
    ('T→I R@10', 'txt2img_r10'),
    ('Avg', 'avg_recall'),
)

# One row per evaluated model, with every score formatted as a percentage.
comparison_data = []
for model_name, metrics in all_results.items():
    row = {'Model': model_name}
    for column, metric_key in metric_columns:
        row[column] = f"{metrics[metric_key]:.2f}%"
    comparison_data.append(row)

df = pd.DataFrame(comparison_data)
print(df.to_string(index=False))
print(f"\n{header_rule}")

# Persist the table next to the other notebook outputs.
df.to_csv('/kaggle/working/model_comparison.csv', index=False)
print("\n✓ Saved: /kaggle/working/model_comparison.csv")

In [None]:
# Plot Recall@K curves: one panel per retrieval direction, one line per model.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

models = list(all_results.keys())
k_values = [1, 5, 10]

# (axis, metric-key prefix, panel title) for the two retrieval directions.
panels = (
    (axes[0], 'img2txt', 'Image → Text Retrieval'),
    (axes[1], 'txt2img', 'Text → Image Retrieval'),
)

for ax, prefix, title in panels:
    for model_name in models:
        # Keys follow the pattern <prefix>_r<K>, e.g. img2txt_r5.
        scores = [all_results[model_name][f'{prefix}_r{k}'] for k in k_values]
        ax.plot(k_values, scores, marker='o', label=model_name, linewidth=2)

    ax.set_xlabel('K', fontsize=12)
    ax.set_ylabel('Recall@K (%)', fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_xticks(k_values)

plt.tight_layout()
plt.savefig('/kaggle/working/recall_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Saved: /kaggle/working/recall_comparison.png")

## 9. Save All Results

In [None]:
# Save all metrics to JSON
with open('/kaggle/working/all_model_metrics.json', 'w') as f:
    json.dump(all_results, f, indent=2)

print("\n" + "=" * 80)
print("OUTPUT FILES")
print("=" * 80)
!ls -lh /kaggle/working/

print("\n" + "=" * 80)
print("DOWNLOAD INSTRUCTIONS")
print("=" * 80)
print("""
1. Click 'Output' tab at top
2. Download all files

Key files:
  - model_comparison.csv (table of results)
  - recall_comparison.png (comparison plot)
  - all_model_metrics.json (detailed metrics)

Use these results in your lab report!
""")
print("=" * 80)

---

## ✅ Done!

**You now have:**
- Recall@K scores for all models
- Comparison table (CSV)
- Comparison plot (PNG)
- Detailed metrics (JSON)

**Next steps:**
1. Download the results
2. Analyze which modification worked best
3. Include in your lab report!

---