# Task 3: Window Soft-Argmax Evaluation

This notebook evaluates the **Window Soft-Argmax** prediction method on:
- **3 Baseline Models**: DINOv2, DINOv3, SAM
- **3 Fine-Tuned Models**: Using .pth checkpoint files

## Key Improvements over Baseline (Argmax)
1. **Sub-pixel precision**: Instead of discrete argmax, we use soft-argmax for refinement
2. **Noise robustness**: Windowed approach reduces sensitivity to local similarity noise
3. **Better PCK at strict thresholds**: Most improvements visible at α=0.05 and α=0.10

## 1. Setup & Imports

In [1]:
# Clone the project repository (Colab only).
# The existing checkout is deleted first, so every run starts from a fresh clone
# and any local edits inside it are lost.
!rm -rf semantic-correspondance
!git clone https://github.com/MarcotteS/semantic-correspondance.git

import sys
# Make the repo's src/ directory importable for the project imports that follow.
# NOTE(review): the absolute path assumes the clone above ran with /content as the
# working directory (the Colab default) — confirm before running elsewhere.
sys.path.append('/content/semantic-correspondance/src')

Cloning into 'semantic-correspondance'...
remote: Enumerating objects: 327, done.[K
remote: Counting objects: 100% (112/112), done.[K
remote: Compressing objects: 100% (83/83), done.[K
remote: Total 327 (delta 63), reused 67 (delta 24), pack-reused 215 (from 1)[K
Receiving objects: 100% (327/327), 3.45 MiB | 6.73 MiB/s, done.
Resolving deltas: 100% (186/186), done.


In [2]:
import os
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import json
from datetime import datetime
from collections import defaultdict
import pandas as pd

# Import from src
from dataset import SPairDataset, collate_fn_correspondence
from models import DINOv2Extractor, DINOv3Extractor, SAMExtractor
from correspondence import CorrespondenceMatcher
from correspondence_softargmax import WindowSoftArgmaxMatcher
from evaluation import CorrespondenceEvaluator
from analyzer import ResultsAnalyzer

# Select the compute device: CUDA when PyTorch reports it as available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## 2. Download Dataset

In [3]:
# Download SPair-71k dataset
if not os.path.exists('SPair-71k') and not os.path.exists('data/SPair-71k'):
    !wget -q https://cvlab.postech.ac.kr/research/SPair-71k/data/SPair-71k.tar.gz
    !tar -xf SPair-71k.tar.gz
    print("Dataset downloaded and extracted!")
else:
    print("Dataset already exists.")

Dataset downloaded and extracted!


## 3. Configuration

In [15]:
# ========================================
# CONFIGURATION - Edit these paths
# ========================================

# Path to the dataset
DATA_PATH = '.'  # Path where SPair-71k was extracted

# Fine-tuned model checkpoints, keyed by model variant.
# Set these paths to your fine-tuned model files; the defaults point at `last.pt`
# files. A variant whose file does not exist is skipped by its evaluation cell.
# NOTE(review): the defaults live under /content/drive, which presumably requires
# Google Drive to be mounted first — no mount step is visible in this notebook.
FINETUNED_CHECKPOINTS = {
    'dinov2_finetuned': '/content/drive/MyDrive/semantic-correspondance-project/checkpoints/dinov2_finetuned/last.pt',
    'dinov3_finetuned': '/content/drive/MyDrive/semantic-correspondance-project/checkpoints/dinov3_finetuned/last.pt',
    'sam_finetuned': '/content/drive/MyDrive/semantic-correspondance-project/checkpoints/sam_finetuned/last.pt',
}

# DINOv3 specific configuration
# You need to clone the dinov3 repo and get access to the checkpoint
DINOV3_REPO_DIR = '/content/dinov3'  # Path to cloned DINOv3 repo
# Relative path: resolved against the current working directory.
DINOV3_WEIGHTS = "dinov3_vitb16_pretrain_lvd1689m-73cec8be.pth"

# SAM specific configuration (relative path, resolved against the working directory)
SAM_CHECKPOINT_PATH = 'sam_vit_b_01ec64.pth'

# Window Soft-Argmax parameters, passed unchanged to every WindowSoftArgmaxMatcher
WINDOW_SIZE = 5
TEMPERATURE = 0.05

# Evaluation parameters
IMAGE_SIZE_DINOV2 = 518  # For DINOv2 (multiple of 14)
IMAGE_SIZE_DINOV3 = 512  # For DINOv3 (multiple of 16)
IMAGE_SIZE_SAM = 512     # For SAM
BATCH_SIZE = 16          # The SAM loaders use half of this value
NUM_WORKERS = 4

# Output directory for the metrics JSON, comparison figures and per-model reports
RESULTS_DIR = './results/task3_evaluation'

## 4. Helper Functions

In [5]:
def evaluate_model_simple(matcher, dataloader, desc="Evaluating", thresholds=(0.05, 0.10, 0.15, 0.20)):
    """
    Evaluate a correspondence matcher over every batch of a dataloader.

    Args:
        matcher: Object exposing ``extractor.model`` (switched to eval mode here)
            and ``find_correspondences(src_img, trg_img, src_kps)``.
        dataloader: Iterable of collated batches. Each batch is indexed with the
            keys 'src_img', 'trg_img', 'src_kps', 'trg_kps', 'pckthres', 'n_pts',
            'kps_ids', 'pair_idx' and 'category'.
        desc: Label shown on the tqdm progress bar.
        thresholds: PCK thresholds handed to ``CorrespondenceEvaluator``.
            Defaults to the four values reported throughout this notebook.

    Returns:
        The value of ``CorrespondenceEvaluator.get_metrics()`` after all batches
        were processed. A summary is also printed via ``print_summary``.
    """
    # Ground-truth / bookkeeping fields forwarded to the evaluator for each pair.
    per_sample_keys = ('trg_kps', 'pckthres', 'n_pts', 'kps_ids', 'pair_idx', 'category')

    evaluator = CorrespondenceEvaluator(thresholds=list(thresholds))
    matcher.extractor.model.eval()

    with torch.no_grad():
        for batch in tqdm(dataloader, desc=desc):
            src_img = batch['src_img']
            pred_kps = matcher.find_correspondences(src_img, batch['trg_img'], batch['src_kps'])

            # The batch is collated, so hand the evaluator one pair at a time.
            for b in range(src_img.shape[0]):
                sample = {key: batch[key][b] for key in per_sample_keys}
                evaluator.update(pred_kps[b], sample)

    metrics = evaluator.get_metrics()
    evaluator.print_summary(metrics)
    return metrics


def load_finetuned_weights(model, checkpoint_path):
    """
    Load fine-tuned weights from a checkpoint file into ``model`` (in place).

    Accepted checkpoint layouts:
      * a dict holding the weights under 'model', 'state_dict' or
        'model_state_dict' (checked in that order);
      * a bare state dict.
    If every weight key carries the ``module.`` prefix added by
    ``torch.nn.DataParallel`` while the target model's keys do not, the prefix
    is stripped before loading.

    Args:
        model: ``torch.nn.Module`` that receives the weights.
        checkpoint_path: Path to the checkpoint file.

    Returns:
        True when the weights were loaded, False when the file does not exist.

    Raises:
        RuntimeError: propagated from ``load_state_dict`` when the keys or
            shapes do not match the model (loading is strict).
    """
    if not os.path.exists(checkpoint_path):
        print(f"‚ö†Ô∏è Checkpoint not found: {checkpoint_path}")
        return False

    # Deserialize onto the CPU: load_state_dict copies the values into the
    # model's existing parameters, so they end up on the model's own device and
    # this helper does not depend on a notebook-level `device` variable.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

    # Handle different checkpoint formats
    state_dict = checkpoint
    for container_key in ('model', 'state_dict', 'model_state_dict'):
        if container_key in checkpoint:
            state_dict = checkpoint[container_key]
            break

    # Checkpoints saved from a DataParallel wrapper prefix every key with 'module.'.
    prefix = 'module.'
    if (
        len(state_dict) > 0
        and all(key.startswith(prefix) for key in state_dict)
        and not any(key.startswith(prefix) for key in model.state_dict())
    ):
        state_dict = {key[len(prefix):]: value for key, value in state_dict.items()}

    model.load_state_dict(state_dict)
    print(f"‚úÖ Loaded fine-tuned weights from: {checkpoint_path}")

    if 'epoch' in checkpoint:
        # The stored epoch is printed one-based.
        print(f"   Epoch: {checkpoint['epoch'] + 1}")
    if 'meta' in checkpoint:
        print(f"   Training config: {checkpoint['meta']}")

    return True


def save_results(all_results, output_dir):
    """
    Write all evaluation results to a timestamped JSON file.

    The file is named ``metrics_<YYYYmmdd_HHMMSS>.json`` and is created inside
    ``output_dir`` (which is created if missing). No plots are produced here.

    Args:
        all_results: Nested structure of dicts / lists / tuples whose leaves may
            be Python scalars, NumPy scalars, NumPy arrays or torch tensors.
            Dict keys are converted with ``str`` (so 0.05 becomes "0.05").
        output_dir: Directory that receives the JSON file.

    Returns:
        Path of the JSON file that was written.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Save raw metrics
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    metrics_path = os.path.join(output_dir, f'metrics_{timestamp}.json')

    # Convert metrics to JSON-serializable format
    def to_json(obj):
        if isinstance(obj, dict):
            return {str(k): to_json(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [to_json(v) for v in obj]
        if isinstance(obj, np.generic):
            # .item() yields the matching built-in type, so integer counts stay
            # integers and np.bool_ becomes a JSON boolean instead of failing.
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, torch.Tensor):
            return obj.cpu().tolist()
        return obj

    with open(metrics_path, 'w') as f:
        json.dump(to_json(all_results), f, indent=2)

    print(f"‚úÖ Results saved to: {metrics_path}")
    return metrics_path

## 5. Visualization Functions

In [6]:
def plot_model_comparison(all_results, save_path=None):
    """
    Draw a grouped bar chart of overall PCK: one group per threshold, one bar
    per model inside each group, with the value printed above every bar.

    Args:
        all_results: Mapping of model name -> metrics; ``metrics['overall']`` is
            indexed with the thresholds 0.05, 0.10, 0.15 and 0.20.
        save_path: When truthy, the figure is also written there (300 dpi).
    """
    thresholds = [0.05, 0.10, 0.15, 0.20]
    n_models = len(all_results)

    fig, ax = plt.subplots(figsize=(14, 8))

    group_centers = np.arange(len(thresholds))
    bar_width = 0.8 / n_models  # the bars of one group share 80% of its slot
    palette = plt.cm.tab10(np.linspace(0, 1, n_models))

    for idx, (model_name, metrics) in enumerate(all_results.items()):
        pck_values = [metrics['overall'][t] for t in thresholds]
        # Shift each model's bars so the whole group is centred on its tick.
        shift = (idx - n_models/2 + 0.5) * bar_width
        bars = ax.bar(group_centers + shift, pck_values, bar_width,
                      label=model_name, color=palette[idx], alpha=0.8)

        # Add value labels
        for bar, val in zip(bars, pck_values):
            label_x = bar.get_x() + bar.get_width()/2.
            label_y = bar.get_height() + 0.5
            ax.text(label_x, label_y, f'{val:.1f}',
                    ha='center', va='bottom', fontsize=8, rotation=90)

    ax.set_xlabel('PCK Threshold (Œ±)', fontsize=12)
    ax.set_ylabel('PCK (%)', fontsize=12)
    ax.set_title('Task 3: Window Soft-Argmax - Model Comparison', fontsize=14, fontweight='bold')
    ax.set_xticks(group_centers)
    ax.set_xticklabels([f'Œ±={t:.2f}' for t in thresholds])
    ax.set_ylim([0, 100])
    ax.legend(loc='upper left', fontsize=10)
    ax.grid(True, axis='y', alpha=0.3)

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()


def plot_baseline_vs_finetuned(all_results, save_path=None):
    """
    Plot overall PCK-vs-threshold curves in two side-by-side panels: baseline
    models on the left, fine-tuned models on the right.

    A model counts as fine-tuned when its name contains 'finetuned'
    (case-insensitive); every other model is treated as a baseline.

    Args:
        all_results: Mapping of model name -> metrics; ``metrics['overall']`` is
            indexed with the thresholds 0.05, 0.10, 0.15 and 0.20.
        save_path: When truthy, the figure is also written there (300 dpi).
    """
    thresholds = [0.05, 0.10, 0.15, 0.20]

    # Split the models into the two groups, keeping their original order.
    baselines, finetuned = {}, {}
    for name, model_metrics in all_results.items():
        group = finetuned if 'finetuned' in name.lower() else baselines
        group[name] = model_metrics

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # (axis, models, marker, title) for the left and right panel respectively.
    panels = (
        (axes[0], baselines, 'o', 'Baseline Models (Soft-Argmax)'),
        (axes[1], finetuned, 's', 'Fine-Tuned Models (Soft-Argmax)'),
    )
    for ax, models, marker, title in panels:
        for model_name, metrics in models.items():
            pck_values = [metrics['overall'][t] for t in thresholds]
            ax.plot(thresholds, pck_values, marker=marker, linewidth=2, markersize=8, label=model_name)
        ax.set_xlabel('PCK Threshold (Œ±)', fontsize=12)
        ax.set_ylabel('PCK (%)', fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.legend(fontsize=10)
        ax.grid(True, alpha=0.3)
        ax.set_ylim([0, 100])

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()


def plot_per_category_comparison(all_results, threshold=0.10, save_path=None):
    """
    Draw a grouped bar chart of per-category PCK at a single threshold, with
    one bar per model inside each category group.

    The category list is taken (sorted) from the first model in ``all_results``;
    a model that lacks a category, or lacks the threshold for it, is drawn as 0.

    Args:
        all_results: Mapping of model name -> metrics, where
            ``metrics['per_category'][category][threshold]`` is a PCK value.
        threshold: Threshold key looked up inside each category entry.
        save_path: When truthy, the figure is also written there (300 dpi).
    """
    # Get all categories from first model
    reference_metrics = list(all_results.values())[0]
    categories = sorted(reference_metrics['per_category'].keys())

    n_models = len(all_results)

    fig, ax = plt.subplots(figsize=(18, 8))

    group_centers = np.arange(len(categories))
    bar_width = 0.8 / n_models
    palette = plt.cm.tab10(np.linspace(0, 1, n_models))

    for idx, (model_name, metrics) in enumerate(all_results.items()):
        heights = []
        for cat in categories:
            heights.append(metrics['per_category'].get(cat, {}).get(threshold, 0))
        # Shift each model's bars so the whole group is centred on its tick.
        shift = (idx - n_models/2 + 0.5) * bar_width
        ax.bar(group_centers + shift, heights, bar_width,
               label=model_name, color=palette[idx], alpha=0.8)

    ax.set_xlabel('Category', fontsize=12)
    ax.set_ylabel(f'PCK@{threshold:.2f} (%)', fontsize=12)
    ax.set_title(f'Per-Category Performance Comparison (PCK@{threshold:.2f})', fontsize=14, fontweight='bold')
    ax.set_xticks(group_centers)
    ax.set_xticklabels(categories, rotation=45, ha='right')
    ax.set_ylim([0, 100])
    ax.legend(loc='upper right', fontsize=9)
    ax.grid(True, axis='y', alpha=0.3)

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()


def create_summary_table(all_results):
    """
    Build a DataFrame with one row per model and one formatted PCK column per
    threshold.

    Args:
        all_results: Mapping of model name -> metrics; ``metrics['overall']`` is
            indexed with the thresholds 0.05, 0.10, 0.15 and 0.20.

    Returns:
        ``pandas.DataFrame`` with a 'Model' column followed by 'PCK@0.05' ...
        'PCK@0.20', whose cells are strings such as '55.85%'.
    """
    thresholds = [0.05, 0.10, 0.15, 0.20]

    rows = [
        {
            'Model': model_name,
            **{f'PCK@{t:.2f}': f"{metrics['overall'][t]:.2f}%" for t in thresholds},
        }
        for model_name, metrics in all_results.items()
    ]
    return pd.DataFrame(rows)

## 6. Load Dataset

In [7]:
# Load test dataset for DINOv2 models
print("Loading SPair-71k test dataset for DINOv2 (518x518)...")
test_dataset_dinov2 = SPairDataset(
    datapath=DATA_PATH,
    split='test',
    img_size=IMAGE_SIZE_DINOV2,
    category='all'
)

test_loader_dinov2 = DataLoader(
    test_dataset_dinov2,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    collate_fn=collate_fn_correspondence
)

print(f"Test dataset DINOv2: {len(test_dataset_dinov2)} pairs")

Loading SPair-71k test dataset for DINOv2 (518x518)...
Loading SPair-71k test annotations...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12234/12234 [00:01<00:00, 6629.56it/s]

Test dataset DINOv2: 12234 pairs





## 7. Evaluate Baseline Models with Window Soft-Argmax

### 7.1 DINOv2 Baseline

In [8]:
# Store all results, keyed by model label.
# NOTE: re-running this cell resets `all_results`, discarding metrics that any
# later evaluation cell has already added.
all_results = {}

print("="*70)
print("EVALUATING: DINOv2 Baseline + Window Soft-Argmax")
print("="*70)

# Initialize DINOv2 extractor
dinov2_extractor = DINOv2Extractor(model_name="dinov2_vitb14")

# Create Window Soft-Argmax matcher with the shared window/temperature settings
dinov2_matcher = WindowSoftArgmaxMatcher(
    feature_extractor=dinov2_extractor,
    window_size=WINDOW_SIZE,
    temperature=TEMPERATURE
)

# Evaluate using DINOv2 loader (long-running: the recorded run took about 40 minutes)
metrics_dinov2 = evaluate_model_simple(dinov2_matcher, test_loader_dinov2, desc="DINOv2 Baseline")
all_results['DINOv2_SoftArgmax'] = metrics_dinov2

EVALUATING: DINOv2 Baseline + Window Soft-Argmax
Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip




Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vitb14_pretrain.pth


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 330M/330M [00:01<00:00, 261MB/s]
DINOv2 Baseline: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 765/765 [40:20<00:00,  3.16s/it]


EVALUATION SUMMARY

üìä Overall PCK:
----------------------------------------------------------------------
  PCK@0.05: 39.33%
  PCK@0.10: 55.85%
  PCK@0.15: 64.27%
  PCK@0.20: 70.54%

üìÅ Per-Category PCK:
----------------------------------------------------------------------
  aeroplane       ‚Üí 0.05: 56.85% | 0.10: 70.54% | 0.15: 77.04% | 0.20: 82.13%
  bicycle         ‚Üí 0.05: 45.63% | 0.10: 62.34% | 0.15: 69.22% | 0.20: 75.66%
  bird            ‚Üí 0.05: 69.89% | 0.10: 86.89% | 0.15: 91.84% | 0.20: 95.12%
  boat            ‚Üí 0.05: 19.37% | 0.10: 34.82% | 0.15: 45.36% | 0.20: 53.28%
  bottle          ‚Üí 0.05: 25.82% | 0.10: 43.57% | 0.15: 52.82% | 0.20: 60.07%
  bus             ‚Üí 0.05: 37.56% | 0.10: 51.43% | 0.15: 57.37% | 0.20: 62.19%
  car             ‚Üí 0.05: 33.80% | 0.10: 48.78% | 0.15: 54.43% | 0.20: 59.08%
  cat             ‚Üí 0.05: 61.44% | 0.10: 69.36% | 0.15: 75.56% | 0.20: 81.51%
  chair           ‚Üí 0.05: 24.01% | 0.10: 37.46% | 0.15: 45.65% | 0.20: 54.22%




### 7.2 DINOv3 Baseline (Optional - requires setup)

In [12]:
# Clone DINOv3 repo if needed
!git clone https://github.com/facebookresearch/dinov3.git
print("DINOv3 repo cloned.")

print("DINOv3 repo already exists.")

# Install torchmetrics (required by DINOv3)
!pip install -q torchmetrics

fatal: destination path 'dinov3' already exists and is not an empty directory.
DINOv3 repo cloned.
DINOv3 repo already exists.


In [16]:
# Sanity check on the local checkpoint: a file smaller than 1 MiB cannot be the
# real weights, so warn early (corrupted checkpoints were seen before).
if os.path.exists(DINOV3_WEIGHTS) and os.path.getsize(DINOV3_WEIGHTS) < 1024*1024:
    print("‚ö†Ô∏è WARNING: DINOv3 checkpoint file looks too small! Check your path.")

RUN_DINOV3 = True  # Set to False to skip the DINOv3 evaluations (baseline and fine-tuned)

if RUN_DINOV3 and os.path.exists(DINOV3_WEIGHTS):
    print("="*70)
    print("EVALUATING: DINOv3 Baseline + Window Soft-Argmax")
    print("="*70)

    # Remove any copy of this checkpoint from the torch hub cache so that the
    # local file is used rather than a stale cached one.
    repo_cache = os.path.join(torch.hub.get_dir(), "checkpoints", os.path.basename(DINOV3_WEIGHTS))
    if os.path.exists(repo_cache):
        try:
            os.remove(repo_cache)
        except OSError as exc:
            # Best effort only: keep going, but say why the cache entry is still there.
            print(f"Could not remove cached DINOv3 checkpoint {repo_cache}: {exc}")

    # DINOv3 uses its own input size (512, a multiple of 16)
    test_dataset_dinov3 = SPairDataset(
        datapath=DATA_PATH,
        split='test',
        img_size=IMAGE_SIZE_DINOV3,  # 512
        category='all'
    )
    # This loader is reused by the DINOv3 fine-tuned evaluation cell.
    test_loader_dinov3 = DataLoader(
        test_dataset_dinov3, batch_size=BATCH_SIZE, shuffle=False,
        num_workers=NUM_WORKERS, collate_fn=collate_fn_correspondence
    )

    dinov3_extractor = DINOv3Extractor(
        repo_dir=DINOV3_REPO_DIR,
        weights=DINOV3_WEIGHTS
    )

    dinov3_matcher = WindowSoftArgmaxMatcher(
        feature_extractor=dinov3_extractor,
        window_size=WINDOW_SIZE,
        temperature=TEMPERATURE
    )

    metrics_dinov3 = evaluate_model_simple(dinov3_matcher, test_loader_dinov3, desc="DINOv3 Baseline")
    all_results['DINOv3_SoftArgmax'] = metrics_dinov3
else:
    print(f"‚ö†Ô∏è Skipping DINOv3 - configure DINOV3_WEIGHTS (Local path) and set RUN_DINOV3=True")

EVALUATING: DINOv3 Baseline + Window Soft-Argmax
Loading SPair-71k test annotations...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12234/12234 [00:02<00:00, 5087.08it/s]


Downloading: "file:///content/dinov3_vitb16_pretrain_lvd1689m-73cec8be.pth" to /root/.cache/torch/hub/checkpoints/dinov3_vitb16_pretrain_lvd1689m-73cec8be.pth


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 327M/327M [00:06<00:00, 53.5MB/s]
DINOv3 Baseline: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 765/765 [32:00<00:00,  2.51s/it]


EVALUATION SUMMARY

üìä Overall PCK:
----------------------------------------------------------------------
  PCK@0.05: 33.44%
  PCK@0.10: 54.05%
  PCK@0.15: 63.03%
  PCK@0.20: 68.77%

üìÅ Per-Category PCK:
----------------------------------------------------------------------
  aeroplane       ‚Üí 0.05: 36.86% | 0.10: 55.22% | 0.15: 62.21% | 0.20: 67.58%
  bicycle         ‚Üí 0.05: 32.15% | 0.10: 50.88% | 0.15: 58.48% | 0.20: 65.08%
  bird            ‚Üí 0.05: 52.97% | 0.10: 77.48% | 0.15: 84.27% | 0.20: 88.06%
  boat            ‚Üí 0.05: 11.34% | 0.10: 25.52% | 0.15: 34.20% | 0.20: 41.29%
  bottle          ‚Üí 0.05: 25.21% | 0.10: 46.20% | 0.15: 54.56% | 0.20: 60.28%
  bus             ‚Üí 0.05: 32.58% | 0.10: 48.12% | 0.15: 55.09% | 0.20: 58.87%
  car             ‚Üí 0.05: 32.21% | 0.10: 48.94% | 0.15: 54.65% | 0.20: 59.19%
  cat             ‚Üí 0.05: 59.62% | 0.10: 71.62% | 0.15: 76.46% | 0.20: 80.67%
  chair           ‚Üí 0.05: 19.91% | 0.10: 37.29% | 0.15: 46.25% | 0.20: 52.52%




### 7.3 SAM Baseline (Optional - requires checkpoint)

In [10]:
# Download SAM checkpoint if needed
# NOTE(review): wget saves the file under its URL basename in the working
# directory, which matches the default SAM_CHECKPOINT_PATH; if that path is
# changed the file will not land there. `wget -q` is silent and `!` does not
# raise on failure, so the message below is printed even when the download fails.
if not os.path.exists(SAM_CHECKPOINT_PATH):
    !wget -q https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth
    print("SAM checkpoint downloaded.")

# Install segment-anything
!pip install -q segment-anything

SAM checkpoint downloaded.


In [11]:
RUN_SAM = True  # Set to False to skip the SAM evaluations (baseline and fine-tuned)

# Evaluate only when enabled and the SAM checkpoint file is present.
if RUN_SAM and os.path.exists(SAM_CHECKPOINT_PATH):
    print("="*70)
    print("EVALUATING: SAM Baseline + Window Soft-Argmax")
    print("="*70)

    # SAM uses 512x512 images
    test_dataset_sam = SPairDataset(
        datapath=DATA_PATH,
        split='test',
        img_size=IMAGE_SIZE_SAM,
        category='all'
    )
    # This loader is reused by the SAM fine-tuned evaluation cell.
    test_loader_sam = DataLoader(
        test_dataset_sam, batch_size=BATCH_SIZE // 2, shuffle=False,  # Smaller batch for memory
        num_workers=NUM_WORKERS, collate_fn=collate_fn_correspondence
    )

    sam_extractor = SAMExtractor(
        model_type="vit_b",
        checkpoint_path=SAM_CHECKPOINT_PATH,
        image_size=IMAGE_SIZE_SAM
    )

    # Same window/temperature settings as the other models.
    sam_matcher = WindowSoftArgmaxMatcher(
        feature_extractor=sam_extractor,
        window_size=WINDOW_SIZE,
        temperature=TEMPERATURE
    )

    metrics_sam = evaluate_model_simple(sam_matcher, test_loader_sam, desc="SAM Baseline")
    all_results['SAM_SoftArgmax'] = metrics_sam
else:
    print("‚ö†Ô∏è Skipping SAM - configure and set RUN_SAM=True")

EVALUATING: SAM Baseline + Window Soft-Argmax
Loading SPair-71k test annotations...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12234/12234 [00:05<00:00, 2345.54it/s]
SAM Baseline: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1530/1530 [37:50<00:00,  1.48s/it]



EVALUATION SUMMARY

üìä Overall PCK:
----------------------------------------------------------------------
  PCK@0.05: 13.02%
  PCK@0.10: 23.23%
  PCK@0.15: 30.51%
  PCK@0.20: 36.85%

üìÅ Per-Category PCK:
----------------------------------------------------------------------
  aeroplane       ‚Üí 0.05: 16.01% | 0.10: 25.14% | 0.15: 32.08% | 0.20: 39.40%
  bicycle         ‚Üí 0.05: 10.68% | 0.10: 19.24% | 0.15: 25.50% | 0.20: 30.83%
  bird            ‚Üí 0.05: 17.50% | 0.10: 30.80% | 0.15: 38.11% | 0.20: 44.92%
  boat            ‚Üí 0.05: 7.62% | 0.10: 17.66% | 0.15: 24.81% | 0.20: 30.92%
  bottle          ‚Üí 0.05: 14.11% | 0.10: 24.45% | 0.15: 31.49% | 0.20: 38.32%
  bus             ‚Üí 0.05: 11.86% | 0.10: 17.93% | 0.15: 22.82% | 0.20: 28.05%
  car             ‚Üí 0.05: 13.06% | 0.10: 21.19% | 0.15: 26.17% | 0.20: 30.73%
  cat             ‚Üí 0.05: 23.10% | 0.10: 36.01% | 0.15: 44.56% | 0.20: 51.08%
  chair           ‚Üí 0.05: 8.76% | 0.10: 15.64% | 0.15: 21.11% | 0.20: 26.31%
 

## 8. Evaluate Fine-Tuned Models with Window Soft-Argmax

### 8.1 DINOv2 Fine-Tuned

In [None]:
# Look up the configured checkpoint; '' (never an existing path) when the key is missing.
checkpoint_path = FINETUNED_CHECKPOINTS.get('dinov2_finetuned', '')

if os.path.exists(checkpoint_path):
    print("="*70)
    print("EVALUATING: DINOv2 Fine-Tuned + Window Soft-Argmax")
    print("="*70)

    # Initialize fresh extractor so the baseline extractor's weights stay untouched
    dinov2_ft_extractor = DINOv2Extractor(model_name="dinov2_vitb14")

    # Load fine-tuned weights (returns False, after printing a warning, if the file is missing)
    if load_finetuned_weights(dinov2_ft_extractor.model, checkpoint_path):
        dinov2_ft_matcher = WindowSoftArgmaxMatcher(
            feature_extractor=dinov2_ft_extractor,
            window_size=WINDOW_SIZE,
            temperature=TEMPERATURE
        )

        # Use DINOv2 loader (518x518)
        metrics_dinov2_ft = evaluate_model_simple(dinov2_ft_matcher, test_loader_dinov2, desc="DINOv2 Fine-Tuned")
        # The key contains 'FineTuned', which is how the baseline-vs-fine-tuned
        # plot tells the two groups apart.
        all_results['DINOv2_FineTuned_SoftArgmax'] = metrics_dinov2_ft
else:
    print(f"‚ö†Ô∏è Skipping DINOv2 Fine-Tuned - checkpoint not found: {checkpoint_path}")

### 8.2 DINOv3 Fine-Tuned

In [None]:
# Look up the configured checkpoint; '' (never an existing path) when the key is missing.
checkpoint_path = FINETUNED_CHECKPOINTS.get('dinov3_finetuned', '')

# NOTE(review): this cell relies on RUN_DINOV3 and test_loader_dinov3 from the
# DINOv3 baseline cell. The loader is only created there when the baseline
# branch actually ran, so skipping that branch leads to a NameError below.
if RUN_DINOV3 and os.path.exists(checkpoint_path):
    print("="*70)
    print("EVALUATING: DINOv3 Fine-Tuned + Window Soft-Argmax")
    print("="*70)

    # Initialize fresh extractor
    dinov3_ft_extractor = DINOv3Extractor(
        repo_dir=DINOV3_REPO_DIR,
        weights=DINOV3_WEIGHTS
    )

    # Load fine-tuned weights
    if load_finetuned_weights(dinov3_ft_extractor.model, checkpoint_path):
        dinov3_ft_matcher = WindowSoftArgmaxMatcher(
            feature_extractor=dinov3_ft_extractor,
            window_size=WINDOW_SIZE,
            temperature=TEMPERATURE
        )

        # Use DINOv3 loader (512x512)
        metrics_dinov3_ft = evaluate_model_simple(dinov3_ft_matcher, test_loader_dinov3, desc="DINOv3 Fine-Tuned")
        all_results['DINOv3_FineTuned_SoftArgmax'] = metrics_dinov3_ft
else:
    print(f"‚ö†Ô∏è Skipping DINOv3 Fine-Tuned - checkpoint not found or DINOv3 not configured")

### 8.3 SAM Fine-Tuned

In [None]:
# Look up the configured checkpoint; '' (never an existing path) when the key is missing.
checkpoint_path = FINETUNED_CHECKPOINTS.get('sam_finetuned', '')

# NOTE(review): this cell relies on RUN_SAM and test_loader_sam from the SAM
# baseline cell. The loader is only created there when the baseline branch
# actually ran, so skipping that branch leads to a NameError below.
if RUN_SAM and os.path.exists(checkpoint_path):
    print("="*70)
    print("EVALUATING: SAM Fine-Tuned + Window Soft-Argmax")
    print("="*70)

    # Initialize fresh extractor from the original SAM checkpoint
    sam_ft_extractor = SAMExtractor(
        model_type="vit_b",
        checkpoint_path=SAM_CHECKPOINT_PATH,
        image_size=IMAGE_SIZE_SAM
    )

    # Load fine-tuned weights
    if load_finetuned_weights(sam_ft_extractor.model, checkpoint_path):
        sam_ft_matcher = WindowSoftArgmaxMatcher(
            feature_extractor=sam_ft_extractor,
            window_size=WINDOW_SIZE,
            temperature=TEMPERATURE
        )

        # Reuse the SAM loader (512x512, half batch size)
        metrics_sam_ft = evaluate_model_simple(sam_ft_matcher, test_loader_sam, desc="SAM Fine-Tuned")
        all_results['SAM_FineTuned_SoftArgmax'] = metrics_sam_ft
else:
    print(f"‚ö†Ô∏è Skipping SAM Fine-Tuned - checkpoint not found or SAM not configured")

## 9. Results Summary & Visualization

In [None]:
# Print a plain-text table with one row per evaluated model and one column per
# PCK threshold, under a banner heading.
print("\n" + "="*70 + "\n" + "RESULTS SUMMARY" + "\n" + "="*70)

summary_df = create_summary_table(all_results)
print(summary_df.to_string(index=False))

In [None]:
# Save all results to a timestamped JSON file inside RESULTS_DIR.
# The directory is created here so the plotting cells that follow can write into
# it as well (save_results also creates it on its own).
os.makedirs(RESULTS_DIR, exist_ok=True)
metrics_path = save_results(all_results, RESULTS_DIR)

In [None]:
# Grouped bar chart of overall PCK for every evaluated model.
# Skipped when nothing was evaluated (the plot helper divides by the model count).
if all_results:
    plot_model_comparison(all_results, save_path=os.path.join(RESULTS_DIR, 'model_comparison.png'))

In [None]:
# Side-by-side PCK curves for baseline and fine-tuned models; only drawn once
# at least two result sets are available.
if len(all_results) >= 2:
    plot_baseline_vs_finetuned(all_results, save_path=os.path.join(RESULTS_DIR, 'baseline_vs_finetuned.png'))

In [None]:
# Per-category bars at PCK@0.10 for every evaluated model.
# Skipped when nothing was evaluated (the helper reads the first model's categories).
if all_results:
    plot_per_category_comparison(
        all_results,
        threshold=0.10,
        save_path=os.path.join(RESULTS_DIR, 'per_category_comparison.png'),
    )

## 10. Generate Individual Model Reports

In [None]:
# Generate a detailed report for each evaluated model, one sub-directory of
# RESULTS_DIR per model label.
for model_name, metrics in all_results.items():
    print(f"\n{'='*70}")
    print(f"Generating report for: {model_name}")
    print('='*70)

    # The report goes to RESULTS_DIR/<model_name>; what it contains is decided by
    # ResultsAnalyzer.generate_report (project code, not shown in this notebook).
    model_dir = os.path.join(RESULTS_DIR, model_name)
    analyzer = ResultsAnalyzer(metrics)
    analyzer.generate_report(save_dir=model_dir)

## 11. Conclusion

This notebook evaluated the Window Soft-Argmax approach for semantic correspondence on:
- Baseline models (DINOv2, DINOv3, SAM)
- Fine-tuned models (with loaded .pth checkpoints)

The Window Soft-Argmax approach provides:
1. **Sub-pixel precision** - More accurate keypoint localization
2. **Noise robustness** - Windowed softmax reduces impact of noise
3. **Better performance at strict thresholds** - Most visible at α=0.05 and α=0.10

Results are saved in the `./results/task3_evaluation/` directory.