# 08 - Evaluation on Test-Shift with Threshold-Only Adaptation

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IvanNece/Detection-of-Anomalies-with-Localization/blob/main/notebooks/08_evaluation_shifted_threshold_only.ipynb)

**Phase 6.2: Threshold-Only Adaptation**

This notebook implements an ablation study to isolate the contribution of **threshold recalibration** from full model adaptation.

**Experimental Setup:**
- Models: Trained on Clean data (NO retraining)
- Thresholds: RE-calibrated on Val-shift (F1-optimal)
- Test: Evaluated on Test-shift

**Goal:**
Measure how much performance can be recovered through threshold adaptation alone, without model retraining.

**Metrics computed:**
- Image-level: AUROC, AUPRC, F1, Accuracy, Precision, Recall
- Pixel-level: Pixel AUROC, PRO (Per-Region Overlap)

## 1. Setup & Configuration

In [None]:
# ============================================================
# SETUP - Mount Google Drive & Clone Repository
# ============================================================
# Colab-only bootstrap cell: mounts Drive, takes a fresh clone of the project,
# defines the path constants used by the later cells and makes the project
# importable. Lines starting with `!` are IPython shell escapes, so this cell
# only runs inside a notebook kernel.

from google.colab import drive
from pathlib import Path
import os
import sys

# Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Done!\n")

# Clone repository on main branch
print("Cloning repository (branch: main)...")
repo_dir = '/content/Detection-of-Anomalies-with-Localization'

# Remove a checkout left over from a previous run so the clone below starts
# from an empty directory
if os.path.exists(repo_dir):
    print("Removing existing repository...")
    !rm -rf {repo_dir}

# Clone the repository; no --branch is passed, so git checks out the remote's
# default branch
!git clone https://github.com/IvanNece/Detection-of-Anomalies-with-Localization.git {repo_dir}
print("Done!\n")

# Setup paths
PROJECT_ROOT = Path(repo_dir)

# Dataset location
DATASET_PATH = Path('/content/drive/MyDrive/mvtec_shifted')

# Project folder on Drive. The two *_MODELS_DIR folders are inputs: the later
# cells load the clean-trained PatchCore / PaDiM models from them.
DRIVE_ROOT = Path('/content/drive/MyDrive/anomaly_detection_project')
PATCHCORE_MODELS_DIR = DRIVE_ROOT / '04_patchcore_clean_outputs'
PADIM_MODELS_DIR = DRIVE_ROOT / '05_padim_clean_outputs'

# Output folders inside the cloned repository (local Colab disk, not Drive)
RESULTS_DIR = PROJECT_ROOT / 'outputs' / 'results'
THRESHOLDS_DIR = PROJECT_ROOT / 'outputs' / 'thresholds'
VIZ_DIR = PROJECT_ROOT / 'outputs' / 'visualizations' / 'shift_threshold_only'

RESULTS_DIR.mkdir(parents=True, exist_ok=True)
THRESHOLDS_DIR.mkdir(parents=True, exist_ok=True)
VIZ_DIR.mkdir(parents=True, exist_ok=True)

# Verify dataset exists (fail early, before any model is loaded)
if not DATASET_PATH.exists():
    raise FileNotFoundError(
        f"Dataset not found at {DATASET_PATH}\n"
        f"Please ensure mvtec_shifted folder is in your Google Drive root."
    )

# Add project root to Python path so the `src.*` imports of the next cells resolve
sys.path.insert(0, str(PROJECT_ROOT))

print("\n" + "="*70)
print("SETUP COMPLETE")
print("="*70)
print(f"Project:   {PROJECT_ROOT}")
print(f"Dataset:   {DATASET_PATH}")
print(f"PatchCore: {PATCHCORE_MODELS_DIR}")
print(f"PaDiM:     {PADIM_MODELS_DIR}")
print(f"Results:   {RESULTS_DIR}")
print(f"Viz:       {VIZ_DIR}")
print("="*70)

Install both `faiss` and `anomalib` libraries, required for running PatchCore and PaDiM models respectively. **Must be done before any imports.**

In [None]:
# IPython shell escapes: install the packages the model wrappers depend on.
# Run this cell before the import cell that follows.
!pip install faiss-cpu --quiet
!pip install anomalib --quiet

Import necessary libraries and modules.

In [None]:
# Standard imports
import json
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# Project imports
from src.utils.reproducibility import set_seed
from src.utils.config import load_config
from src.utils.paths import ProjectPaths
from src.data.splitter import load_splits
from src.data.dataset import MVTecDataset
from src.data.transforms import get_clean_transforms
from src.models.patchcore import PatchCore
from src.models.padim_wrapper import PadimWrapper

# Metrics imports
from src.metrics import (
    calibrate_threshold,
    ThresholdCalibrator,
    compute_image_metrics,
    compute_pixel_metrics,
    compute_roc_curve,
    compute_pr_curve,
    compute_confusion_matrix,
    aggregate_metrics,
    aggregate_pixel_metrics
)

# Set random seed for reproducibility
set_seed(42)

# Load configuration (read below via attribute access, e.g. config.dataset.classes)
config = load_config(PROJECT_ROOT / 'configs' / 'experiment_config.yaml')
# Project path helper; used to locate the split file of the shifted domain
paths = ProjectPaths(PROJECT_ROOT)

# Classes to evaluate
CLASSES = config.dataset.classes  # ['hazelnut', 'carpet', 'zipper']

# Device: use the GPU when one is visible to torch, otherwise fall back to CPU
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


def evaluation_collate(batch):
    """Collate dataset samples into a batch while leaving the masks unstacked.

    Each sample is indexed positionally as (image, mask, label, path). Images
    are stacked into one tensor and labels become a tensor; masks and paths
    are returned as plain tuples because a mask may be ``None`` and therefore
    cannot go through ``torch.stack``.
    """
    # Transpose the list of samples into one tuple per field.
    fields = list(zip(*batch))
    return (
        torch.stack(fields[0]),
        fields[1],
        torch.tensor(fields[2]),
        fields[3],
    )

# Report the device and class list selected above
print(f"Using device: {DEVICE}")
print(f"Classes: {CLASSES}")

## 2. Load Shifted Data Splits

In [None]:
# Read the split definitions for the shifted domain
SPLITS_PATH = paths.get_split_path('shifted')
splits = load_splits(SPLITS_PATH)

# Per-class overview: size of val/test and how many labels are 0 (reported as
# normal) versus 1 (reported as anomalous)
print("\nSplit Statistics:")
print("-" * 50)
for class_name in CLASSES:
    class_splits = splits[class_name]

    # Gather all counts first, then print, so nothing is printed for a class
    # whose split entry is incomplete.
    sizes = {key: len(class_splits[key]['images']) for key in ('val', 'test')}
    counts = {}
    for key in ('val', 'test'):
        split_labels = class_splits[key]['labels']
        counts[key] = (
            sum(1 for label in split_labels if label == 0),
            sum(1 for label in split_labels if label == 1),
        )

    print(f"{class_name}:")
    for key, heading in (('val', 'Val'), ('test', 'Test')):
        n_normal, n_anomalous = counts[key]
        print(f"  {heading}: {sizes[key]} ({n_normal} normal, {n_anomalous} anomalous)")

## 3. Threshold Calibration on Val-Shift

**CRITICAL STEP:** We recalibrate thresholds using predictions from **clean-trained models** on **Val-shift** data.

This isolates the effect of threshold adaptation from model adaptation.

In [None]:
# Initialize threshold calibrators: one per method; each collects the
# per-class thresholds computed in the calibration cells that follow
patchcore_calibrator = ThresholdCalibrator('patchcore')
padim_calibrator = ThresholdCalibrator('padim')

# Get transforms: the same transform object is reused for every dataset built
# in this notebook (validation and test)
transform = get_clean_transforms(image_size=config.dataset.image_size)

print("=" * 60)
print("THRESHOLD CALIBRATION (F1-Optimal on Val-shift)")
print("=" * 60)

In [None]:
# PATCHCORE: Calibrate thresholds on Val-shift
# For every class, the clean-trained PatchCore model (no retraining) scores the
# Val-shift images and the calibrator derives a new threshold from those scores.
print("\n>>> PATCHCORE <<<\n")

for class_name in CLASSES:
    print(f"\n--- {class_name.upper()} ---")

    # Create validation dataset
    val_split = splits[class_name]['val']
    val_dataset = MVTecDataset.from_split(
        val_split,
        transform=transform,
        phase='val'
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=8,
        shuffle=False,
        num_workers=0,
        collate_fn=evaluation_collate
    )

    # Load model trained on CLEAN data
    model = PatchCore(
        backbone_layers=config.patchcore.layers,
        patch_size=config.patchcore.patch_size,
        coreset_ratio=config.patchcore.coreset_sampling_ratio,
        n_neighbors=config.patchcore.n_neighbors,
        device=DEVICE
    )
    model.load(PATCHCORE_MODELS_DIR, class_name, domain='clean')

    # Collect predictions on VAL-SHIFT
    val_scores = []
    val_labels = []

    with torch.no_grad():
        # Masks and file paths are not needed for calibration. They are bound
        # to `_` so this loop does not overwrite the notebook-level `paths`
        # (ProjectPaths) object that the split-loading cell calls.
        for images, _, labels, _ in tqdm(val_loader, desc=f'Validating {class_name}'):
            images = images.to(DEVICE)
            scores, _ = model.predict(images, return_heatmaps=False)

            val_scores.extend(scores.tolist())
            val_labels.extend(labels.tolist())

    # Convert to arrays
    val_scores = np.array(val_scores)
    val_labels = np.array(val_labels)

    # Calibrate threshold on Val-shift
    threshold = patchcore_calibrator.calibrate(class_name, val_scores, val_labels)
    print(f"  Calibrated threshold: {threshold:.4f}")

# Save calibrated thresholds
patchcore_calibrator.save(THRESHOLDS_DIR / 'shift_threshold_only_patchcore.json')
print("\n[OK] Thresholds saved: shift_threshold_only_patchcore.json")

In [None]:
# PADIM: Calibrate thresholds on Val-shift
# For every class, the clean-trained PaDiM model (no retraining) scores the
# Val-shift images and the calibrator derives a new threshold from those scores.
print("\n>>> PADIM <<<\n")

for class_name in CLASSES:
    print(f"\n--- {class_name.upper()} ---")

    # Create validation dataset
    val_split = splits[class_name]['val']
    val_dataset = MVTecDataset.from_split(
        val_split,
        transform=transform,
        phase='val'
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=8,
        shuffle=False,
        num_workers=0,
        collate_fn=evaluation_collate
    )

    # Load model trained on CLEAN data
    model = PadimWrapper(
        backbone=config.padim.backbone,
        layers=config.padim.layers,
        n_features=config.padim.n_features,
        image_size=config.dataset.image_size,
        device=DEVICE
    )
    model.load(PADIM_MODELS_DIR / f'padim_{class_name}_clean.pt')

    # Collect predictions on VAL-SHIFT
    val_scores = []
    val_labels = []

    with torch.no_grad():
        # Masks and file paths are not needed for calibration. They are bound
        # to `_` so this loop does not overwrite the notebook-level `paths`
        # (ProjectPaths) object that the split-loading cell calls.
        for images, _, labels, _ in tqdm(val_loader, desc=f'Validating {class_name}'):
            images = images.to(DEVICE)
            scores, _ = model.predict(images, return_heatmaps=False)

            val_scores.extend(scores.tolist())
            val_labels.extend(labels.tolist())

    # Convert to arrays
    val_scores = np.array(val_scores)
    val_labels = np.array(val_labels)

    # Calibrate threshold on Val-shift
    threshold = padim_calibrator.calibrate(class_name, val_scores, val_labels)
    print(f"  Calibrated threshold: {threshold:.4f}")

# Save calibrated thresholds
padim_calibrator.save(THRESHOLDS_DIR / 'shift_threshold_only_padim.json')
print("\n[OK] Thresholds saved: shift_threshold_only_padim.json")

In [None]:
# ============================================================
# VISUALIZATION: Score Distributions and Recalibrated Thresholds (Val-Shift)
# ============================================================
# The calibration cells overwrite their score/label lists on every class, so
# this cell repeats inference on Val-shift for each class and model instead of
# relying on leftover loop variables. Thresholds are read directly from the
# calibrators, so the cell works when the notebook is run top to bottom (the
# combined `thresholds` dict is only created in the evaluation section).

import matplotlib.pyplot as plt
import seaborn as sns


def _collect_val_scores(model, loader):
    """Run *model* over *loader*; return (scores, labels) as NumPy arrays."""
    scores, labels = [], []
    with torch.no_grad():
        for imgs, _, lbls, _ in loader:
            batch_scores, _ = model.predict(imgs.to(DEVICE), return_heatmaps=False)
            # .tolist() matches how the calibration/evaluation cells collect scores
            scores.extend(batch_scores.tolist())
            labels.extend(lbls.tolist())
    return np.array(scores), np.array(labels)


def _plot_score_histogram(ax, scores, labels, threshold, title):
    """Draw label-0 / label-1 score histograms plus the threshold line on *ax*."""
    sns.histplot(scores[labels == 0], color='blue', label='Normal', kde=True, ax=ax, alpha=0.5)
    sns.histplot(scores[labels == 1], color='red', label='Anomalous', kde=True, ax=ax, alpha=0.5)
    ax.axvline(threshold, color='green', linestyle='--', linewidth=2,
               label=f'New Threshold: {threshold:.2f}')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Anomaly Score')
    ax.legend()


# squeeze=False always yields a 2-D axes array, also for a single class
fig, axes = plt.subplots(len(CLASSES), 2, figsize=(15, 5 * len(CLASSES)), squeeze=False)

print("Generating Score Distribution & Threshold Plots (Val-Shift)...")

for i, class_name in enumerate(CLASSES):
    # Val-shift data, shared by both models of this class
    val_dataset = MVTecDataset.from_split(splits[class_name]['val'], transform=transform, phase='val')
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=0,
                            collate_fn=evaluation_collate)

    # 1. PatchCore (left column): clean-trained model
    model_pc = PatchCore(
        backbone_layers=config.patchcore.layers,
        patch_size=config.patchcore.patch_size,
        coreset_ratio=config.patchcore.coreset_sampling_ratio,
        n_neighbors=config.patchcore.n_neighbors,
        device=DEVICE
    )
    model_pc.load(PATCHCORE_MODELS_DIR, class_name, domain='clean')

    pc_scores, pc_labels = _collect_val_scores(model_pc, val_loader)
    pc_thresh = patchcore_calibrator.thresholds[class_name]
    _plot_score_histogram(axes[i, 0], pc_scores, pc_labels, pc_thresh,
                          f'PatchCore (Val-Shift): {class_name}')

    # 2. PaDiM (right column): clean-trained model
    model_pd = PadimWrapper(
        backbone=config.padim.backbone,
        layers=config.padim.layers,
        n_features=config.padim.n_features,
        image_size=config.dataset.image_size,
        device=DEVICE
    )
    model_pd.load(PADIM_MODELS_DIR / f'padim_{class_name}_clean.pt')

    pd_scores, pd_labels = _collect_val_scores(model_pd, val_loader)
    pd_thresh = padim_calibrator.thresholds[class_name]
    _plot_score_histogram(axes[i, 1], pd_scores, pd_labels, pd_thresh,
                          f'PaDiM (Val-Shift): {class_name}')

plt.suptitle('Score Distributions & Recalibrated Thresholds on Val-Shift\n(Clean Models)',
             fontsize=16, y=1.02)
plt.tight_layout()
plt.savefig(VIZ_DIR / 'shifted_score_distributions_recalibrated.png', dpi=150, bbox_inches='tight')
plt.show()
print(f"[OK] Plot saved to {VIZ_DIR / 'shifted_score_distributions_recalibrated.png'}")

## 4. Evaluation on Test-Shift with Recalibrated Thresholds

Now we evaluate on the test set using the **recalibrated thresholds from Val-shift**.

In [None]:
# Gather the recalibrated thresholds from the in-memory calibrators (nothing is
# read from disk here); the evaluation cells index this as thresholds[method][class]
thresholds = {
    'patchcore': patchcore_calibrator.thresholds,
    'padim': padim_calibrator.thresholds
}

# Storage for results: filled per method and class by the evaluation cells
all_results = {
    'patchcore': {},
    'padim': {}
}

In [None]:
# PATCHCORE: Evaluate on Test-shift
# Clean-trained PatchCore models are scored on Test-shift; image-level
# decisions use the threshold recalibrated on Val-shift.
print("\n" + "="*60)
print("TEST-SHIFT EVALUATION (PATCHCORE)")
print("="*60)

for class_name in CLASSES:
    print(f"\n--- {class_name.upper()} ---")

    # Create test dataset
    test_split = splits[class_name]['test']
    test_dataset = MVTecDataset.from_split(
        test_split,
        transform=transform,
        phase='test'
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=8,
        shuffle=False,
        num_workers=0,
        collate_fn=evaluation_collate
    )

    # Load model trained on CLEAN data
    model = PatchCore(
        backbone_layers=config.patchcore.layers,
        patch_size=config.patchcore.patch_size,
        coreset_ratio=config.patchcore.coreset_sampling_ratio,
        n_neighbors=config.patchcore.n_neighbors,
        device=DEVICE
    )
    model.load(PATCHCORE_MODELS_DIR, class_name, domain='clean')

    # Collect predictions
    all_scores = []
    all_labels = []
    all_heatmaps = []
    all_masks = []

    with torch.no_grad():
        # File paths are not needed here; binding them to `_` keeps the
        # notebook-level `paths` (ProjectPaths) object intact.
        for images, masks, labels, _ in tqdm(test_loader, desc=f'Testing {class_name}'):
            images = images.to(DEVICE)
            scores, heatmaps = model.predict(images, return_heatmaps=True)

            all_scores.extend(scores.tolist())
            all_labels.extend(labels.tolist())

            # A mask may be None; keep the placeholder so masks and heatmaps
            # stay aligned index by index.
            all_masks.extend(
                mask.numpy().squeeze() if mask is not None else None
                for mask in masks
            )

            # Convert heatmaps to cpu list
            all_heatmaps.extend(h.cpu().numpy() for h in heatmaps)

    # Convert to arrays
    test_scores = np.array(all_scores)
    test_labels = np.array(all_labels)

    # Get recalibrated threshold
    threshold = thresholds['patchcore'][class_name]

    # Compute image-level metrics
    image_metrics = compute_image_metrics(test_labels, test_scores, threshold=threshold)

    # Compute pixel-level metrics
    pixel_metrics = compute_pixel_metrics(all_masks, all_heatmaps, compute_pro_metric=True)

    # Store results
    all_results['patchcore'][class_name] = {
        'threshold': threshold,
        'image_level': image_metrics,
        'pixel_level': pixel_metrics,
        'test_scores': test_scores.tolist(),
        'test_labels': test_labels.tolist()
    }

    # Print results
    print(f"  Threshold (recalibrated on val-shift): {threshold:.4f}")
    print(f"  AUROC: {image_metrics['auroc']:.4f}")
    print(f"  AUPRC: {image_metrics['auprc']:.4f}")
    print(f"  F1: {image_metrics['f1']:.4f}")
    # Format only when a value exists: applying ':.4f' to the 'N/A' fallback
    # string would raise ValueError.
    accuracy = image_metrics.get('accuracy')
    accuracy_text = f"{accuracy:.4f}" if accuracy is not None else "N/A"
    print(f"  Accuracy: {accuracy_text}")
    print(f"  Pixel AUROC: {pixel_metrics.get('pixel_auroc', 'N/A')}")
    print(f"  PRO: {pixel_metrics.get('pro', 'N/A')}")

In [None]:
# PADIM: Evaluate on Test-shift
# Clean-trained PaDiM models are scored on Test-shift; image-level decisions
# use the threshold recalibrated on Val-shift.
print("\n" + "="*60)
print("TEST-SHIFT EVALUATION (PADIM)")
print("="*60)

for class_name in CLASSES:
    print(f"\n--- {class_name.upper()} ---")

    # Create test dataset
    test_split = splits[class_name]['test']
    test_dataset = MVTecDataset.from_split(
        test_split,
        transform=transform,
        phase='test'
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=8,
        shuffle=False,
        num_workers=0,
        collate_fn=evaluation_collate
    )

    # Load model trained on CLEAN data
    model = PadimWrapper(
        backbone=config.padim.backbone,
        layers=config.padim.layers,
        n_features=config.padim.n_features,
        image_size=config.dataset.image_size,
        device=DEVICE
    )
    model.load(PADIM_MODELS_DIR / f'padim_{class_name}_clean.pt')

    # Collect predictions
    all_scores = []
    all_labels = []
    all_heatmaps = []
    all_masks = []

    with torch.no_grad():
        # File paths are not needed here; binding them to `_` keeps the
        # notebook-level `paths` (ProjectPaths) object intact.
        for images, masks, labels, _ in tqdm(test_loader, desc=f'Testing {class_name}'):
            images = images.to(DEVICE)
            scores, heatmaps = model.predict(images, return_heatmaps=True)

            all_scores.extend(scores.tolist())
            all_labels.extend(labels.tolist())

            # A mask may be None; keep the placeholder so masks and heatmaps
            # stay aligned index by index.
            all_masks.extend(
                mask.numpy().squeeze() if mask is not None else None
                for mask in masks
            )

            # Convert heatmaps to cpu list
            all_heatmaps.extend(h.cpu().numpy() for h in heatmaps)

    # Convert to arrays
    test_scores = np.array(all_scores)
    test_labels = np.array(all_labels)

    # Get recalibrated threshold
    threshold = thresholds['padim'][class_name]

    # Compute image-level metrics
    image_metrics = compute_image_metrics(test_labels, test_scores, threshold=threshold)

    # Compute pixel-level metrics
    pixel_metrics = compute_pixel_metrics(all_masks, all_heatmaps, compute_pro_metric=True)

    # Store results
    all_results['padim'][class_name] = {
        'threshold': threshold,
        'image_level': image_metrics,
        'pixel_level': pixel_metrics,
        'test_scores': test_scores.tolist(),
        'test_labels': test_labels.tolist()
    }

    # Print results
    print(f"  Threshold (recalibrated on val-shift): {threshold:.4f}")
    print(f"  AUROC: {image_metrics['auroc']:.4f}")
    print(f"  AUPRC: {image_metrics['auprc']:.4f}")
    print(f"  F1: {image_metrics['f1']:.4f}")
    # Format only when a value exists: applying ':.4f' to the 'N/A' fallback
    # string would raise ValueError.
    accuracy = image_metrics.get('accuracy')
    accuracy_text = f"{accuracy:.4f}" if accuracy is not None else "N/A"
    print(f"  Accuracy: {accuracy_text}")
    print(f"  Pixel AUROC: {pixel_metrics.get('pixel_auroc', 'N/A')}")
    print(f"  PRO: {pixel_metrics.get('pro', 'N/A')}")

In [None]:
# Macro-average the per-class results of each method
aggregated_results = {
    method: aggregate_metrics(all_results[method])
    for method in ('patchcore', 'padim')
}

banner = "=" * 60
print("\n" + banner)
print("AGGREGATED RESULTS (MACRO-AVERAGE)")
print(banner)

# One section per method, in the same order as the dict above
for method, method_metrics in aggregated_results.items():
    print(f"\n{method.upper()}:")
    for metric, value in method_metrics.items():
        print(f"  {metric}: {value:.4f}")

In [None]:
# Save full results to JSON
results_path = RESULTS_DIR / 'shifted_threshold_only_results.json'


def _to_json_compatible(obj):
    """``json.dump`` fallback: convert NumPy arrays and scalars to built-ins.

    Raises:
        TypeError: for any other type. A ``default`` hook that hands the
            object back unchanged makes the encoder fail with a misleading
            "Circular reference detected" error instead.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # NumPy scalars such as np.float32 / np.int64
        return obj.item()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(all_results, f, indent=2, default=_to_json_compatible)

print(f"\n[OK] Results saved to {results_path}")

In [None]:
# Build the summary table (one row per class plus a macro-average row per
# method) and write it as CSV
import pandas as pd


def _format_metric_columns(image_level, pixel_level):
    """Return the six metric columns of a summary row as 4-decimal strings."""
    return {
        'AUROC': f"{image_level['auroc']:.4f}",
        'AUPRC': f"{image_level['auprc']:.4f}",
        'F1': f"{image_level['f1']:.4f}",
        'Accuracy': f"{image_level['accuracy']:.4f}",
        'Pixel AUROC': f"{pixel_level['pixel_auroc']:.4f}",
        'PRO': f"{pixel_level['pro']:.4f}",
    }


summary_rows = []

for method, results in all_results.items():
    method_label = method.upper()

    # Per-class rows
    for class_name, metrics in results.items():
        summary_rows.append({
            'Method': method_label,
            'Class': class_name.capitalize(),
            'Threshold': f"{metrics['threshold']:.2f}",
            **_format_metric_columns(metrics['image_level'], metrics['pixel_level']),
        })

    # Macro-average row: every metric is read from the same aggregated dict,
    # and no single threshold applies
    macro_avg = aggregated_results[method]
    summary_rows.append({
        'Method': method_label,
        'Class': 'Macro_average',
        'Threshold': '-',
        **_format_metric_columns(macro_avg, macro_avg),
    })

df_summary = pd.DataFrame(summary_rows)
csv_path = RESULTS_DIR / 'shifted_threshold_only_summary.csv'
df_summary.to_csv(csv_path, index=False)

print("\nSUMMARY TABLE:")
print(df_summary)
print(f"\n[OK] Summary CSV saved to {csv_path}")

In [None]:
# Compare threshold-only adaptation against the no-adaptation run.
# The plots are produced only when the no-adaptation results file exists.

no_adapt_path = RESULTS_DIR / 'shifted_no_adaptation_results.json'
if not no_adapt_path.exists():
    print("\n[INFO] No adaptation results not found, skipping comparison plots.")
else:
    with open(no_adapt_path, 'r') as f:
        no_adapt_results = json.load(f)

    def plot_adaptation_comparison(no_adapt, adapt, metric='auroc', title='AUROC Comparison'):
        """Plot grouped bars of one image-level metric, per class plus average.

        Args:
            no_adapt: results dict of the no-adaptation run.
            adapt: results dict of the threshold-adaptation run.
            metric: key looked up under ``[method][class]['image_level']``.
            title: figure title.

        The figure is saved to ``VIZ_DIR / f'comparison_{metric}.png'`` and shown.
        """
        def per_class_with_mean(results, method):
            # Per-class values in CLASSES order, with their mean appended last
            values = [results[method][c]['image_level'][metric] for c in CLASSES]
            values.append(np.mean(values))
            return values

        tick_labels = [c.capitalize() for c in CLASSES] + ['Average']

        na_patchcore = per_class_with_mean(no_adapt, 'patchcore')
        na_padim = per_class_with_mean(no_adapt, 'padim')
        a_patchcore = per_class_with_mean(adapt, 'patchcore')
        a_padim = per_class_with_mean(adapt, 'padim')

        x = np.arange(len(tick_labels))
        width = 0.2

        fig, ax = plt.subplots(figsize=(10, 6))

        # (offset in bar widths, values, legend label, colour, extra bar kwargs);
        # the no-adaptation bars are drawn semi-transparent
        bar_specs = (
            (-1.5, na_patchcore, 'PatchCore (No Adapt)', '#1f77b4', {'alpha': 0.7}),
            (-0.5, a_patchcore, 'PatchCore (Thresh Adapt)', '#1f77b4', {}),
            (0.5, na_padim, 'PaDiM (No Adapt)', '#ff7f0e', {'alpha': 0.7}),
            (1.5, a_padim, 'PaDiM (Thresh Adapt)', '#ff7f0e', {}),
        )
        for offset, values, bar_label, colour, extra in bar_specs:
            ax.bar(x + offset*width, values, width, label=bar_label, color=colour, **extra)

        ax.set_ylabel(metric.upper())
        ax.set_title(title)
        ax.set_xticks(x)
        ax.set_xticklabels(tick_labels)
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        ax.grid(True, linestyle='--', alpha=0.6)
        ax.set_ylim(0, 1.05)

        plt.tight_layout()
        plt.savefig(VIZ_DIR / f'comparison_{metric}.png')
        plt.show()

    for metric_key, plot_title in (
        ('auroc', 'Image AUROC: Threshold Adaptation vs No Adaptation'),
        ('f1', 'F1 Score: Threshold Adaptation vs No Adaptation'),
    ):
        plot_adaptation_comparison(no_adapt_results, all_results, metric=metric_key, title=plot_title)

    print(f"\n[OK] Comparison plots saved to {VIZ_DIR}")

---
## Files Generated

This notebook generates the following files, representing the results of the **Threshold-Only Adaptation Ablation Study**:

**Thresholds** (`outputs/thresholds/`):
- `shift_threshold_only_patchcore.json` - Thresholds recalibrated on *Val-shift* for PatchCore
- `shift_threshold_only_padim.json` - Thresholds recalibrated on *Val-shift* for PaDiM

**Results** (`outputs/results/`):
- `shifted_threshold_only_results.json` - Detailed metrics (AUROC, F1, Pixel-AUROC, etc.) for each class
- `shifted_threshold_only_summary.csv` - Summary table of performance metrics (easy to read/plot)

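**Visualizations** (`outputs/visualizations/shift_threshold_only/`):
- `shifted_score_distributions_recalibrated.png` - Val-shift score histograms (normal vs. anomalous) with the recalibrated threshold, for each class and model
- `comparison_auroc.png` - Bar chart comparing *No-Adaptation* vs *Threshold-Adaptation* AUROC (only generated when `shifted_no_adaptation_results.json` is available)
- `comparison_f1.png` - Bar chart comparing *No-Adaptation* vs *Threshold-Adaptation* F1 Scores (only generated when `shifted_no_adaptation_results.json` is available)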

These files quantify the performance gain achievable purely by recalibrating the decision boundary on the target domain, serving as a baseline for the full model adaptation in Phase 7.

In [None]:
# ============================================================
# BACKUP RESULTS TO DRIVE
# ============================================================
# Copies the result files, the threshold files of this phase and a zip of the
# visualization folder into one phase-specific folder on Drive.

import shutil
import os

# Where the files come from
SOURCE_RESULTS = RESULTS_DIR
SOURCE_THRESHOLDS = THRESHOLDS_DIR
SOURCE_VIZ = VIZ_DIR

# Where they go: one Drive folder dedicated to this phase
DRIVE_OUTPUT_DIR = DRIVE_ROOT / '08_evaluation_shifted_threshold_only_outputs'

# Create the destination on first use only
if not os.path.exists(DRIVE_OUTPUT_DIR):
    os.makedirs(DRIVE_OUTPUT_DIR)
    print(f"Created Drive directory: {DRIVE_OUTPUT_DIR}")

separator = "-" * 60
print(f"\nBacking up results to: {DRIVE_OUTPUT_DIR}")
print(separator)


def _copy_matching_files(source_dir, pattern):
    """Copy every regular file in *source_dir* matching *pattern* to Drive."""
    for candidate in source_dir.glob(pattern):
        if not candidate.is_file():
            continue
        shutil.copy2(candidate, DRIVE_OUTPUT_DIR / candidate.name)
        print(f"  ✓ Copied: {candidate.name}")


# 1. Result JSON/CSV files (anything with an extension in the results folder)
print("Copying results files...")
_copy_matching_files(SOURCE_RESULTS, '*.*')

# 2. Threshold files written by this notebook
print("\nCopying threshold files...")
_copy_matching_files(SOURCE_THRESHOLDS, '*shift_threshold_only*.json')

# 3. Visualizations: archive the folder first so a single file is copied
print("\nArchiving and copying visualizations...")
viz_archive_path = shutil.make_archive(PROJECT_ROOT / 'shifted_threshold_viz', 'zip', SOURCE_VIZ)
shutil.copy2(viz_archive_path, DRIVE_OUTPUT_DIR / 'visualizations.zip')
print("  ✓ Copied: visualizations.zip")

print(separator)
print("Backup Complete!")