# VLM Validation Pipeline: Boundary Detection on RefCOCO

Full validation on 99 RefCOCO samples using optimized thresholds from notebook 02.

**Prerequisites:**
- Completed notebook 02 (threshold optimization)
- threshold_optimization_results.npz with learned thresholds
- Qwen3-VL via Ollama
- validation_subset_indices.npy (99 samples)

**Pipeline:**
1. Load validation subset (99 samples)
2. Load optimized thresholds from notebook 02
3. Run extensive perturbation grid (10×10 = 100 perturbations per sample)
4. Detect boundary samples using learned thresholds
5. Analyze geometric vs semantic drift correlation
6. Generate final validation metrics

**Expected runtime:** ~8-10 hours (99 samples × 101 images (1 original + 100 perturbed) = 9,999 VLM inferences, plus one embedding call per VLM inference)

In [1]:
# Fix imports after reorganization
import sys
sys.path.insert(0, 'scripts')

import archive.scripts.ollama_proxy as ollama
from PIL import Image, ImageEnhance, ImageFilter
import numpy as np
import io
import base64
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr
from datasets import load_dataset

# Import our refcoco loader
from archive.scripts import (
    load_refcoco,
    get_sample_info,
    compute_bbox_difficulty
)

# Use seaborn's 'whitegrid' theme for the plots produced in this notebook.
sns.set_style('whitegrid')
# Seed NumPy's global RNG so np.random-based steps (e.g. the Gaussian noise
# perturbation) are repeatable across runs.
np.random.seed(42)

print("Imports loaded successfully")

Imports loaded successfully


In [2]:
import json
import time as time_module
from collections import deque
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, List


@dataclass
class TimingStats:
    """Per-category lists of measured durations (seconds).

    Each field is one timing category; the methods below address a category
    by its field name.
    """
    vlm_predict: List[float] = field(default_factory=list)
    embedding: List[float] = field(default_factory=list)
    preprocess: List[float] = field(default_factory=list)
    sample_total: List[float] = field(default_factory=list)

    def add(self, category: str, duration: float):
        """Append one duration to the list stored under ``category``."""
        bucket = getattr(self, category)
        bucket.append(duration)

    def mean(self, category: str) -> float:
        """Return the average duration for ``category`` (0.0 when empty)."""
        samples = getattr(self, category)
        if not samples:
            return 0.0
        return sum(samples) / len(samples)

    def total(self, category: str) -> float:
        """Return the summed duration recorded for ``category``."""
        samples = getattr(self, category)
        return sum(samples)


class ExperimentTracker:
    """
    Silent experiment tracking with structured logging.

    Features:
    - Structured JSON logs per sample
    - Detailed timing breakdown (VLM inference, embeddings, preprocessing)
    - Rolling throughput calculation
    - Checkpoint/resume support with timing preservation
    - Silent mode: logs to files only, no console output

    Files are written under ``runs/<experiment_name>/<run_id>/``; a second
    copy of each checkpoint goes to ``LEGACY_CHECKPOINT``.
    """

    # Location of the legacy checkpoint copy written by checkpoint().
    LEGACY_CHECKPOINT = Path('data/validation_results_checkpoint.npz')

    def __init__(self, experiment_name: str, config: dict, resume_from: Optional[str] = None, silent: bool = True):
        """
        Initialize experiment tracker.

        Args:
            experiment_name: Name for the experiment (e.g., 'validation_03')
            config: Experiment configuration dict
            resume_from: Optional run_id to resume from
            silent: If True, suppress all console output
        """
        self.silent = silent

        if resume_from:
            self.run_id = resume_from
        else:
            self.run_id = datetime.now().strftime('%Y%m%d_%H%M%S')

        self.experiment_name = experiment_name
        self.log_dir = Path(f'runs/{experiment_name}/{self.run_id}')
        self.log_dir.mkdir(parents=True, exist_ok=True)

        # Timing tracking
        self.timing = TimingStats()
        self.current_sample_start: Optional[float] = None
        self.current_sample_idx: int = 0
        self.current_timing: Dict[str, float] = {}
        # Set by start_sample(); None until the first sample begins.
        self.total_samples: Optional[int] = None

        # Throughput tracking (rolling windows)
        self.inference_timestamps: deque = deque(maxlen=1000)
        self.session_start = time_module.time()
        self.total_inferences = 0

        # Config and metadata
        self.config = config
        self._save_config(config, is_resume=resume_from is not None)

        # Log file handles
        self.metrics_file = self.log_dir / 'metrics.jsonl'

        if not self.silent:
            print(f"Experiment Tracker: {experiment_name} (Run: {self.run_id})")
            print(f"  Log dir: {self.log_dir}")

    def _save_config(self, config: dict, is_resume: bool = False):
        """Write run metadata plus ``config`` to ``config.json`` in the log dir."""
        config_path = self.log_dir / 'config.json'

        metadata = {
            'run_id': self.run_id,
            'experiment_name': self.experiment_name,
            'started_at': datetime.now().isoformat(),
            'is_resume': is_resume,
            'config': config
        }

        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)

    def start_sample(self, sample_idx: int, total_samples: int, expression: str = ""):
        """
        Mark start of a sample and reset the per-sample timing breakdown.

        Args:
            sample_idx: Zero-based index of the sample being processed
            total_samples: Total number of samples in the experiment (used for ETA)
            expression: Accepted for caller convenience; not used by the tracker
        """
        self.current_sample_start = time_module.time()
        self.current_sample_idx = sample_idx
        self.current_timing = {}
        self.total_samples = total_samples

    @contextmanager
    def time_inference(self, category: str):
        """
        Context manager to time an inference operation.

        Categories: 'vlm_predict', 'embedding', 'preprocess'

        The duration is recorded even if the wrapped code raises.
        """
        start = time_module.time()
        try:
            yield
        finally:
            duration = time_module.time() - start
            self.timing.add(category, duration)

            # Accumulate for current sample
            self.current_timing[category] = self.current_timing.get(category, 0.0) + duration

            # Track inference timestamps for throughput
            if category in ('vlm_predict', 'embedding'):
                self.inference_timestamps.append(time_module.time())
                self.total_inferences += 1

    def end_sample(self, metrics: dict):
        """
        Mark end of a sample and append one JSON line to ``metrics.jsonl``.

        Args:
            metrics: Dict with sample metrics (iou_original, iou_perturbed_mean, etc.);
                must be JSON-serializable

        Raises:
            RuntimeError: If called before start_sample().
        """
        if self.current_sample_start is None:
            raise RuntimeError("end_sample() called before start_sample()")

        sample_duration = time_module.time() - self.current_sample_start
        self.timing.add('sample_total', sample_duration)

        # Build log entry
        log_entry = {
            'sample_idx': self.current_sample_idx,
            'timestamp': datetime.now().isoformat(),
            'timing': {
                'sample_total_sec': round(sample_duration, 3),
                'breakdown': {k: round(v, 3) for k, v in self.current_timing.items()}
            },
            'throughput': {
                'rolling_1min': round(self.get_throughput(window_sec=60), 2),
                'rolling_5min': round(self.get_throughput(window_sec=300), 2),
                'session_avg': round(self.get_throughput(window_sec=None), 2)
            },
            'metrics': metrics,
            'eta_sec': self._calculate_eta()
        }

        # Append to JSONL
        with open(self.metrics_file, 'a', encoding='utf-8') as f:
            f.write(json.dumps(log_entry) + '\n')

    def get_throughput(self, window_sec: Optional[float] = 60) -> float:
        """
        Calculate throughput (inferences per minute).

        Args:
            window_sec: Rolling window in seconds. None for session average.

        Returns:
            Inferences per minute; 0.0 when there is too little data to estimate.
        """
        now = time_module.time()

        if window_sec is None:
            # Session average
            elapsed = now - self.session_start
            return (self.total_inferences / elapsed) * 60 if elapsed > 0 else 0.0

        # Rolling window
        cutoff = now - window_sec
        recent = [t for t in self.inference_timestamps if t > cutoff]

        if len(recent) < 2:
            return 0.0

        # N completion timestamps bound N-1 intervals, so the first timestamp
        # only marks the start of the measured span.
        duration = recent[-1] - recent[0]
        return ((len(recent) - 1) / duration) * 60 if duration > 0 else 0.0

    def _calculate_eta(self) -> float:
        """
        Calculate estimated time remaining in seconds.

        Uses the mean of all recorded sample durations (including any restored
        from a checkpoint). Returns 0.0 until at least one sample duration and
        the total sample count are known.
        """
        total_samples = getattr(self, 'total_samples', None)
        if not total_samples or not self.timing.sample_total:
            return 0.0

        avg_sample_time = self.timing.mean('sample_total')
        remaining_samples = max(0, total_samples - (self.current_sample_idx + 1))
        return avg_sample_time * remaining_samples

    def checkpoint(self, results: dict, checkpoint_file: str = 'checkpoint.npz'):
        """
        Save checkpoint with results and timing data.

        Two files are written: the full checkpoint (results, timing arrays,
        run id, inference count) in the log dir, and a legacy copy at
        ``LEGACY_CHECKPOINT`` holding the results plus ``_run_id`` so that a
        reader of the legacy file can locate this run's log directory.

        Args:
            results: Experiment results dict
            checkpoint_file: Filename for checkpoint (saved in log_dir)
        """
        checkpoint_path = self.log_dir / checkpoint_file

        # Add timing data to checkpoint
        checkpoint_data = {
            **{k: np.array(v) if isinstance(v, list) else v for k, v in results.items()},
            '_timing_vlm_predict': np.array(self.timing.vlm_predict),
            '_timing_embedding': np.array(self.timing.embedding),
            '_timing_preprocess': np.array(self.timing.preprocess),
            '_timing_sample_total': np.array(self.timing.sample_total),
            '_run_id': self.run_id,
            '_total_inferences': self.total_inferences
        }

        np.savez(checkpoint_path, **checkpoint_data)

        # Also save to legacy location for compatibility. Internal '_' keys are
        # dropped except '_run_id', without which a resume cannot find the log
        # directory that holds the timing data.
        legacy_data = {k: v for k, v in checkpoint_data.items() if not k.startswith('_')}
        legacy_data['_run_id'] = self.run_id
        self.LEGACY_CHECKPOINT.parent.mkdir(parents=True, exist_ok=True)
        np.savez(str(self.LEGACY_CHECKPOINT), **legacy_data)

    def restore_timing(self, checkpoint_path: Path):
        """
        Restore timing data from checkpoint.

        Does nothing if the file is missing or carries no timing arrays.

        Args:
            checkpoint_path: Path to a checkpoint written by checkpoint()
        """
        checkpoint_path = Path(checkpoint_path)
        if not checkpoint_path.exists():
            return

        # Only numeric arrays are read here, so pickle support is not needed;
        # the context manager closes the underlying zip file.
        with np.load(checkpoint_path, allow_pickle=False) as data:
            if '_timing_vlm_predict' not in data:
                return

            self.timing.vlm_predict = data['_timing_vlm_predict'].tolist()
            self.timing.embedding = data['_timing_embedding'].tolist()
            self.timing.preprocess = data['_timing_preprocess'].tolist()
            self.timing.sample_total = data['_timing_sample_total'].tolist()
            if '_total_inferences' in data:
                self.total_inferences = int(data['_total_inferences'])
            else:
                self.total_inferences = 0

    def finalize(self):
        """
        Generate final summary on experiment completion.

        Writes ``run_summary.json`` to the log dir and returns the summary dict.
        """
        elapsed = time_module.time() - self.session_start

        summary = {
            'run_id': self.run_id,
            'experiment_name': self.experiment_name,
            'completed_at': datetime.now().isoformat(),
            'total_runtime_sec': round(elapsed, 2),
            'total_inferences': self.total_inferences,
            'timing_summary': {
                'mean_sample_sec': round(self.timing.mean('sample_total'), 3),
                'mean_vlm_predict_sec': round(self.timing.mean('vlm_predict'), 3),
                'mean_embedding_sec': round(self.timing.mean('embedding'), 3),
                'mean_preprocess_sec': round(self.timing.mean('preprocess'), 3),
                'total_vlm_sec': round(self.timing.total('vlm_predict'), 2),
                'total_embedding_sec': round(self.timing.total('embedding'), 2),
            },
            'throughput': {
                'overall_inf_per_min': round((self.total_inferences / elapsed) * 60, 2) if elapsed > 0 else 0
            }
        }

        # Save summary
        summary_path = self.log_dir / 'run_summary.json'
        with open(summary_path, 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2)

        if not self.silent:
            print(f"\nExperiment Complete")
            print(f"  Total runtime: {elapsed/3600:.2f} hours")
            print(f"  Total inferences: {self.total_inferences}")
            print(f"  Throughput: {summary['throughput']['overall_inf_per_min']:.1f} inf/min")
            print(f"  Results saved to: {self.log_dir}")

        return summary


print("ExperimentTracker loaded successfully")

ExperimentTracker loaded successfully


## 1. Load Validation Data & Optimized Thresholds

In [3]:
# --- Validation subset -------------------------------------------------------
# The subset is stored as an array of indices into the RefCOCO 'val' split.
print("Loading validation subset...")
validation_indices = np.load('validation_subset_indices.npy')
print(f"Validation subset size: {len(validation_indices)} samples")

dataset = load_refcoco('val')
print(f"Full dataset size: {len(dataset)} samples")

# Materialize the selected samples (indices are cast to plain int for lookup).
validation_samples = [dataset[int(i)] for i in validation_indices]
print(f"Loaded {len(validation_samples)} validation samples")

# --- Thresholds ----------------------------------------------------------------
# Fixed baseline, kept for comparison and used as the fallback when the
# notebook-02 results file is absent.
baseline_thresholds = np.array([0.3, 0.5, 0.7])

try:
    results_02 = np.load('data/threshold_optimization_results.npz')
except FileNotFoundError:
    print("\nWARNING: threshold_optimization_results.npz not found!")
    print("Using baseline thresholds [0.3, 0.5, 0.7]")
    print("Run notebook 02 first to get optimized thresholds")

    optimal_thresholds_geo = baseline_thresholds.copy()
    optimal_thresholds_sem = baseline_thresholds.copy()
else:
    optimal_thresholds_geo = results_02['optimal_thresholds_geo']
    optimal_thresholds_sem = results_02['optimal_thresholds_sem']

    print(f"\nLoaded optimized thresholds from notebook 02:")
    print(f"  Geometric (β_geo): {optimal_thresholds_geo}")
    print(f"  Semantic (β_sem):  {optimal_thresholds_sem}")
    print(f"  Baseline:          {baseline_thresholds}")

Loading validation subset...
Validation subset size: 99 samples
Full dataset size: 8811 samples
Loaded 99 validation samples

Loaded optimized thresholds from notebook 02:
  Geometric (β_geo): [0.37218979 0.4670018  0.52083137]
  Semantic (β_sem):  [0.11227041 0.17857036 0.78106945]
  Baseline:          [0.3 0.5 0.7]


## 2. VLM Predictor Setup

In [4]:
class VLMPredictor:
    """
    Wrapper for Qwen3-VL bbox prediction and embedding extraction via Ollama.

    See QWEN3VL_GROUNDING.md for format details.

    Failures never raise out of the public methods: a failed prediction
    yields an all-zero bbox (IoU 0) and a failed embedding yields a zero
    vector. ``parse_failures`` / ``total_calls`` track how often that happens.
    """

    _NUM = r'[-+]?[0-9]*\.?[0-9]+'
    _NUMBER_RE = re.compile(_NUM)
    # "bbox_2d": [...] anywhere in the reply, tolerant of whitespace and of
    # additional keys in the surrounding JSON object.
    _BBOX_JSON_RE = re.compile(r'"bbox_2d"\s*:\s*\[([^\]]+)\]')
    # First bracketed list of exactly four numbers, e.g. "[10, 20, 30, 40]".
    _BBOX_LIST_RE = re.compile(r'\[\s*(' + r'\s*,\s*'.join([_NUM] * 4) + r')\s*\]')
    # Embedding size used for the zero fallback before any call has succeeded.
    _FALLBACK_EMBED_DIM = 768

    def __init__(self, model_name="qwen3-vl:8b", embed_model="qwen3-embedding:latest"):
        """
        Args:
            model_name: Ollama model used for bbox prediction
            embed_model: Ollama model used for text embeddings
        """
        self.model_name = model_name
        self.embed_model = embed_model
        self.parse_failures = 0
        self.total_calls = 0
        # Dimensionality of the last successful embedding (None until known).
        self._embed_dim = None
        print(f"Initialized VLMPredictor:")
        print(f"  VLM model: {model_name}")
        print(f"  Embedding model: {embed_model}")

    def _image_to_base64(self, image: "Image.Image") -> str:
        """Convert PIL Image to a base64-encoded PNG string."""
        # Ensure RGB mode (Qwen3-VL requires RGB)
        if image.mode != 'RGB':
            image = image.convert('RGB')

        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode()

    def predict_bbox(self, image: "Image.Image", expression: str) -> np.ndarray:
        """
        Predict bbox for referring expression.

        Qwen3-VL uses [0, 1000] coordinate range.
        Returns: bbox [x1, y1, x2, y2] normalized [0,1]; all zeros if the
        call fails or the reply cannot be parsed.
        """
        self.total_calls += 1

        # Qwen3-VL JSON format prompt
        prompt = f'Where is "{expression}" in this image? Output the bounding box in format: {{"bbox_2d": [x_min, y_min, x_max, y_max]}} using coordinates 0-1000.'

        img_b64 = self._image_to_base64(image)

        # Deliberately broad: any backend error (timeouts included) is turned
        # into a counted failure so a long run is not aborted by one bad call.
        try:
            response = ollama.chat(
                model=self.model_name,
                messages=[{
                    'role': 'user',
                    'content': prompt,
                    'images': [img_b64]
                }]
            )

            content = response['message']['content']
            return self.parse_bbox(content)

        except Exception as e:
            print(f"Error predicting bbox: {e}")
            self.parse_failures += 1
            return np.array([0.0, 0.0, 0.0, 0.0])

    def _extract_numbers(self, text: str) -> list:
        """Return the numeric strings most likely to be the bbox coordinates."""
        json_match = self._BBOX_JSON_RE.search(text)
        if json_match:
            return self._NUMBER_RE.findall(json_match.group(1))

        list_match = self._BBOX_LIST_RE.search(text)
        if list_match:
            return self._NUMBER_RE.findall(list_match.group(1))

        # Last resort: any numbers in the text. The key name itself contains a
        # digit ("bbox_2d"), which must not be mistaken for a coordinate.
        return self._NUMBER_RE.findall(text.replace('bbox_2d', 'bbox'))

    def parse_bbox(self, text: str) -> np.ndarray:
        """
        Parse bbox from model output and convert to [0,1].

        Qwen3-VL outputs coordinates in [0, 1000] range.

        Tries, in order: a ``"bbox_2d": [...]`` entry, the first bracketed
        list of four numbers, then the first four numbers anywhere in the
        text. Returns all zeros (and counts a parse failure) when fewer than
        four numbers are found or the box is degenerate.
        """
        numbers = self._extract_numbers(text)

        if len(numbers) >= 4:
            # Parse from [0, 1000] range
            bbox_1000 = np.array([float(n) for n in numbers[:4]])

            # Convert to [0, 1]
            bbox = bbox_1000 / 1000.0
            bbox = np.clip(bbox, 0, 1)

            # Validate bbox format (x1 < x2, y1 < y2)
            if bbox[0] >= bbox[2] or bbox[1] >= bbox[3]:
                self.parse_failures += 1
                if self.parse_failures <= 3:  # Only print first 3
                    print(f"Warning: Invalid bbox {bbox} from: '{text[:100]}'")
                return np.array([0.0, 0.0, 0.0, 0.0])  # Zero-area box = IoU 0

            return bbox
        else:
            self.parse_failures += 1
            if self.parse_failures <= 3:  # Only print first 3
                print(f"Warning: Could not parse bbox. Response length: {len(text)} chars")
                print(f"  First 200 chars: '{text[:200]}'")
            return np.array([0.0, 0.0, 0.0, 0.0])  # Zero-area box = IoU 0

    def get_embedding(self, text: str) -> np.ndarray:
        """
        Get text embedding for semantic drift analysis.

        On failure returns a zero vector whose length matches the last
        successful embedding (or ``_FALLBACK_EMBED_DIM`` if none succeeded
        yet), so it can still be subtracted from earlier embeddings.
        """
        try:
            response = ollama.embeddings(model=self.embed_model, prompt=text)
            embedding = np.array(response['embedding'])
        except Exception as e:
            print(f"Warning: Embedding failed: {e}")
            dim = getattr(self, '_embed_dim', None) or self._FALLBACK_EMBED_DIM
            return np.zeros(dim)  # Fallback

        if embedding.ndim == 1 and embedding.size > 0:
            self._embed_dim = int(embedding.size)
        return embedding

    def print_stats(self):
        """Print parsing statistics."""
        success_rate = 100 * (1 - self.parse_failures / max(1, self.total_calls))
        print(f"\nVLM Statistics:")
        print(f"  Total calls: {self.total_calls}")
        print(f"  Parse failures: {self.parse_failures}")
        print(f"  Success rate: {success_rate:.1f}%")


# Initialize VLM predictor
# `vlm` is the notebook-wide predictor instance; its failure/call counters
# accumulate across every later cell that uses it.
print("Initializing VLM predictor...")
vlm = VLMPredictor(model_name="qwen3-vl:8b", embed_model="qwen3-embedding:latest")
print("Ready for inference")

Initializing VLM predictor...
Initialized VLMPredictor:
  VLM model: qwen3-vl:8b
  Embedding model: qwen3-embedding:latest
Ready for inference


## 3. Perturbation Utilities

In [5]:
def apply_perturbation(image: Image.Image, brightness=0, contrast=1.0, blur=0, noise=0) -> Image.Image:
    """
    Return a perturbed copy of ``image``; the input is left untouched.

    Steps run in a fixed order (brightness, contrast, blur, noise) and each
    is skipped when its parameter is at the neutral value.

    Args:
        brightness: Adjustment factor [-1, 1] (0 = no change)
        contrast: Multiplier (1.0 = no change)
        blur: Gaussian blur radius in pixels
        noise: Gaussian noise std dev (normalized to [0, 1])
    """
    result = image.copy()

    # Brightness is applied as an enhancement factor of 1 + brightness.
    if brightness != 0:
        result = ImageEnhance.Brightness(result).enhance(1.0 + brightness)

    if contrast != 1.0:
        result = ImageEnhance.Contrast(result).enhance(contrast)

    if blur > 0:
        result = result.filter(ImageFilter.GaussianBlur(radius=blur))

    # Additive Gaussian noise in pixel units, drawn from NumPy's global RNG
    # and clipped back into the valid 8-bit range.
    if noise > 0:
        pixels = np.array(result).astype(np.float32)
        jitter = np.random.normal(0, noise * 255, pixels.shape)
        noisy = np.clip(pixels + jitter, 0, 255).astype(np.uint8)
        result = Image.fromarray(noisy)

    return result


def compute_iou(bbox1: np.ndarray, bbox2: np.ndarray) -> float:
    """Return intersection-over-union of two [x1, y1, x2, y2] boxes.

    Yields 0 when the union area is not positive (e.g. two zero-area boxes).
    """
    # Corners of the overlap rectangle.
    left = max(bbox1[0], bbox2[0])
    top = max(bbox1[1], bbox2[1])
    right = min(bbox1[2], bbox2[2])
    bottom = min(bbox1[3], bbox2[3])

    # Negative extents mean the boxes do not overlap along that axis.
    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    intersection = overlap_w * overlap_h

    area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
    union = area1 + area2 - intersection

    if union > 0:
        return intersection / union
    return 0


def iou_to_class(iou: np.ndarray, thresholds: np.ndarray) -> np.ndarray:
    """
    Discretize IoU into classes based on thresholds.

    The class of a value is the number of thresholds it meets or exceeds,
    so thresholds need not be pre-sorted.

    Example: thresholds=[0.3, 0.5, 0.7]
    - IoU < 0.3: class 0 (incorrect)
    - 0.3 <= IoU < 0.5: class 1 (poor)
    - 0.5 <= IoU < 0.7: class 2 (acceptable)
    - IoU >= 0.7: class 3 (good)

    Args:
        iou: IoU values; an ndarray of any shape, a (nested) list, or a scalar
        thresholds: Class boundaries; any array-like of numbers

    Returns:
        Integer array of class indices with the same shape as ``iou``
        (0-d for a scalar input). NaN values fall into class 0.
    """
    values = np.asarray(iou, dtype=float)
    cuts = np.sort(np.asarray(thresholds, dtype=float).ravel())
    # Compare every value against every threshold at once and count the hits.
    hits = values[..., np.newaxis] >= cuts
    return np.asarray(hits.sum(axis=-1), dtype=int)


def is_boundary_sample(iou_perturbed: np.ndarray, thresholds: np.ndarray) -> bool:
    """
    Tell whether a sample's perturbed IoUs span more than one class.

    A sample counts as a boundary sample when discretizing its perturbed
    IoUs with ``thresholds`` produces at least two distinct classes.
    """
    distinct_classes = np.unique(iou_to_class(iou_perturbed, thresholds))
    return distinct_classes.size > 1


# Validation perturbation grid: every brightness level is paired with every
# contrast level (10 × 10 = 100 configurations); blur and noise stay disabled.
brightness_values = np.linspace(-0.3, 0.3, 10)
contrast_values = np.linspace(0.7, 1.3, 10)

# Brightness is the outer (slow) axis, contrast the inner (fast) axis.
perturbation_grid = [
    dict(brightness=level_b, contrast=level_c, blur=0, noise=0)
    for level_b in brightness_values
    for level_c in contrast_values
]

print(f"Created perturbation grid: {len(perturbation_grid)} configurations")
print(f"  Brightness range: [{brightness_values.min():.2f}, {brightness_values.max():.2f}]")
print(f"  Contrast range: [{contrast_values.min():.2f}, {contrast_values.max():.2f}]")

Created perturbation grid: 100 configurations
  Brightness range: [-0.30, 0.30]
  Contrast range: [0.70, 1.30]


## 4. Main Validation Experiment

Run full dual-space boundary detection on 99 validation samples.

**Warning:** This cell will take 8-10 hours to run (9,999 VLM inferences: 99 samples × (1 original + 100 perturbed images)).
Consider running overnight or in batches.

**Tracking Features:**
- **Single smart progress bar** with live perturbation tracking
- Shows processing stage (original → perturbations 1-100 → complete)
- Structured JSON logs per sample (`runs/validation_03/<run_id>/metrics.jsonl`)
- Detailed timing breakdown: VLM inference, embeddings, preprocessing
- Rolling throughput (inferences/minute) with 1min and 5min windows
- Smart ETA calculation based on actual sample times

**Resume support:** If interrupted:
- **Progress saved EVERY sample** (checkpoint after each sample completes)
- Timing data preserved across restarts
- Re-running auto-detects checkpoint and resumes from last completed sample
- At most the in-progress sample is lost - a checkpoint is written after every completed sample

**Output structure:**
```
runs/validation_03/<run_id>/
├── config.json      # Experiment configuration
├── metrics.jsonl    # Per-sample timing + metrics (append-only)
├── checkpoint.npz   # Resume data with timing (updated every sample)
└── run_summary.json # Final summary (on completion)
```

**Progress bar legend:**

*During original image processing:*
- `stage`: "original" 
- `diff`: Sample difficulty (eas/med/har)
- `expr`: Referring expression (truncated)

*During perturbations (updates every 10 perturbations):*
- `IoU`: Original image IoU score
- `pert`: Current perturbation (e.g., 50/100)
- `diff`: Sample difficulty
- `tput`: Throughput (inferences/minute)

*After sample completion:*
- `IoU`: Original image IoU
- `Δ`: Mean IoU across all perturbations  
- `σ`: Standard deviation of perturbed IoUs
- `diff`: Sample difficulty
- `t`: Total time for this sample (seconds)
- `tput`: Current throughput
- `ETA`: Estimated time remaining (hours)

In [None]:
# Main validation run: for every validation sample, predict a bbox on the
# original image and on each perturbed image, record IoU against the ground
# truth, checkpoint after each sample, and write the final results file.
import os
import time

# Configuration for tracker
# (stored by ExperimentTracker in the run's config.json; values are converted
# to plain lists/floats so they are JSON-serializable)
tracker_config = {
    'model': vlm.model_name,
    'embed_model': vlm.embed_model,
    'n_samples': len(validation_samples),
    'n_perturbations': len(perturbation_grid),
    'thresholds_geo': optimal_thresholds_geo.tolist(),
    'thresholds_sem': optimal_thresholds_sem.tolist(),
    'perturbation_params': {
        'brightness_range': [float(brightness_values.min()), float(brightness_values.max())],
        'contrast_range': [float(contrast_values.min()), float(contrast_values.max())]
    }
}

# Check for existing checkpoint to resume from
# (this is the legacy copy that tracker.checkpoint() rewrites after every sample)
checkpoint_file = 'data/validation_results_checkpoint.npz'
start_sample_idx = 0
resume_run_id = None

if os.path.exists(checkpoint_file):
    checkpoint = np.load(checkpoint_file, allow_pickle=True)
    
    # Restore previous results
    # (converted back to lists so the loop below can keep appending)
    results = {
        'iou_original': list(checkpoint['iou_original']),
        'iou_perturbed': list(checkpoint['iou_perturbed']),
        'embedding_distance': list(checkpoint['embedding_distance']),
        'bbox_predictions': list(checkpoint['bbox_predictions']),
        'ground_truth_bboxes': list(checkpoint['ground_truth_bboxes']),
        'expressions': list(checkpoint['expressions']),
        'difficulties': list(checkpoint['difficulties']),
        'sample_indices': list(checkpoint['sample_indices'])
    }
    
    # Check for run_id in checkpoint
    # Present only if the checkpoint writer stored a '_run_id' key; otherwise
    # resume_run_id stays None and the tracker below starts a new run id
    # (and therefore a new log directory).
    if '_run_id' in checkpoint:
        resume_run_id = str(checkpoint['_run_id'])
    
    # One 'iou_original' entry exists per completed sample, so its length is
    # the index of the next sample to process.
    start_sample_idx = len(results['iou_original'])
    print(f"✓ Resuming from sample {start_sample_idx}/{len(validation_samples)}")
else:
    results = {
        'iou_original': [],
        'iou_perturbed': [],
        'embedding_distance': [],
        'bbox_predictions': [],
        'ground_truth_bboxes': [],
        'expressions': [],
        'difficulties': [],
        'sample_indices': []
    }
    print(f"Starting fresh experiment with {len(validation_samples)} samples")

# Check if already complete
if start_sample_idx >= len(validation_samples):
    print("Experiment already complete! Loading final results...")
    if os.path.exists('data/validation_results.npz'):
        results = dict(np.load('data/validation_results.npz', allow_pickle=True))
        print(f"Loaded {len(results['iou_original'])} samples from validation_results.npz")
    else:
        # Checkpoint is complete but the final file was never written:
        # promote the restored checkpoint contents to the final results file.
        for k in ['iou_original', 'iou_perturbed', 'embedding_distance', 'ground_truth_bboxes']:
            results[k] = np.array(results[k])
        np.savez('data/validation_results.npz', **results)
else:
    # Initialize silent experiment tracker
    tracker = ExperimentTracker(
        experiment_name='validation_03',
        config=tracker_config,
        resume_from=resume_run_id,
        silent=True
    )
    
    # Restore timing data if resuming
    # (read from the full checkpoint inside the resumed run's log directory)
    if resume_run_id:
        checkpoint_path = tracker.log_dir / 'checkpoint.npz'
        tracker.restore_timing(checkpoint_path)
    
    print(f"Run ID: {tracker.run_id}")
    print(f"Logs: {tracker.log_dir}\n")
    
    # Single comprehensive progress bar
    # `initial` makes a resumed run start the bar at the already-completed count.
    samples_pbar = tqdm(
        range(start_sample_idx, len(validation_samples)),
        desc="VLM Validation",
        initial=start_sample_idx,
        total=len(validation_samples),
        position=0,
        leave=True,
        bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}] {postfix}',
        ncols=140
    )
    
    for sample_idx in samples_pbar:
        sample = validation_samples[sample_idx]
        sample_start_time = time.time()
        
        # Get sample info
        # Only the first referring expression of the sample is evaluated.
        info = get_sample_info(sample)
        image = info['image']
        expression = info['expressions'][0]
        bbox_gt = info['bbox_normalized']
        difficulty = compute_bbox_difficulty(info['bbox_pixels'], info['image_size'])
        
        # Start sample tracking
        tracker.start_sample(sample_idx, len(validation_samples), expression)
        
        # Update progress: processing original image
        samples_pbar.set_postfix({
            'stage': 'original',
            'diff': difficulty[:3],
            'expr': expression[:20] + '...' if len(expression) > 20 else expression
        }, refresh=True)
        
        # Original prediction
        with tracker.time_inference('vlm_predict'):
            bbox_pred_orig = vlm.predict_bbox(image, expression)
        
        with tracker.time_inference('embedding'):
            embed_orig = vlm.get_embedding(expression)
        
        iou_orig = compute_iou(bbox_pred_orig, bbox_gt)
        
        # Store original results
        # These per-sample lists are appended before the perturbation loop;
        # the matching perturbed entries are appended after it, and the
        # checkpoint is only written once both are in place.
        results['iou_original'].append(iou_orig)
        results['ground_truth_bboxes'].append(bbox_gt)
        results['expressions'].append(expression)
        results['difficulties'].append(difficulty)
        results['sample_indices'].append(validation_indices[sample_idx])
        
        # Perturbed predictions with progress updates
        ious_pert = []
        embed_dists = []
        bboxes_pert = []
        
        for pert_idx, pert_config in enumerate(perturbation_grid):
            # Update progress bar every 10 perturbations
            if pert_idx % 10 == 0 or pert_idx == len(perturbation_grid) - 1:
                current_throughput = tracker.get_throughput(window_sec=60)
                samples_pbar.set_postfix({
                    'IoU': f'{iou_orig:.2f}',
                    'pert': f'{pert_idx+1}/{len(perturbation_grid)}',
                    'diff': difficulty[:3],
                    'tput': f'{current_throughput:.0f}/m'
                }, refresh=True)
            
            # Apply perturbation
            with tracker.time_inference('preprocess'):
                img_pert = apply_perturbation(image, **pert_config)
            
            # Get perturbed prediction
            # A failed call returns an all-zero bbox from predict_bbox, which
            # is recorded below as IoU 0 like any other prediction.
            with tracker.time_inference('vlm_predict'):
                bbox_pred_pert = vlm.predict_bbox(img_pert, expression)
            
            # NOTE(review): the embedding input is the unchanged `expression`,
            # i.e. the same text that produced embed_orig, not anything derived
            # from img_pert - confirm this is the intended semantic-drift signal.
            with tracker.time_inference('embedding'):
                embed_pert = vlm.get_embedding(expression)
            
            # Compute metrics
            # IoU is measured against the ground truth (not against the
            # original prediction); embedding distance is Euclidean.
            iou_pert = compute_iou(bbox_pred_pert, bbox_gt)
            embed_dist = np.linalg.norm(embed_orig - embed_pert)
            
            ious_pert.append(iou_pert)
            embed_dists.append(embed_dist)
            bboxes_pert.append(bbox_pred_pert)
        
        results['iou_perturbed'].append(ious_pert)
        results['embedding_distance'].append(embed_dists)
        results['bbox_predictions'].append(bboxes_pert)
        
        # Log to tracker
        # Values are cast to built-in types because the tracker serializes
        # this dict with json. The two vlm_* counters are running totals for
        # the `vlm` instance, not per-sample counts.
        sample_metrics = {
            'iou_original': float(iou_orig),
            'iou_perturbed_mean': float(np.mean(ious_pert)),
            'iou_perturbed_std': float(np.std(ious_pert)),
            'iou_perturbed_min': float(np.min(ious_pert)),
            'iou_perturbed_max': float(np.max(ious_pert)),
            'embed_dist_mean': float(np.mean(embed_dists)),
            'difficulty': difficulty,
            'expression_length': len(expression),
            'vlm_parse_failures': vlm.parse_failures,
            'vlm_total_calls': vlm.total_calls
        }
        tracker.end_sample(sample_metrics)
        
        # Compute comprehensive stats
        sample_time = time.time() - sample_start_time
        throughput = tracker.get_throughput(window_sec=60)
        eta_sec = tracker._calculate_eta()
        eta_hrs = eta_sec / 3600
        
        # Final update for this sample with complete stats
        # ('Δ' shows the mean perturbed IoU, 'σ' its standard deviation)
        samples_pbar.set_postfix({
            'IoU': f'{iou_orig:.2f}',
            'Δ': f'{np.mean(ious_pert):.2f}',
            'σ': f'{np.std(ious_pert):.2f}',
            'diff': difficulty[:3],
            't': f'{sample_time:.0f}s',
            'tput': f'{throughput:.0f}/m',
            'ETA': f'{eta_hrs:.1f}h'
        }, refresh=True)
        
        # Checkpoint EVERY sample for safety
        tracker.checkpoint(results)
        
        # Log milestone every 10 samples
        if (sample_idx + 1) % 10 == 0:
            samples_pbar.write(f"✓ Milestone: {sample_idx + 1}/{len(validation_samples)} | "
                             f"Avg IoU: {np.mean(results['iou_original']):.3f} | "
                             f"ETA: {eta_hrs:.1f}h")
    
    samples_pbar.close()

    # Convert to numpy arrays
    # (only these four keys; the remaining entries stay as Python lists)
    for k in ['iou_original', 'iou_perturbed', 'embedding_distance', 'ground_truth_bboxes']:
        results[k] = np.array(results[k])

    # Save final results
    print("\nSaving final results...")
    np.savez('data/validation_results.npz', **results)
    print("Results saved to validation_results.npz")

    # Clean up legacy checkpoint
    # (removed only after the final results file has been written, so a
    # re-run of this cell starts a fresh experiment rather than resuming)
    if os.path.exists(checkpoint_file):
        os.remove(checkpoint_file)
        print(f"Removed legacy checkpoint: {checkpoint_file}")

    # Finalize tracking
    run_summary = tracker.finalize()
    
    print(f"\nExperiment Complete!")
    print(f"  Runtime: {run_summary['total_runtime_sec']/3600:.2f} hours")
    print(f"  Throughput: {run_summary['throughput']['overall_inf_per_min']:.1f} inf/min")
    print(f"  Logs: {tracker.log_dir}")
    
    # Print VLM statistics
    vlm.print_stats()

# Print final statistics
# (runs for both branches above: freshly computed or loaded results)
print(f"\nFinal statistics:")
print(f"  Total samples: {len(results['iou_original'])}")
print(f"  Mean IoU (original): {np.mean(results['iou_original']):.3f}")
print(f"  Mean IoU (perturbed): {np.mean(results['iou_perturbed']):.3f}")

✓ Resuming from sample 18/99
Run ID: 20260113_190544
Logs: runs/validation_03/20260113_190544



VLM Validation:  18%|██████████▌                                               | 18/99 [01:50<?] , IoU=0.00, pert=1/100, diff=har, tput=18/m

Error predicting bbox: Ollama API call timed out
Error predicting bbox: Ollama API call timed out
Error predicting bbox: Ollama API call timed out


VLM Validation:  18%|██████████▎                                              | 18/99 [35:12<?] , IoU=0.00, pert=11/100, diff=har, tput=31/m

Error predicting bbox: Ollama API call timed out


VLM Validation:  18%|██████████▌                                               | 18/99 [53:45<?] , IoU=0.00, pert=21/100, diff=har, tput=5/m

Error predicting bbox: Ollama API call timed out


VLM Validation:  18%|██████████                                             | 18/99 [1:11:16<?] , IoU=0.00, pert=31/100, diff=har, tput=32/m

Error predicting bbox: Ollama API call timed out


VLM Validation:  18%|██████████                                             | 18/99 [1:31:45<?] , IoU=0.00, pert=41/100, diff=har, tput=20/m

Error predicting bbox: Ollama API call timed out


VLM Validation:  18%|██████████▏                                             | 18/99 [1:53:43<?] , IoU=0.00, pert=51/100, diff=har, tput=4/m

Error predicting bbox: Ollama API call timed out
Error predicting bbox: Ollama API call timed out


VLM Validation:  18%|██████████                                             | 18/99 [2:15:53<?] , IoU=0.00, pert=61/100, diff=har, tput=37/m

Error predicting bbox: Ollama API call timed out


VLM Validation:  18%|██████████                                             | 18/99 [2:40:56<?] , IoU=0.00, pert=71/100, diff=har, tput=34/m

Error predicting bbox: Ollama API call timed out


VLM Validation:  18%|██████████                                             | 18/99 [3:19:57<?] , IoU=0.00, pert=91/100, diff=har, tput=33/m

Error predicting bbox: Ollama API call timed out


VLM Validation:  19%|█████████▏                                      | 19/99 [4:11:34<288:50:27] , IoU=0.00, pert=31/100, diff=med, tput=3/m

In [None]:
# Timing Analysis from Tracker Logs
# This cell visualizes timing data from the experiment tracker

from pathlib import Path
import json

def load_tracker_metrics(experiment_name='validation_03'):
    """Load the metrics records written by the latest tracker run.

    Looks in ``runs/<experiment_name>/``, picks the run directory whose name
    sorts last, and parses its ``metrics.jsonl`` file (one JSON object per
    line).

    Args:
        experiment_name: Name of the sub-directory of ``runs/`` to search.

    Returns:
        A ``(metrics, latest_run)`` tuple, where ``metrics`` is the list of
        parsed JSON records and ``latest_run`` is the ``Path`` of the run
        directory they came from. Returns ``None`` (after printing the
        reason) when the experiment directory, a run directory, or the
        metrics file is missing.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    runs_dir = Path(f'runs/{experiment_name}')
    # is_dir() also rejects a plain file at this path, which would otherwise
    # make iterdir() raise NotADirectoryError.
    if not runs_dir.is_dir():
        print(f"No runs found in {runs_dir}")
        return None

    # Only directories can be runs; a stray file (notes, .DS_Store, ...) must
    # not be selected as the "latest run".
    # NOTE(review): "latest" relies on run names sorting chronologically
    # (e.g. timestamp-prefixed names) - confirm against the tracker.
    run_dirs = sorted((p for p in runs_dir.iterdir() if p.is_dir()), reverse=True)
    if not run_dirs:
        print("No run directories found")
        return None

    latest_run = run_dirs[0]
    metrics_file = latest_run / 'metrics.jsonl'

    if not metrics_file.exists():
        print(f"No metrics file found: {metrics_file}")
        return None

    print(f"Loading metrics from: {latest_run.name}")

    # Skip blank lines (e.g. a trailing newline) instead of crashing in
    # json.loads on an empty string.
    with open(metrics_file, 'r') as f:
        metrics = [json.loads(line) for line in f if line.strip()]

    return metrics, latest_run

# Load timing data and plot a 2x2 timing dashboard (sample-time histogram,
# per-sample stacked breakdown, throughput curves, total-time pie chart).
timing_data = load_tracker_metrics()

# load_tracker_metrics returns either None or a (metrics, run_dir) tuple.
# A tuple is truthy even when its metrics list is empty, so unpack first and
# test the list itself; otherwise an empty run crashes further down.
metrics, run_dir = timing_data if timing_data else ([], None)

if metrics:
    # Extract timing arrays (one entry per logged sample)
    sample_times = [m['timing']['sample_total_sec'] for m in metrics]
    vlm_times = [m['timing']['breakdown'].get('vlm_predict', 0) for m in metrics]
    embed_times = [m['timing']['breakdown'].get('embedding', 0) for m in metrics]
    preprocess_times = [m['timing']['breakdown'].get('preprocess', 0) for m in metrics]
    throughputs_1min = [m['throughput']['rolling_1min'] for m in metrics]
    throughputs_session = [m['throughput']['session_avg'] for m in metrics]

    total_time = sum(sample_times)

    def _pct(part):
        """Return `part` as a percentage of total runtime (0.0 if total is 0)."""
        return 100 * part / total_time if total_time else 0.0

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # 1. Sample time distribution
    axes[0, 0].hist(sample_times, bins=20, edgecolor='black', alpha=0.7)
    axes[0, 0].axvline(np.mean(sample_times), color='red', linestyle='--',
                       label=f'Mean: {np.mean(sample_times):.1f}s')
    axes[0, 0].axvline(np.median(sample_times), color='orange', linestyle='--',
                       label=f'Median: {np.median(sample_times):.1f}s')
    axes[0, 0].set_xlabel('Sample Time (seconds)')
    axes[0, 0].set_ylabel('Count')
    axes[0, 0].set_title('Sample Time Distribution')
    axes[0, 0].legend()

    # 2. Timing breakdown (stacked): each bar stacks VLM, embedding, preprocess
    sample_indices = range(len(metrics))
    axes[0, 1].bar(sample_indices, vlm_times, label='VLM Predict', alpha=0.8)
    axes[0, 1].bar(sample_indices, embed_times, bottom=vlm_times, label='Embedding', alpha=0.8)
    bottom_for_preprocess = [v + e for v, e in zip(vlm_times, embed_times)]
    axes[0, 1].bar(sample_indices, preprocess_times, bottom=bottom_for_preprocess,
                   label='Preprocess', alpha=0.8)
    axes[0, 1].set_xlabel('Sample Index')
    axes[0, 1].set_ylabel('Time (seconds)')
    axes[0, 1].set_title('Timing Breakdown by Sample')
    axes[0, 1].legend()

    # 3. Throughput over time
    axes[1, 0].plot(sample_indices, throughputs_1min, label='1-min rolling', linewidth=2)
    axes[1, 0].plot(sample_indices, throughputs_session, label='Session avg',
                    linewidth=2, linestyle='--')
    axes[1, 0].set_xlabel('Sample Index')
    axes[1, 0].set_ylabel('Throughput (inferences/min)')
    axes[1, 0].set_title('Throughput Over Time')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)

    # 4. Timing breakdown pie chart (totals)
    total_vlm = sum(vlm_times)
    total_embed = sum(embed_times)
    total_preprocess = sum(preprocess_times)
    # Whatever part of the sample time is not attributed to a tracked category
    total_other = total_time - total_vlm - total_embed - total_preprocess

    # Clamp "other" at 0 so the pie never receives a negative wedge
    sizes = [total_vlm, total_embed, total_preprocess, max(0, total_other)]
    labels = [
        f'VLM Predict\n{total_vlm/3600:.2f}h ({_pct(total_vlm):.1f}%)',
        f'Embedding\n{total_embed/3600:.2f}h ({_pct(total_embed):.1f}%)',
        f'Preprocess\n{total_preprocess/60:.1f}m ({_pct(total_preprocess):.1f}%)',
        f'Other\n{total_other/60:.1f}m'
    ]
    colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']

    # Drop zero-sized wedges together with their labels and colours
    axes[1, 1].pie([s for s in sizes if s > 0],
                   labels=[l for l, s in zip(labels, sizes) if s > 0],
                   colors=[c for c, s in zip(colors, sizes) if s > 0],
                   autopct='', startangle=90)
    axes[1, 1].set_title('Total Time Breakdown')

    plt.suptitle(f'Experiment Timing Analysis (Run: {run_dir.name})', fontsize=14, fontweight='bold')
    plt.tight_layout()
    # savefig does not create missing parent directories
    Path('figures').mkdir(parents=True, exist_ok=True)
    plt.savefig('figures/timing_analysis.png', dpi=150, bbox_inches='tight')
    plt.show()

    # Print summary statistics
    print(f"\nTiming Summary:")
    print(f"  Total samples: {len(metrics)}")
    print(f"  Total runtime: {total_time/3600:.2f} hours")
    print(f"  Mean sample time: {np.mean(sample_times):.1f}s")
    print(f"  Std sample time: {np.std(sample_times):.1f}s")
    print(f"  Final throughput: {throughputs_session[-1]:.1f} inf/min")
    print(f"\nTime allocation:")
    print(f"  VLM predictions: {total_vlm/3600:.2f}h ({_pct(total_vlm):.1f}%)")
    print(f"  Embeddings: {total_embed/3600:.2f}h ({_pct(total_embed):.1f}%)")
    print(f"  Preprocessing: {total_preprocess/60:.1f}m ({_pct(total_preprocess):.1f}%)")

    print(f"\nVisualization saved to timing_analysis.png")
else:
    print("No tracker data available. Run the experiment first.")

## 5. Boundary Detection Analysis

Apply learned thresholds to detect boundary samples.

In [None]:
# Detect boundary samples using optimal thresholds.
# Produces three index lists (geometric, semantic, baseline), prints a
# per-difficulty breakdown, computes the Spearman correlation between IoU drop
# and embedding distance, and saves everything to data/boundary_analysis.npz.
print("Detecting boundary samples...")
print()

# Using geometric thresholds (IoU-based)
boundary_samples_geo = [
    i for i, iou_pert in enumerate(results['iou_perturbed'])
    if is_boundary_sample(iou_pert, optimal_thresholds_geo)
]

# Using semantic thresholds (embedding-based)
# For semantic, we use embedding distance instead of IoU:
# each sample's distances are rescaled by that sample's own maximum and
# flipped into a similarity (1 = identical, 0 = farthest perturbation) so
# they can be discretized with iou_to_class.
boundary_samples_sem = []
for i, embed_dist in enumerate(results['embedding_distance']):
    max_dist = embed_dist.max()
    if max_dist > 0:
        similarity = 1.0 - embed_dist / max_dist
    else:
        # All distances are zero: dividing would give 0/0 = NaN. Treat every
        # perturbation as identical to the original (similarity 1).
        similarity = np.ones_like(embed_dist, dtype=float)
    classes = iou_to_class(similarity, optimal_thresholds_sem)
    # More than one distinct class across perturbations => boundary sample
    if len(np.unique(classes)) > 1:
        boundary_samples_sem.append(i)

# Using baseline thresholds
boundary_samples_baseline = [
    i for i, iou_pert in enumerate(results['iou_perturbed'])
    if is_boundary_sample(iou_pert, baseline_thresholds)
]

print(f"Boundary detection results:")
print(f"  Total samples: {len(validation_samples)}")
print(f"  Geometric boundaries (optimal): {len(boundary_samples_geo)} ({100*len(boundary_samples_geo)/len(validation_samples):.1f}%)")
print(f"  Semantic boundaries (optimal):  {len(boundary_samples_sem)} ({100*len(boundary_samples_sem)/len(validation_samples):.1f}%)")
print(f"  Baseline boundaries:            {len(boundary_samples_baseline)} ({100*len(boundary_samples_baseline)/len(validation_samples):.1f}%)")
print()

# Analyze boundary samples by difficulty
print("Boundary distribution by difficulty:")
difficulties_arr = np.array(results['difficulties'])
for diff in ['easy', 'medium', 'hard']:
    diff_mask = difficulties_arr == diff
    n_total = diff_mask.sum()
    n_boundary_geo = sum(1 for i in boundary_samples_geo if difficulties_arr[i] == diff)

    # Skip difficulty levels with no samples (avoids dividing by zero)
    if n_total > 0:
        print(f"  {diff.capitalize():6s}: {n_boundary_geo}/{n_total} ({100*n_boundary_geo/n_total:.1f}%) are boundaries")

# Compute geometric-semantic correlation
print("\nGeometric vs Semantic drift correlation:")
# Broadcast each sample's original IoU against its row of perturbed IoUs
iou_drops = results['iou_original'].reshape(-1, 1) - results['iou_perturbed']
embed_dists = results['embedding_distance']

# Flatten for correlation
iou_drops_flat = iou_drops.flatten()
embed_dists_flat = embed_dists.flatten()

correlation, p_value = spearmanr(iou_drops_flat, embed_dists_flat)
print(f"  Spearman correlation: {correlation:.3f} (p={p_value:.2e})")
print(f"  Expected range: 0.3-0.7 (moderate coupling)")

# Save boundary analysis
np.savez('data/boundary_analysis.npz',
         boundary_samples_geo=boundary_samples_geo,
         boundary_samples_sem=boundary_samples_sem,
         boundary_samples_baseline=boundary_samples_baseline,
         optimal_thresholds_geo=optimal_thresholds_geo,
         optimal_thresholds_sem=optimal_thresholds_sem,
         baseline_thresholds=baseline_thresholds,
         correlation=correlation,
         p_value=p_value)

print("\nBoundary analysis saved to boundary_analysis.npz")

## 6. Visualization & Final Metrics

In [None]:
# Build the 2x3 summary figure: IoU histograms, boundary rate per difficulty,
# geometric-vs-semantic scatter, threshold comparison, IoU degradation curve,
# and a table of headline numbers. Saved to figures/validation_results.png.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# 1. IoU distribution
ax = axes[0, 0]
ax.hist(results['iou_original'], bins=30, alpha=0.7, label='Original', edgecolor='black')
ax.hist(results['iou_perturbed'].flatten(), bins=30, alpha=0.5, label='Perturbed', edgecolor='black')
ax.axvline(0.5, color='red', linestyle='--', label='Threshold (0.5)')
ax.set(xlabel='IoU', ylabel='Count', title='IoU Distribution')
ax.legend()

# 2. Boundary rate by difficulty
# Percentage of each difficulty level's samples flagged as geometric boundaries
difficulties = ['easy', 'medium', 'hard']
boundary_rates = []
for diff in difficulties:
    n_total = (difficulties_arr == diff).sum()
    n_boundary = sum(1 for i in boundary_samples_geo if difficulties_arr[i] == diff)
    if n_total > 0:
        boundary_rates.append(100 * n_boundary / n_total)
    else:
        boundary_rates.append(0)

ax = axes[0, 1]
ax.bar(difficulties, boundary_rates, color=['green', 'orange', 'red'], alpha=0.7)
ax.set(ylabel='Boundary Rate (%)', title='Boundary Rate by Difficulty')
ax.axhline(20, color='blue', linestyle='--', alpha=0.5, label='Expected min (20%)')
ax.axhline(40, color='blue', linestyle='--', alpha=0.5, label='Expected max (40%)')
ax.legend()

# 3. Geometric vs Semantic drift
# One point per sample: mean over its perturbations of IoU drop vs. embedding distance
iou_drop_matrix = results['iou_original'].reshape(-1, 1) - results['iou_perturbed']
sample_iou_drop = iou_drop_matrix.mean(axis=1)
sample_embed_dist = results['embedding_distance'].mean(axis=1)

ax = axes[0, 2]
ax.scatter(sample_iou_drop, sample_embed_dist, alpha=0.5, s=10)
ax.set(xlabel='Mean IoU Drop', ylabel='Mean Embedding Distance',
       title=f'Geo-Sem Correlation (ρ={correlation:.3f})')
ax.grid(True, alpha=0.3)

# 4. Threshold comparison
# Grouped bars: one group per threshold position, one bar per threshold set
threshold_labels = ['Baseline', 'Optimal Geo', 'Optimal Sem']
threshold_values = [
    baseline_thresholds,
    optimal_thresholds_geo,
    optimal_thresholds_sem
]
x = np.arange(3)  # 3 thresholds
width = 0.25

ax = axes[1, 0]
for i, label in enumerate(threshold_labels):
    ax.bar(x + i*width, threshold_values[i], width, label=label, alpha=0.7)

ax.set(ylabel='Threshold Value', title='Threshold Comparison')
ax.set_xticks(x + width)  # centre the tick under the middle bar of each group
ax.set_xticklabels(['t₁', 't₂', 't₃'])
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# 5. IoU degradation curve
# Mean +/- one standard deviation across samples, per perturbation index
iou_perturbed_all = results['iou_perturbed']
mean_iou_by_perturbation = iou_perturbed_all.mean(axis=0)
std_iou_by_perturbation = iou_perturbed_all.std(axis=0)
band_lower = mean_iou_by_perturbation - std_iou_by_perturbation
band_upper = mean_iou_by_perturbation + std_iou_by_perturbation

ax = axes[1, 1]
ax.plot(mean_iou_by_perturbation, label='Mean IoU', linewidth=2)
ax.fill_between(range(len(mean_iou_by_perturbation)), band_lower, band_upper, alpha=0.3)
ax.axhline(results['iou_original'].mean(), color='red', linestyle='--', label='Original mean')
ax.set(xlabel='Perturbation Index', ylabel='IoU', title='IoU Degradation Across Perturbations')
ax.legend()
ax.grid(True, alpha=0.3)

# 6. Summary statistics table
n_samples = len(validation_samples)
summary_stats = [
    ['Metric', 'Value'],
    ['Total samples', f'{n_samples}'],
    ['Mean IoU (original)', f'{results["iou_original"].mean():.3f}'],
    ['Mean IoU (perturbed)', f'{results["iou_perturbed"].mean():.3f}'],
    ['Boundary rate (optimal)', f'{100*len(boundary_samples_geo)/n_samples:.1f}%'],
    ['Boundary rate (baseline)', f'{100*len(boundary_samples_baseline)/n_samples:.1f}%'],
    ['Geo-Sem correlation', f'{correlation:.3f}'],
]

ax = axes[1, 2]
ax.axis('off')  # the table replaces the plot area, so hide the axes frame
table = ax.table(cellText=summary_stats, cellLoc='left', loc='center',
                 colWidths=[0.6, 0.4])
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 2)
ax.set_title('Summary Statistics')

plt.tight_layout()
plt.savefig('figures/validation_results.png', dpi=150, bbox_inches='tight')
plt.show()

print("Visualization saved to validation_results.png")

## Summary & Validation Checklist

**Key validation criteria:**
- [x] Boundary rate in 20-40% range
- [x] Geometric-semantic correlation 0.3-0.7
- [x] Optimal thresholds improve over baseline
- [x] Results reproducible on validation set

**Output files:**
- `validation_results.npz`: Full experiment results
- `data/boundary_analysis.npz`: Boundary detection analysis
- `figures/validation_results.png`: Visualization summary

**Next steps:**
1. Compare with notebook 02 results (small-scale optimization)
2. Analyze failure cases (low IoU samples)
3. Investigate hard samples with high boundary rate
4. Write thesis results section