# 06 - Validation Analysis

Additional validation of detected candidates:
1. Cross-reference with OSM/UNHCR databases
2. Distance to nearest road
3. Temporal analysis (did the settlement appear recently?)
4. Generate all paper figures

**Input:** Detection results from notebook 05, trained model  
**Output:** Validation metrics, paper-ready figures

In [None]:
# --- Colab: uncomment the Drive path below and comment out the local one ---
# PROJECT_DIR = '/content/drive/MyDrive/sentinel-refugee-detection'

# --- Local run: project root is one level above the working directory ---
PROJECT_DIR = ".."

In [None]:
import sys
sys.path.insert(0, PROJECT_DIR)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from src.utils import load_config, _haversine_km

In [None]:
# Both inputs live under PROJECT_DIR; name the paths once so they are easy to
# spot and change.
config_path = f'{PROJECT_DIR}/configs/default.yaml'
locations_path = f'{PROJECT_DIR}/data/labels/all_locations.csv'

config = load_config(config_path)

# Locations table produced earlier in the pipeline. The prediction columns
# from notebook 05 still have to be merged in; until then this notebook is a
# template.
locations = pd.read_csv(locations_path)

## 1. Cross-Reference with Known Databases

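For each high-confidence detection, check whether it lies within a chosen radius (2 km by default) of a known camp listed in OSM or UNHCR; detections with no known camp inside that radius are candidate new sites.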

In [None]:
def cross_reference_detections(detections_df, known_camps_df, radius_km=2.0):
    """Annotate each detection with its closest known camp.

    Args:
        detections_df: DataFrame with 'lat' and 'lon' columns, one row per
            detection.
        known_camps_df: DataFrame with 'lat', 'lon' and 'name' columns, one
            row per known camp.
        radius_km: A detection counts as matched when its nearest known camp
            is strictly closer than this many kilometres.

    Returns:
        A copy of detections_df (the input is not modified) with three
        added columns:
        - nearest_known_km: distance to the nearest known camp
          (np.inf when known_camps_df has no rows)
        - matched_known: True if nearest_known_km < radius_km
        - nearest_known_name: name of the nearest known camp
          ('none' when known_camps_df has no rows)
    """
    camp_positions = known_camps_df[['lat', 'lon']].values
    camp_labels = known_camps_df['name'].values

    def _closest_camp(det_lat, det_lon):
        # Distance from this detection to every known camp, in camp order.
        gaps = [_haversine_km(det_lat, det_lon, camp[0], camp[1])
                for camp in camp_positions]
        if not gaps:
            # No known camps at all: nothing can match.
            return np.inf, 'none'
        best = np.argmin(gaps)
        return gaps[best], camp_labels[best]

    closest = [_closest_camp(rec['lat'], rec['lon'])
               for _, rec in detections_df.iterrows()]
    km_to_nearest = [km for km, _ in closest]
    name_of_nearest = [label for _, label in closest]

    annotated = detections_df.copy()
    annotated['nearest_known_km'] = km_to_nearest
    annotated['matched_known'] = [km < radius_km for km in km_to_nearest]
    annotated['nearest_known_name'] = name_of_nearest
    return annotated

# Example usage (will work once you have predictions from notebook 05):
# high_conf_detections = predictions[predictions['prob'] >= 0.8]
# known_camps = locations[locations['source'].isin(['osm', 'unhcr'])]
# validated = cross_reference_detections(high_conf_detections, known_camps)
# 
# n_new = (~validated['matched_known']).sum()
# print(f"Candidate NEW detections (not in any database): {n_new}")

## 2. Distance to Nearest Road (from OSM)

In [None]:
import requests

def get_nearest_road_distance(lat, lon, radius_m=5000):
    """Return the distance in km from (lat, lon) to the nearest OSM road.

    Queries the public Overpass API for every way tagged ``highway`` within
    ``radius_m`` metres of the point, then takes the minimum haversine
    distance over all vertices of all returned ways. The result is therefore
    the distance to the nearest mapped road *vertex*, which approximates the
    true point-to-road distance.

    Args:
        lat: Latitude of the query point in decimal degrees.
        lon: Longitude of the query point in decimal degrees.
        radius_m: Search radius in metres passed to Overpass ``around``.

    Returns:
        Distance in kilometres, or None when no road is found within the
        radius or the request fails (a warning is emitted in the latter case
        so that failures are not mistaken for "no road nearby").
    """
    import warnings

    # `out geom` returns all matching ways with their vertex coordinates.
    # Limiting the output to one element would yield an arbitrary way inside
    # the radius, not the closest one.
    query = f"""
    [out:json][timeout:30];
    way["highway"](around:{radius_m},{lat},{lon});
    out geom;
    """
    try:
        resp = requests.get(
            'https://overpass-api.de/api/interpreter',
            params={'data': query}, timeout=60,
        )
        # Overpass answers rate-limited or overloaded requests with an HTTP
        # error status and a non-JSON body; surface that explicitly.
        resp.raise_for_status()
        elements = resp.json().get('elements', [])
    except (requests.RequestException, ValueError) as err:
        warnings.warn(f"Overpass query failed for ({lat}, {lon}): {err}")
        return None

    vertex_dists = [
        _haversine_km(lat, lon, node['lat'], node['lon'])
        for way in elements
        for node in (way.get('geometry') or [])
        if node  # skip null entries for vertices without coordinates
    ]
    if not vertex_dists:
        return None  # No road found within radius
    return min(vertex_dists)

# Example: compute road distances for all high-confidence detections
# import time
# road_dists = []
# for _, row in high_conf_detections.iterrows():
#     d = get_nearest_road_distance(row['lat'], row['lon'])
#     road_dists.append(d)
#     time.sleep(1)  # Rate limiting
# high_conf_detections['road_dist_km'] = road_dists

## 3. Paper Figures

In [None]:
# Figure 1: Study area map with train/test country split.
# Negatives are drawn first as a faint background layer; camps are overlaid
# and coloured by whether their country is in the train or the test split.
fig, ax = plt.subplots(1, 1, figsize=(10, 8))

train_locs = locations[locations['country'].isin(config['train_countries'])]
test_locs = locations[locations['country'].isin(config['test_countries'])]

camps = locations[locations['label'] == 'camp']
negs = locations[locations['label'] == 'non-camp']

ax.scatter(negs['lon'], negs['lat'], c='lightgray', s=5, alpha=0.3, label='Negative', zorder=1)

# Reuse the per-split frames rather than filtering by country a second time.
train_camps = train_locs[train_locs['label'] == 'camp']
test_camps = test_locs[test_locs['label'] == 'camp']

ax.scatter(train_camps['lon'], train_camps['lat'], c='blue', s=30,
           alpha=0.8, label='Train camps', zorder=2)
ax.scatter(test_camps['lon'], test_camps['lat'], c='red', s=30,
           alpha=0.8, label='Test camps', zorder=2)

ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Study Area: Train (blue) vs Test (red) Countries')
ax.legend()
plt.tight_layout()

# savefig does not create missing directories, so make sure paper/ exists.
fig1_path = Path(PROJECT_DIR) / 'paper' / 'fig1_study_area.png'
fig1_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(fig1_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Figure 2: Method overview (placeholder - the diagram is drawn in an
# external tool; this cell only prints a reminder of the pipeline stages).
fig2_reminder = (
    "Figure 2: Create method overview diagram externally",
    "  Sentinel-2 tile -> 6-band crop -> ResNet-18 -> camp/non-camp",
)
print(*fig2_reminder, sep="\n")

In [None]:
# Figure 3: Model comparison bar chart (template).
# One group of four bars (precision, recall, F1, AUC) per model.
# Fill in actual values after training.

models = ['NDVI Threshold', 'Random Forest', 'ResNet-18']
# Replace with actual values (one entry per model, same order as `models`):
precision_vals = [0.0, 0.0, 0.0]  # TODO
recall_vals = [0.0, 0.0, 0.0]     # TODO
f1_vals = [0.0, 0.0, 0.0]         # TODO
auc_vals = [0.0, 0.0, 0.0]        # TODO

x = np.arange(len(models))
width = 0.2

# (legend label, values, colour) for each metric, in left-to-right bar order.
metric_series = [
    ('Precision', precision_vals, '#2196F3'),
    ('Recall', recall_vals, '#4CAF50'),
    ('F1', f1_vals, '#FF9800'),
    ('AUC', auc_vals, '#9C27B0'),
]

fig, ax = plt.subplots(figsize=(8, 5))
for slot, (metric_label, metric_vals, metric_color) in enumerate(metric_series):
    # Centre the group of bars on each tick: offsets -1.5w, -0.5w, +0.5w, +1.5w.
    offset = (slot - (len(metric_series) - 1) / 2) * width
    ax.bar(x + offset, metric_vals, width, label=metric_label, color=metric_color)

ax.set_ylabel('Score')
ax.set_title('Model Comparison (Test Set - Unseen Countries)')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()
ax.set_ylim(0, 1)

plt.tight_layout()

# savefig does not create missing directories, so make sure paper/ exists.
fig3_path = Path(PROJECT_DIR) / 'paper' / 'fig3_model_comparison.png'
fig3_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(fig3_path, dpi=300, bbox_inches='tight')
plt.show()

## 4. Summary Statistics for Paper

In [None]:
# Console summary of the dataset and imagery settings to quote in the paper.
banner = "=" * 60
print(banner)
print("SUMMARY FOR PAPER")
print(banner)
print()

print("Dataset:")
print(f"  Total locations: {len(locations)}")
print(f"  Camps: {(locations['label'] == 'camp').sum()}")
print(f"  Negatives: {(locations['label'] == 'non-camp').sum()}")
print(f"  Train countries: {config['train_countries']}")
print(f"  Test countries: {config['test_countries']}")
print()

print("Sentinel-2 parameters:")
print(f"  Bands: {config['bands']}")
# Ground footprint of a tile = pixels per side * configured resolution.
print(f"  Tile size: {config['tile_size']}px ({config['tile_size'] * config['resolution']}m)")
print(f"  Date range: {config['date_range']}")
print()

# Results that are only known once training and detection have been run.
print("Fill in after training:")
for pending_item in (
    "ResNet-18 test precision",
    "ResNet-18 test recall",
    "ResNet-18 test AUC",
    "Candidate new detections",
    "Detections not in OSM",
    "Detections not in UNHCR",
):
    print(f"  {pending_item}: ___")