# 01 - Download Ground Truth Labels

Downloads known refugee camp locations and generates categorized negative samples.

**Key improvements over naive approach:**
- **Grid tiling:** Each camp expands to a 3x3 grid (9 tiles), multiplying the positive dataset by 9 (the grid size is read from `grid.size` in the config)
- **Categorized negatives:** rural, urban, barren (enables error analysis by type)

**Sources:**
1. OpenStreetMap via Overpass API
2. UNHCR camp location data (manual CSV)

**Output:** `data/labels/camps.geojson`, `data/labels/negatives.geojson`, `data/labels/all_locations.csv`

In [None]:
import sys
# Put the parent directory first on the import path so that the `src` package
# imported below can be resolved (presumably the notebook lives one level
# below the project root - TODO confirm).
sys.path.insert(0, '..')

# Project helpers used by the cells of this notebook.
from src.utils import (
    load_config,
    query_osm_camps_all,
    load_unhcr_camps,
    merge_and_deduplicate,
    expand_camps_to_grid,
    generate_negatives,
    df_to_geojson,
)

In [None]:
# Load the run configuration. Keys read in this cell: `train_countries`,
# `test_countries`, `grid.size` and `negative_categories`.
config = load_config('../configs/default.yaml')
# Train and test countries are concatenated into one sequence of countries.
all_countries = config['train_countries'] + config['test_countries']
print(f"Countries: {all_countries}")
# Tiles per camp is reported as the square of the configured grid size.
print(f"Grid size: {config['grid']['size']}x{config['grid']['size']} = {config['grid']['size']**2} tiles per camp")
print(f"Negative categories: {config['negative_categories']}")

## 1. Query OpenStreetMap

In [None]:
# Collect OSM camp records for every configured country.
# NOTE(review): presumably this performs Overpass API requests and therefore
# needs network access - confirm in src.utils.
osm_camps = query_osm_camps_all(all_countries)
print(f"\nTotal OSM camps: {len(osm_camps)}")
# Trailing expression: the notebook displays the first rows as the cell output.
osm_camps.head()

## 2. Load UNHCR Data (optional)

Download camp locations from https://data.unhcr.org/ and save them as a CSV file at `data/labels/unhcr_camps.csv`. If the file is missing, the notebook continues with OSM data only.

In [None]:
from pathlib import Path

# Optional, manually downloaded UNHCR export. The notebook keeps working
# without it by falling back to the OSM results alone.
unhcr_path = Path('../data/labels/unhcr_camps.csv')

# `sources` is the list of tables handed to the merge step: OSM is always
# included, UNHCR only when its CSV is present.
if unhcr_path.exists():
    unhcr_camps = load_unhcr_camps(unhcr_path)
    print(f"UNHCR camps loaded: {len(unhcr_camps)}")
    sources = [osm_camps, unhcr_camps]
else:
    print("UNHCR CSV not found. Using OSM data only.")
    # Plain string literal: nothing is interpolated, so no f-prefix is needed.
    print("To add UNHCR data, download from https://data.unhcr.org/")
    sources = [osm_camps]

## 3. Merge and Deduplicate

In [None]:
# Merge every table in `sources` into a single camp table.
# NOTE(review): buffer_km=1.0 is presumably the distance under which two
# records count as the same camp - confirm in src.utils.
camps = merge_and_deduplicate(sources, buffer_km=1.0)

# Add tile IDs and label
# IDs are the zero-padded row positions after merging (camp_0000, camp_0001, ...).
# The visualization cell looks the first ID up in camp_tiles['parent_camp'].
camps['label'] = 'camp'
camps['tile_id'] = [f'camp_{i:04d}' for i in range(len(camps))]

print(f"\nUnique camps: {len(camps)}")
# Per-country breakdown of the merged camps.
print(camps['country'].value_counts())

## 4. Expand Camps to 3x3 Grid

Each camp becomes 9 tiles. This multiplies the positive dataset by 9 and captures
the full spatial extent of each camp.

In [None]:
# Turn each camp row into its grid of tiles.
# NOTE(review): the grid is presumably sized by config['grid']['size'] - confirm
# in src.utils. Later cells read the `country`, `lon`, `lat`, `parent_camp`
# and `is_center` columns of the result.
camp_tiles = expand_camps_to_grid(camps, config)
print(f"\nCamp tiles per country:")
print(camp_tiles['country'].value_counts())

## 5. Generate Categorized Negatives

Three explicit categories:
- **Rural:** dispersed rural areas
- **Urban:** near known cities (formal dense settlements)
- **Barren:** desert, bare soil, savanna

Each negative keeps its category in the `neg_category` column, so model errors can later be broken down by negative type.

In [None]:
# Sample the negative (non-camp) locations. The camps are passed in alongside
# the config (presumably so negatives can be placed away from them - confirm
# in src.utils).
negatives = generate_negatives(camps, config)

# Add tile IDs: "neg_<category>_<position>", with the position zero-padded to
# four digits. Only the category column is needed, so iterate over it directly
# instead of materialising a Series per row with iterrows().
negatives['tile_id'] = [
    f"neg_{category}_{position:04d}"
    for position, category in enumerate(negatives['neg_category'])
]

print(f"\nTotal negatives: {len(negatives)}")
print("\nBy category:")
print(negatives['neg_category'].value_counts())
print("\nBy country:")
print(negatives['country'].value_counts())

## 6. Save Everything

In [None]:
from pathlib import Path

import pandas as pd

# Make sure the output folder exists before writing: DataFrame.to_csv does not
# create missing parent directories, so a checkout without data/labels would
# otherwise fail at the CSV step.
Path('../data/labels').mkdir(parents=True, exist_ok=True)

# Save GeoJSON: one file for the camp tiles, one for the negatives.
df_to_geojson(camp_tiles, '../data/labels/camps.geojson')
df_to_geojson(negatives, '../data/labels/negatives.geojson')

# Combined CSV for downstream notebooks: camp tiles first, then negatives,
# renumbered with a fresh 0..n-1 index that is not written to the file.
combined = pd.concat([camp_tiles, negatives], ignore_index=True)
combined.to_csv('../data/labels/all_locations.csv', index=False)

print(f"\nTotal dataset locations: {len(combined)}")
print(f"  Camp tiles: {len(camp_tiles)} ({len(camps)} camps x {config['grid']['size']**2} grid)")
print(f"  Negative tiles: {len(negatives)}")
# max(..., 1) keeps the ratio printable when there are no camp tiles at all.
print(f"  Ratio camp:negative = 1:{len(negatives)/max(len(camp_tiles),1):.1f}")

## 7. Visualization

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels: an overview of every location and a close-up of
# the tile grid belonging to the first camp.
fig, (overview_ax, detail_ax) = plt.subplots(1, 2, figsize=(18, 7))

# --- Overview panel -------------------------------------------------------
# Negatives are drawn first, one colour per category, so that the camp tiles
# plotted afterwards sit on top of them.
category_colors = {'rural': 'green', 'urban': 'blue', 'barren': 'orange'}
for cat, shade in category_colors.items():
    subset = negatives[negatives['neg_category'] == cat]
    overview_ax.scatter(
        subset['lon'],
        subset['lat'],
        c=shade,
        s=5,
        alpha=0.3,
        label=f'Neg: {cat}',
    )
overview_ax.scatter(
    camp_tiles['lon'],
    camp_tiles['lat'],
    c='red',
    s=15,
    alpha=0.7,
    label='Camp tiles',
)
overview_ax.set_xlabel('Longitude')
overview_ax.set_ylabel('Latitude')
overview_ax.set_title('All Locations (Camps + Categorized Negatives)')
# Enlarge the legend markers; the plotted points themselves are tiny.
overview_ax.legend(markerscale=3)

# --- Detail panel ---------------------------------------------------------
# Use the first camp's tile ID to select its tiles; the panel stays empty
# when there are no camps (or the ID is empty).
first_camp = None if len(camps) == 0 else camps.iloc[0]['tile_id']
if first_camp:
    camp_grid = camp_tiles[camp_tiles['parent_camp'] == first_camp]
    center_flag = camp_grid['is_center']
    center_tile = camp_grid[center_flag.eq(True)]
    outer_tiles = camp_grid[center_flag.eq(False)]
    # The higher zorder keeps the centre star above the surrounding tiles.
    detail_ax.scatter(
        outer_tiles['lon'],
        outer_tiles['lat'],
        c='salmon',
        s=100,
        label='Peripheral tiles',
        zorder=2,
    )
    detail_ax.scatter(
        center_tile['lon'],
        center_tile['lat'],
        c='red',
        s=200,
        marker='*',
        label='Center tile',
        zorder=3,
    )
    detail_ax.set_title(f'Grid Detail: {first_camp}')
    detail_ax.legend()
    detail_ax.set_xlabel('Longitude')
    detail_ax.set_ylabel('Latitude')

# Write the figure to disk before showing it.
fig.tight_layout()
fig.savefig('../data/labels/locations_map.png', dpi=150)
plt.show()