# 01 - Download Ground Truth Labels

Downloads known refugee camp locations and generates categorized negative samples.

**Data sources (layered — OSM alone is insufficient):**
1. **UNHCR/HDX** — Official refugee camp datasets (Ethiopia shapefile, Syria CSV)
2. **OpenStreetMap** — Overpass API for tagged camps
3. **Known camps** — Literature-verified locations (Chad, South Sudan, Yemen)

**Key design decisions:**
- **Grid tiling:** 5 tiles per camp (center + 4 adjacent, corners excluded)
- **Categorized negatives:** rural, urban, barren, informal (hard negatives)
- **Mini-test mode:** Validate with 10 camps before full download

**Output:** `data/labels/camps.geojson`, `data/labels/negatives.geojson`, `data/labels/all_locations.csv`

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
from pathlib import Path

from src.utils import (
    load_config,
    query_osm_camps_all,
    load_unhcr_camps,
    merge_and_deduplicate,
    expand_camps_to_grid,
    generate_negatives,
    df_to_geojson,
    print_dataset_summary,
)

In [None]:
# Load the shared experiment configuration and echo the settings that drive
# this notebook, so the run output is self-describing.
config = load_config('../configs/default.yaml')

# Camps are collected for every country, whether it is used for training or testing.
all_countries = config['train_countries'] + config['test_countries']
print(f"Countries: {all_countries}")

# Grid settings: corner tiles are reported as included only when `include_corners` is set.
grid_cfg = config['grid']
grid_size = grid_cfg['size']
corner_mode = 'included' if grid_cfg.get('include_corners', False) else 'EXCLUDED'
print(f"Grid: {grid_size}x{grid_size}, corners={corner_mode}")

print(f"Negative categories: {config['negative_categories']}")

# The mini-test section is optional; when it is absent the mode counts as disabled.
mini_test_cfg = config.get('mini_test', {})
mini_test_enabled = mini_test_cfg.get('enabled', False)
print(f"Mini-test mode: {mini_test_enabled}")
if mini_test_enabled:
    print(f"  Will use only {mini_test_cfg['n_camps']} camps for initial validation")

## 1. Query OpenStreetMap

OSM has **incomplete** coverage of refugee camps. Approximate number of camps tagged in OSM for each of our 5 countries:
- Syria: ~30 (many are Turkish border camps)
- South Sudan: ~3 settlements (11 zones)
- Chad: 0
- Ethiopia: 1
- Yemen: 5

This is why we **must** supplement with UNHCR/HDX data and known camp coordinates.

In [None]:
# Query OSM for every configured country, report how many camps came back,
# and preview the first rows as the cell output.
osm_camps = query_osm_camps_all(all_countries)
n_osm_camps = len(osm_camps)
print(f"\nTotal OSM camps: {n_osm_camps}")
osm_camps.head()

## 2. Load UNHCR/HDX + Known Camp Data

Multiple supplementary sources fill OSM gaps:
- **Ethiopia:** UNHCR HDX shapefile (26 active camps)
- **Chad:** Literature-verified camps (19 camps)  
- **South Sudan:** Known settlements (17 camps)
- **Syria:** IDP camps inside Syria + Turkey border camps from HDX
- **Yemen:** Known refugee/IDP sites (6 camps)

Download these into `data/labels/` first by running `python3 scripts/download_labels.py`, or use the pre-built CSVs. Any file that is missing is skipped.

In [None]:
# Gather every available label source. Order is preserved: OSM first, then each
# optional CSV in the order below. A file that is not on disk is silently skipped.
labels_dir = Path('../data/labels')
sources = [osm_camps]

# Ethiopia UNHCR camps (from HDX shapefile)
eth_path = labels_dir / 'ethiopia_unhcr_camps.csv'
if eth_path.exists():
    eth_camps = pd.read_csv(eth_path)
    print(f"Ethiopia UNHCR camps: {len(eth_camps)}")
    sources.append(eth_camps)

# Chad known camps
chad_path = labels_dir / 'chad_known_camps.csv'
if chad_path.exists():
    chad_camps = pd.read_csv(chad_path)
    print(f"Chad known camps: {len(chad_camps)}")
    sources.append(chad_camps)

# Syria HDX file: only the rows located in Turkey (border camps) are used
syria_hdx_path = labels_dir / 'syria_hdx.csv'
if syria_hdx_path.exists():
    syria_hdx = pd.read_csv(syria_hdx_path)
    in_turkey = syria_hdx['Country'] == 'Turkey'
    turkey_rows = syria_hdx.loc[in_turkey]
    # Rebuild the frame with this notebook's column names (name/lat/lon);
    # the Turkish border camps are filed under country 'syria'.
    turkey_camps = pd.DataFrame({
        'name': turkey_rows['Name'].values,
        'lat': turkey_rows['Lat'].values,
        'lon': turkey_rows['Long'].values,
        'country': 'syria',
        'source': 'unhcr_hdx',
    })
    print(f"Turkey/Syria border camps: {len(turkey_camps)}")
    sources.append(turkey_camps)

# Known camps for countries with poor OSM coverage
known_path = labels_dir / 'known_camps_supplementary.csv'
if known_path.exists():
    known_camps = pd.read_csv(known_path)
    print(f"Supplementary known camps: {len(known_camps)}")
    sources.append(known_camps)

# Legacy UNHCR CSV (if user downloaded manually); parsed by the project loader
unhcr_path = labels_dir / 'unhcr_camps.csv'
if unhcr_path.exists():
    unhcr_camps = load_unhcr_camps(unhcr_path)
    print(f"UNHCR CSV camps: {len(unhcr_camps)}")
    sources.append(unhcr_camps)

n_sources = len(sources)
print(f"\nTotal sources: {n_sources}")

## 3. Merge and Deduplicate

Sources are merged using a 2 km deduplication buffer (a single camp can span more than 1 km).
Expect ~102 unique camps across the 5 countries.

In [None]:
def _camp_tile_ids(camp_df):
    """Return one ``camp_<country>_<NNN>`` ID per row, numbered by row position."""
    return [f"camp_{country}_{position:03d}"
            for position, country in enumerate(camp_df['country'])]


# Merge all sources using a 2 km deduplication buffer.
camps = merge_and_deduplicate(sources, buffer_km=2.0)

# Add tile IDs and label
camps['label'] = 'camp'
camps['tile_id'] = _camp_tile_ids(camps)

# Per-country summary
print(f"\nUnique camps: {len(camps)}")
print(camps['country'].value_counts())

# Mini-test mode: select N camps with balanced country representation
mini_test_cfg = config.get('mini_test', {})
if mini_test_cfg.get('enabled', False):
    n_mini = mini_test_cfg['n_camps']
    countries = sorted(camps['country'].unique())
    if not countries:
        # No camps were found at all: dividing by the number of countries would
        # raise ZeroDivisionError (and pd.concat rejects an empty list), so
        # report the situation and leave the empty table untouched.
        print("\n*** MINI-TEST MODE: no camps available, nothing to sample ***")
    else:
        # Equal share per country, but always at least one camp from each.
        per_country = max(1, n_mini // len(countries))
        mini_camps = []
        for country in countries:
            subset = camps[camps['country'] == country]
            # A country with fewer camps than its share contributes all of them.
            n = min(per_country, len(subset))
            # Fixed seed keeps the mini-test selection reproducible.
            mini_camps.append(subset.sample(n=n, random_state=42))
        camps = pd.concat(mini_camps, ignore_index=True)
        # Renumber so tile IDs match row positions in the reduced table.
        camps['tile_id'] = _camp_tile_ids(camps)
        print(f"\n*** MINI-TEST MODE: {len(camps)} camps ***")
        print(camps[['name', 'country', 'source']].to_string(index=False))

## 4. Expand Camps to Grid

With `include_corners=false` (default): 5 tiles per camp (center + 4 adjacent).
Corner tiles are excluded because, at 1.81 km from the center, less than 10% of a typical camp falls in them.

In [None]:
# Turn each camp into its grid of tiles (layout is driven by the config),
# then show how many tiles each country ended up with.
camp_tiles = expand_camps_to_grid(camps, config)
tiles_per_country = camp_tiles['country'].value_counts()
print("\nCamp tiles per country:")
print(tiles_per_country)

## 5. Generate Categorized Negatives

Four explicit categories:
- **Rural:** dispersed rural areas
- **Urban:** near known cities (formal dense settlements)
- **Barren:** desert, bare soil, savanna
- **Informal:** dense self-built urban areas NOT labeled as camps (hard negatives)

`negative_ratio` = negatives PER CAMP per category (not per tile).

In [None]:
# Sample the negative (non-camp) locations for the selected camps.
negatives = generate_negatives(camps, config)

# Tile IDs encode the negative category plus the row position (zero-padded to 4 digits).
negatives['tile_id'] = [
    f"neg_{category}_{position:04d}"
    for position, category in enumerate(negatives['neg_category'])
]

print(f"\nTotal negatives: {len(negatives)}")

# Same breakdown twice: first by negative category, then by country.
for heading, column in (("By category:", 'neg_category'), ("By country:", 'country')):
    print(f"\n{heading}")
    print(negatives[column].value_counts())

## 6. Save Everything

In [None]:
import pandas as pd
from pathlib import Path

# Exact dataset counts — no estimation
print_dataset_summary(camp_tiles, negatives, config)

# Every input file under data/labels is optional, so the folder may not exist
# yet. DataFrame.to_csv does not create missing parent directories, so create
# the folder up front (no-op when it is already there).
Path('../data/labels').mkdir(parents=True, exist_ok=True)

# Save GeoJSON
df_to_geojson(camp_tiles, '../data/labels/camps.geojson')
df_to_geojson(negatives, '../data/labels/negatives.geojson')

# Combined CSV for downstream notebooks: camp tiles first, then negatives,
# with a fresh 0..N-1 index that is not written to the file.
combined = pd.concat([camp_tiles, negatives], ignore_index=True)
combined.to_csv('../data/labels/all_locations.csv', index=False)
print(f"\nSaved to data/labels/all_locations.csv")

## 7. Visualization

In [None]:
import matplotlib.pyplot as plt

# Two panels side by side: an overview of every location, and a close-up of
# the tile grid belonging to the first camp.
fig, axes = plt.subplots(1, 2, figsize=(18, 7))
overview_ax, detail_ax = axes

# --- Left panel: all locations, negatives coloured by category ---
negative_colors = {
    'rural': 'green',
    'urban': 'blue',
    'barren': 'orange',
    'informal': 'purple',
}
for category, color in negative_colors.items():
    in_category = negatives[negatives['neg_category'] == category]
    if in_category.empty:
        # Skip categories with no samples so they do not show up in the legend.
        continue
    overview_ax.scatter(in_category['lon'], in_category['lat'],
                        c=color, s=5, alpha=0.3, label=f'Neg: {category}')
# Camp tiles are drawn last, larger and more opaque than the negatives.
overview_ax.scatter(camp_tiles['lon'], camp_tiles['lat'],
                    c='red', s=15, alpha=0.7, label='Camp tiles')
overview_ax.set_xlabel('Longitude')
overview_ax.set_ylabel('Latitude')
overview_ax.set_title('All Locations (Camps + Categorized Negatives)')
overview_ax.legend(markerscale=3)

# --- Right panel: tile grid of the first camp (left blank when there are no camps) ---
first_camp = None
if len(camps) > 0:
    first_camp = camps['tile_id'].iloc[0]

if first_camp:
    # Tiles are linked to their camp through the `parent_camp` column.
    grid = camp_tiles[camp_tiles['parent_camp'] == first_camp]
    is_center = grid['is_center'].eq(True)
    is_camp_tile = grid['label'] == 'camp'
    center = grid[is_center]
    adjacent = grid[grid['is_center'].eq(False) & is_camp_tile]
    context = grid[grid['label'] == 'camp_context']

    # zorder stacks the layers: context at the bottom, center star on top.
    if len(context) > 0:
        detail_ax.scatter(context['lon'], context['lat'], c='lightyellow', s=100,
                          edgecolors='orange', label='Context (excluded)', zorder=1)
    detail_ax.scatter(adjacent['lon'], adjacent['lat'], c='salmon', s=100,
                      label='Adjacent tiles', zorder=2)
    detail_ax.scatter(center['lon'], center['lat'], c='red', s=200, marker='*',
                      label='Center tile', zorder=3)
    detail_ax.set_title(f'Grid Detail: {first_camp}')
    detail_ax.legend()
    detail_ax.set_xlabel('Longitude')
    detail_ax.set_ylabel('Latitude')

plt.tight_layout()
plt.savefig('../data/labels/locations_map.png', dpi=150)
plt.show()