# 00 - Pre-Training Visual Validation (GATE)

**Run this BEFORE training.** This is a go/no-go check.

If you cannot visually distinguish camps from non-camps at 10m resolution,
the CNN won't be able to either. Stop and reconsider before wasting compute.

## Checklist
- [ ] Can you see repetitive tent/shelter structures in camp tiles?
- [ ] Do camps look different from rural villages?
- [ ] Do camps look different from dense urban areas?
- [ ] Are the NDVI/NDBI values meaningfully different?
- [ ] Is the dataset large enough? (target: >1000 positive tiles)

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd

from src.utils import load_config

In [None]:
# Project configuration and the directory holding the exported tile arrays.
config = load_config('../configs/default.yaml')
tile_dir = Path('../data/sentinel2')

# Every .npy file counts towards the total; the per-category lists are
# sorted by file name so later cells always pick the same tiles.
all_files = list(tile_dir.glob('*.npy'))
camp_files = sorted(tile_dir.glob('camp_*.npy'))
neg_rural = sorted(tile_dir.glob('neg_rural_*.npy'))
neg_urban = sorted(tile_dir.glob('neg_urban_*.npy'))
neg_barren = sorted(tile_dir.glob('neg_barren_*.npy'))
neg_informal = sorted(tile_dir.glob('neg_informal_*.npy'))

# Assemble the whole report first, then emit it with a single print call.
size_report = [
    '=== DATASET SIZE CHECK ===',
    f'Total tiles: {len(all_files)}',
    f'  Camp tiles: {len(camp_files)}',
    f'  Neg rural: {len(neg_rural)}',
    f'  Neg urban: {len(neg_urban)}',
    f'  Neg barren: {len(neg_barren)}',
    f'  Neg informal: {len(neg_informal)}',
    '',
]
print('\n'.join(size_report))

# Verdict on the number of positive (camp) tiles, checked from the top down.
n_camp_tiles = len(camp_files)
if n_camp_tiles >= 500:
    print('GOOD: >500 camp tiles. Solid dataset size.')
elif n_camp_tiles >= 100:
    print('OK but tight: 100-500 camp tiles. Transfer learning should help.')
else:
    print('WARNING: <100 camp tiles. Need more data or more countries.')

## 1. RGB Comparison: Camps vs Negatives

**Look for:** Dense, regular patterns of small structures (tents/shelters) in camp tiles.
Camps often appear as white/gray clusters with regular spacing.

In [None]:
def show_rgb(file_path, ax, title=None):
    """Display an RGB composite of a tile on the given axes.

    The tile is loaded with ``np.load`` and its first three channels are
    moved to the last axis for ``imshow``. Brightness is stretched so that
    the 98th percentile of the finite values maps to 1.0, and the result is
    always clipped to [0, 1].

    Args:
        file_path: Path to a ``.npy`` tile stored channels-first.
        ax: Axes-like object to draw on.
        title: Optional title shown above the image.
    """
    tile = np.load(file_path)
    # First three channels are taken as R, G, B (assumes the tile was
    # exported in that order - TODO confirm against the export script).
    rgb = tile[:3].transpose(1, 2, 0).astype(float)

    # Ignore NaN/inf pixels when choosing the stretch: a single NaN would
    # otherwise make the percentile NaN and silently disable normalisation.
    finite = np.isfinite(rgb)
    p98 = np.percentile(rgb[finite], 98) if finite.any() else 0.0
    rgb = np.where(finite, rgb, 0.0)  # draw invalid pixels as black

    if p98 > 0:
        rgb = rgb / p98
    # Clip unconditionally so imshow never receives out-of-range floats.
    rgb = np.clip(rgb, 0, 1)

    ax.imshow(rgb)
    if title:
        ax.set_title(title, fontsize=8)
    ax.axis('off')

# Show up to 5 tiles per category: camps in the first row, then one row per
# negative category that actually has tiles.
categories = [
    ('CAMPS', camp_files),
    ('Rural neg', neg_rural),
    ('Urban neg', neg_urban),
    ('Barren neg', neg_barren),
    ('Informal neg', neg_informal),
]

# Categories without any tiles do not get a row.
non_empty = [(cat_name, files) for cat_name, files in categories if files]
n_rows = len(non_empty)
# Column count follows the largest category (capped at 5), so a missing or
# tiny camp set no longer collapses the grid to zero columns.
n_show = min(5, max((len(files) for _, files in non_empty), default=0))

if n_rows == 0:
    print('No tiles found - nothing to display.')
else:
    # squeeze=False keeps `axes` 2-D even for a single row or column, so
    # axes[row, col] indexing is always valid.
    fig, axes = plt.subplots(n_rows, n_show, figsize=(n_show * 3, n_rows * 3),
                             squeeze=False)

    for row, (cat_name, files) in enumerate(non_empty):
        for col in range(n_show):
            if col < len(files):
                show_rgb(files[col], axes[row, col], files[col].stem)
            else:
                # Category has fewer tiles than columns: hide the empty frame.
                axes[row, col].axis('off')
        # show_rgb switches the axis off, which also hides axis labels, so
        # the row label is drawn as plain text in axes coordinates instead
        # of using set_ylabel.
        axes[row, 0].text(-0.1, 0.5, cat_name, transform=axes[row, 0].transAxes,
                          ha='right', va='center', fontsize=12)

    plt.suptitle('VISUAL CHECK: Can you distinguish camps from non-camps?', fontsize=14)
    plt.tight_layout()
    plt.show()

## 2. Derived Indices Comparison

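Compare the per-tile mean of each input channel (e.g. NDVI, NDBI, SWIR_ratio) between camps and each negative category.
If the distributions overlap completely, the model will struggle to separate the classes from spectral information alone.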

In [None]:
def get_channel_stats(files, max_files=50):
    """Return the per-channel mean of each tile in ``files``.

    Only the first ``max_files`` entries are loaded. NaN pixels are ignored
    when averaging, so a few invalid pixels do not turn the whole channel
    mean into NaN.

    Args:
        files: Sequence of paths to ``.npy`` tiles, indexed channel-first.
        max_files: Maximum number of tiles to load.

    Returns:
        Array of shape ``(n_tiles, n_channels)``. With no input files the
        result has shape ``(0, 0)`` so callers can still read ``shape[1]``.
    """
    stats = [
        [np.nanmean(band) for band in np.load(f)]
        for f in files[:max_files]
    ]
    if not stats:
        return np.empty((0, 0))
    return np.array(stats)

channel_names = config['input_channels']

# Three histograms per row, with as many rows as the configured channels
# need, so no channel is silently dropped (six channels give a 2x3 grid).
n_cols = 3
n_rows = max(1, -(-len(channel_names) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)

# Compute the statistics for every category ONCE. Doing this inside the
# per-channel loop would reload the same tiles for every channel.
camp_stats = get_channel_stats(camp_files) if camp_files else None
category_stats = []  # entries: (label, colour, alpha, stats array)
if camp_stats is not None:
    category_stats.append(('Camp', 'red', 0.5, camp_stats))
for cat_name, files, color in [
    ('Rural', neg_rural, 'green'),
    ('Urban', neg_urban, 'blue'),
    ('Barren', neg_barren, 'orange'),
    ('Informal', neg_informal, 'purple'),
]:
    if files:
        category_stats.append((cat_name, color, 0.3, get_channel_stats(files)))

for idx, ax in enumerate(axes.flat):
    if idx >= len(channel_names):
        # Spare grid cell with no channel to show.
        ax.axis('off')
        continue

    plotted = False
    for label, color, alpha, stats in category_stats:
        # Skip categories whose tiles do not carry this channel.
        if stats.ndim == 2 and idx < stats.shape[1]:
            ax.hist(stats[:, idx], bins=20, alpha=alpha, color=color,
                    label=label, density=True)
            plotted = True

    ax.set_title(channel_names[idx])
    if plotted:
        # legend() on an axes without labelled artists only emits a warning.
        ax.legend(fontsize=8)

plt.suptitle('Channel Distributions: Camps vs Negative Categories', fontsize=14)
plt.tight_layout()
plt.show()

## 3. Side-by-Side Detail View

Up to 6 channels of the first camp tile vs the first urban negative tile.

In [None]:
# Channel-by-channel view of the first camp tile (top row) and the first
# urban negative tile (bottom row).
if camp_files and neg_urban:
    fig, axes = plt.subplots(2, 6, figsize=(20, 7))

    for row, (files, label) in enumerate([(camp_files, 'CAMP'), (neg_urban, 'URBAN NEG')]):
        tile = np.load(files[0])
        # Never index past the grid width, the tile's channel count, or the
        # list of configured channel names.
        n_channels = min(axes.shape[1], tile.shape[0], len(channel_names))
        for ch in range(axes.shape[1]):
            ax = axes[row, ch]
            if ch >= n_channels:
                # No data for this panel: hide the empty frame.
                ax.axis('off')
                continue
            cmap = 'RdYlGn' if channel_names[ch] == 'NDVI' else 'viridis'
            im = ax.imshow(tile[ch], cmap=cmap)
            ax.set_title(f'{channel_names[ch]}', fontsize=10)
            ax.axis('off')
            plt.colorbar(im, ax=ax, fraction=0.046)
        # The axis is switched off above, which also hides axis labels, so
        # the row label is drawn as plain text in axes coordinates instead
        # of using set_ylabel.
        axes[row, 0].text(-0.15, 0.5, label, transform=axes[row, 0].transAxes,
                          ha='right', va='center', fontsize=12)

    plt.suptitle('Channel Detail: Camp vs Urban Negative', fontsize=14)
    plt.tight_layout()
    plt.show()
else:
    print('Need both camp and urban negative tiles for comparison')

## 4. GO / NO-GO Decision

Answer honestly:

In [None]:
# Print the manual go/no-go checklist. Nothing is decided automatically
# here: the reader answers each question from the plots above.
print('=== PRE-TRAINING CHECKLIST ===')
print()
print('Answer these questions based on the visualizations above:')
print()
print('1. Can you see camp structures (tents, shelters) in the camp tiles?')
# Sentinel-1 SAR is not finer than 10 m, so it is suggested only as an
# additional signal, not as a resolution upgrade.
print('   If NO at 10m resolution -> consider higher-resolution imagery, '
      'or add Sentinel-1 SAR as a complementary (not sharper) signal')
print()
print('2. Do camps look DIFFERENT from urban negatives?')
print('   If NO -> the model will confuse them. Add more hard negatives.')
print()
print('3. Are NDVI/NDBI distributions separated?')
print('   If NO -> spectral info alone is insufficient. Texture features needed.')
print()
print('4. Dataset size adequate?')
print(f'   Camp tiles: {len(camp_files)}')
# NOTE(review): this target (>1800) differs from the intro checklist
# (>1000 positive tiles) and from the 100/500 thresholds in the size-check
# cell - confirm which figure is intended.
print('   Target: >200 camps x 9 grid = >1800 camp tiles')
print()
print('If mostly YES -> proceed to training (notebook 03, 04)')
print('If mostly NO -> STOP. Revise approach before wasting compute.')
print()
print('Remember: a negative result ("camps not distinguishable at 10m")')
print('is ALSO a publishable finding.')