# Sen1Floods11 — Step 1: Data Preprocessing (U-Net, 256×256)
This notebook builds processed 2-channel (VV,VH) tiles and binary masks, normalizes data, writes manifests, and produces quick sanity plots.

In [None]:
import os, sys, json, glob, math, random, shutil, warnings
from pathlib import Path
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
# Suppress every warning category for the whole session to keep cell output compact.
warnings.simplefilter('ignore')
# Record the interpreter version alongside the notebook output.
print('Python:', sys.version)

## Paths & config
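Set the `SEN1FLOODS11_DIR` environment variable to your local Sen1Floods11 root; if it is unset, `RAW_DIR` defaults to `data/sen1floods11` under the project root (edit it below if needed). Processed outputs go to `processed/images`, `processed/masks`, and `processed/manifests`.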

In [None]:
# Project root: the parent of the current working directory when the kernel
# runs inside a 'notebooks' folder, otherwise the working directory itself.
if Path.cwd().name == 'notebooks':
    ROOT = Path.cwd().parent
else:
    ROOT = Path.cwd()

# The SEN1FLOODS11_DIR environment variable wins; the fallback lives under data/.
RAW_DIR = Path(os.environ.get('SEN1FLOODS11_DIR', str(ROOT / 'data' / 'sen1floods11')))

# Output layout: processed/{images,masks,manifests}
PROC_DIR = ROOT / 'processed'
IMG_DIR = PROC_DIR / 'images'
MSK_DIR = PROC_DIR / 'masks'
MAN_DIR = PROC_DIR / 'manifests'
for d in (PROC_DIR, IMG_DIR, MSK_DIR, MAN_DIR):
    d.mkdir(parents=True, exist_ok=True)

# Tile edge length in pixels and the seed shared by both RNGs.
TILE_SIZE = 256
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

print('ROOT=', ROOT)
print('RAW_DIR=', RAW_DIR)

## Install deps (if needed)
Uncomment the `pip` line below to install rasterio, geopandas, albumentations, and imageio in the notebook environment.

In [None]:
# !pip install rasterio geopandas albumentations imageio
import rasterio, imageio

## I/O helpers: read VV/VH stack, mask, normalize, tiler

In [None]:
def read_vv_vh(vv_path: Path, vh_path: Path):
    """Load band 1 of the VV and VH rasters and stack them channel-first.

    Each file is opened with rasterio, its first band is read and cast to
    float32. The metadata returned is a copy of the VV dataset's ``meta``;
    the VH dataset's metadata is not consulted.

    Returns:
        (stack, meta): ``stack`` is ``np.stack([vv, vh], axis=0)`` — shape
        (2, H, W) when both bands share the same height and width.
    """
    import rasterio

    bands = []
    meta = None
    for position, raster_path in enumerate((vv_path, vh_path)):
        with rasterio.open(raster_path) as dataset:
            bands.append(dataset.read(1).astype('float32'))
            if position == 0:
                # Only the VV file contributes the geo metadata.
                meta = dataset.meta.copy()
    return np.stack(bands, axis=0), meta

def try_read_mask(mask_path: Path):
    """Read a mask file and binarize it to uint8 (1 where value > 0, else 0).

    The file is first opened with rasterio (band 1). If anything on that path
    raises, it is re-read with ``imageio.v2.imread``; for a 3-D result only
    the first channel is kept.
    """
    def _binarize(values):
        return (values > 0).astype('uint8')

    try:
        with rasterio.open(mask_path) as dataset:
            return _binarize(dataset.read(1))
    except Exception:
        # Fallback for formats read through imageio instead (e.g. PNG).
        pixels = imageio.v2.imread(mask_path)
        first_channel = pixels[..., 0] if pixels.ndim == 3 else pixels
        return _binarize(first_channel)

def normalize_stack(stack, pmin=1, pmax=99):
    """Rescale every band of ``stack`` to [0, 1] by percentile clipping.

    For each band (index along axis 0) the ``pmin``-th and ``pmax``-th
    percentiles of its finite pixels are mapped to 0 and 1, and values outside
    that range are clipped. NaN pixels stay NaN so later validity checks can
    still detect them; -inf/+inf pixels saturate to 0/1. The input is never
    modified.

    Args:
        stack: array whose first axis indexes bands, e.g. (C, H, W).
        pmin: lower percentile (0-100) mapped to 0.
        pmax: upper percentile (0-100) mapped to 1.

    Returns:
        A new floating-point array with the same shape as ``stack``. Floating
        inputs keep their dtype; any other dtype is converted to float32. A
        band without a single finite pixel is returned filled with NaN.
    """
    stack = np.asarray(stack)
    if np.issubdtype(stack.dtype, np.floating):
        out = stack.copy()
    else:
        # Writing scaled values back into integer storage would truncate
        # them to 0 or 1, so promote before normalizing.
        out = stack.astype('float32')

    for i in range(out.shape[0]):
        band = out[i]
        finite = np.isfinite(band)
        if not finite.any():
            # No pixels to estimate percentiles from: mark the band as no-data.
            out[i] = np.nan
            continue
        # Infinite values are excluded here; otherwise an infinite percentile
        # would turn the whole band into NaN.
        lo, hi = np.percentile(band[finite], [pmin, pmax])
        # The epsilon keeps constant bands (hi == lo) from dividing by zero.
        out[i] = np.clip((band - lo) / (hi - lo + 1e-6), 0, 1)
    return out

def tile_and_save(stack, mask, meta, base_id: str, out_img_dir: Path, out_mask_dir: Path, tile=256):
    """Cut a scene into non-overlapping square tiles and write them to disk.

    Only full ``tile`` x ``tile`` windows are kept; partial windows at the
    bottom/right edges are dropped. A window is also skipped when fewer than
    5% of its pixels in channel 0 are non-NaN. For every kept window three
    files are written, numbered consecutively from 0:

    * ``out_img_dir/<base_id>_<n>.npy``  - image tile ``stack[:, y:y+tile, x:x+tile]``
    * ``out_mask_dir/<base_id>_<n>.npy`` - mask tile cast to uint8
    * ``out_img_dir/<base_id>_<n>.json`` - the scene-level ``transform`` and
      ``crs`` from ``meta`` plus the window's pixel offsets ``x``/``y`` and
      ``tile``. The transform is NOT shifted to the tile origin; combine it
      with the offsets to georeference a tile.

    Args:
        stack: image array indexed as (channel, row, column).
        mask: array whose first two axes match the stack's rows and columns.
        meta: mapping that may contain ``'transform'`` and ``'crs'``.
        base_id: filename prefix for all outputs of this scene.
        out_img_dir: existing directory for image tiles and JSON sidecars.
        out_mask_dir: existing directory for mask tiles.
        tile: tile edge length in pixels.

    Returns:
        List of ``(image_path, mask_path)`` string pairs, one per saved tile.

    Raises:
        ValueError: if the mask's height/width differ from the stack's, which
            would otherwise yield silently misaligned or truncated mask tiles.
    """
    H, W = stack.shape[1], stack.shape[2]
    if tuple(mask.shape[:2]) != (H, W):
        raise ValueError(
            f'mask shape {tuple(mask.shape)} does not match image shape {(H, W)} for {base_id}'
        )

    # Scene-level geo metadata is identical for every tile; a missing or None
    # entry is stored as JSON null.
    transform = meta.get('transform')
    crs = meta.get('crs')
    transform_out = list(transform) if transform is not None else None
    crs_out = str(crs) if crs is not None else None
    min_valid = 0.05 * tile * tile

    tid_list = []
    # Stopping at H - tile / W - tile visits exactly the full windows.
    for y in range(0, H - tile + 1, tile):
        for x in range(0, W - tile + 1, tile):
            img_tile = stack[:, y:y+tile, x:x+tile]
            # require some valid pixels
            if np.count_nonzero(~np.isnan(img_tile[0])) < min_valid:
                continue
            mask_tile = mask[y:y+tile, x:x+tile]

            # Numbering follows saved tiles only, so skipped windows leave no gaps.
            tcount = len(tid_list)
            img_path = out_img_dir / f'{base_id}_{tcount}.npy'
            msk_path = out_mask_dir / f'{base_id}_{tcount}.npy'
            meta_path = out_img_dir / f'{base_id}_{tcount}.json'

            np.save(img_path, img_tile)
            np.save(msk_path, mask_tile.astype('uint8'))
            tile_meta = {
                'transform': transform_out,
                'crs': crs_out,
                'x': x, 'y': y, 'tile': tile
            }
            with open(meta_path, 'w', encoding='utf-8') as f:
                json.dump(tile_meta, f)
            tid_list.append((str(img_path), str(msk_path)))
    return tid_list

## Discover raw scenes and pair VV/VH/mask
Adjust the glob patterns to match your local Sen1Floods11 folder layout.

In [None]:
# Recursive glob patterns per input type (edit to match your structure).
VV_GLOB, VH_GLOB, MSK_GLOB = (
    str(RAW_DIR / '**' / filename_pattern)
    for filename_pattern in ('*VV*.tif', '*VH*.tif', '*mask*.tif')  # masks may also be *.png
)

# Sorted so that discovery order is reproducible across runs.
vv_files, vh_files, msk_files = (
    sorted(glob.glob(pattern, recursive=True))
    for pattern in (VV_GLOB, VH_GLOB, MSK_GLOB)
)
print('Found', len(vv_files), 'VV,', len(vh_files), 'VH,', len(msk_files), 'masks')

# Simple pairing heuristic by basename key (customize if needed)
def key(p):
    """Return the scene key shared by a scene's VV, VH and mask files.

    The key is the file stem with the markers 'VV', 'VH' and '_mask' removed.
    Separators left behind by that removal are then normalized: runs of '_'
    or '-' collapse to one character and leading/trailing separators are
    stripped. Without this step 'scene_VV' would map to 'scene_' while
    'scene_mask' maps to 'scene', and the scene would never be paired.

    Args:
        p: path (str or Path) of a VV, VH or mask file.

    Returns:
        The normalized scene key as a string.
    """
    import re

    b = Path(p).stem
    b = b.replace('VV', '').replace('VH', '').replace('_mask', '')
    # 'a_VV_b' -> 'a__b' -> 'a_b', so it matches the mask key of 'a_b_mask'.
    b = re.sub(r'([_-])[_-]+', r'\1', b)
    return b.strip('_-')

# Index VH and mask files by scene key; a later duplicate key replaces an earlier one.
vh_map = dict(zip(map(key, vh_files), vh_files))
msk_map = dict(zip(map(key, msk_files), msk_files))

# Keep only VV files that have both a VH partner and a mask.
pairs = []
for v in vv_files:
    k = key(v)
    if k not in vh_map or k not in msk_map:
        continue
    pairs.append((v, vh_map[k], msk_map[k], k))
print('Paired scenes:', len(pairs))
pairs[:3]

## Process scenes → tiles and manifests

In [None]:
# One manifest row per saved tile, accumulated across all paired scenes.
records = []
for vv_path, vh_path, msk_path, scene_id in pairs:
    try:
        # read -> binarize mask -> normalize -> tile; any failure skips the scene.
        stack, meta = read_vv_vh(vv_path, vh_path)
        mask = try_read_mask(msk_path)
        stack = normalize_stack(stack)
        tiles = tile_and_save(
            stack,
            mask,
            meta,
            base_id=scene_id,
            out_img_dir=IMG_DIR,
            out_mask_dir=MSK_DIR,
            tile=TILE_SIZE,
        )
        records.extend(
            {
                'id': Path(img_p).stem,
                'image_path': str(img_p),
                'mask_path': str(msk_p),
                'scene_id': scene_id,
            }
            for img_p, msk_p in tiles
        )
    except Exception as exc:
        # Best effort: report the scene and carry on with the next one.
        print('Skip scene due to error:', scene_id, exc)

df = pd.DataFrame(records)
print('Total tiles:', len(df))
df.head()

## Train/Val split and write manifests

In [None]:
from sklearn.model_selection import train_test_split

# Manifest locations are defined first so the names exist even if nothing is written.
train_csv = MAN_DIR / 'train.csv'
val_csv = MAN_DIR / 'val.csv'

if df.empty:
    # A frame built from zero records has no 'scene_id' column, so any split
    # would raise KeyError; skip writing rather than emit empty manifests.
    train_df, val_df = df, df
    print('No tiles to split; manifests not written.')
else:
    # Split at scene level: ~20% of the scenes (not of the tiles) go to validation.
    # Stratifying individual tiles by scene_id raises ValueError when a scene has
    # a single tile or when the validation share holds fewer tiles than there are
    # scenes, and it places tiles of the same scene on both sides of the split.
    scene_ids = sorted(df['scene_id'].unique())
    if len(scene_ids) > 1:
        train_ids, val_ids = train_test_split(scene_ids, test_size=0.2, random_state=SEED)
        train_df = df[df['scene_id'].isin(train_ids)]
        val_df = df[df['scene_id'].isin(val_ids)]
    else:
        # A single scene cannot be held out: everything trains, validation is empty.
        train_df, val_df = df, df.sample(frac=0)
    train_df.to_csv(train_csv, index=False)
    val_df.to_csv(val_csv, index=False)
    print('Wrote:', train_csv, 'and', val_csv)

## Quick sanity check: visualize one tile

In [None]:
if not len(df):
    print('No tiles built yet — check RAW_DIR patterns and rerun.')
else:
    # Pick one random manifest row and show its two channels next to the mask.
    r = df.sample(1).iloc[0]
    img = np.load(r['image_path'])  # (2,H,W)
    msk = np.load(r['mask_path'])  # (H,W)
    fig, axs = plt.subplots(1, 3, figsize=(10, 4))
    panels = (
        (img[0], 'gray', 'VV'),
        (img[1], 'gray', 'VH'),
        (msk, 'Blues', 'Mask'),
    )
    for ax, (panel_data, panel_cmap, panel_title) in zip(axs, panels):
        ax.imshow(panel_data, cmap=panel_cmap)
        ax.set_title(panel_title)
        ax.axis('off')
    plt.tight_layout()
    plt.show()

## Next: Training U-Net (Option B, PyTorch)
Use the manifests to build DataLoaders with albumentations and train a U-Net (efficientnet-b0 encoder).