Build Sentinel-1 / Sentinel-2 patch dataset using MPF-guided sampling.

This notebook:
- loads collocation metadata
- extracts MPF-guided patches
- performs minimal sanity checks
- saves a reusable dataset artefact

All core logic lives in data/patching.py.

In [5]:
import sys
from pathlib import Path

# Add project root to PYTHONPATH
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

In [6]:
from data_processing.patching import (
    PatchConfig,
    build_patch_dataset,
    save_patch_dataset,
)
import numpy as np


In [8]:
# Paths
CSV_PATH = (
    "selected_data_with_all_collocated_paths.csv"
)

OUTPUT_PATH = "patch_dataset.npz"

# Patching configuration (matches thesis Section 3.1.3)
cfg = PatchConfig(
    patch_size=256,
    stride=256,
    mpf_threshold=0.10,
    min_cluster_area=100,
    blank_threshold=1e-6,
)

cfg


PatchConfig(patch_size=256, stride=256, mpf_threshold=0.1, min_cluster_area=100, blank_threshold=1e-06)

In [9]:
s1, s2, mpf, dates = build_patch_dataset(
    csv_path=CSV_PATH,
    cfg=cfg,
)

print(f"Total patches extracted: {len(s1)}")
print("Shapes:")
print("S1:", s1.shape)   # (N, 2, H, W)
print("S2:", s2.shape)   # (N, 4, H, W)
print("MPF:", mpf.shape) # (N, 1, H, W)
print("Dates:", dates.shape)


Total patches extracted: 1265
Shapes:
S1: (1265, 2, 256, 256)
S2: (1265, 4, 256, 256)
MPF: (1265, 1, 256, 256)
Dates: (1265,)


In [14]:
# Check for NaNs in Sentinel-1 (input-critical)
nan_mask = np.isnan(s1).any(axis=(1, 2, 3))
num_nan = nan_mask.sum()

print(f"Patches with NaNs in S1: {num_nan} / {len(s1)}")


Patches with NaNs in S1: 265 / 1265


In [17]:
print("S1 statistics (HH, HV):")
print("Mean:", s1.mean(axis=(0, 2, 3)))
print("Std :", s1.std(axis=(0, 2, 3)))

print("\nS2 statistics (B2, B3, B4, B8):")
print("Mean:", s2.mean(axis=(0, 2, 3)))
print("Std :", s2.std(axis=(0, 2, 3)))


S1 statistics (HH, HV):
Mean: [nan nan]
Std : [nan nan]

S2 statistics (B2, B3, B4, B8):
Mean: [5828.52497422 5222.25501639 4727.72224577 3973.46883021]
Std : [1931.71369347 1917.68371091 2264.67232152 2031.43184432]


In [None]:
save_patch_dataset(
    path=OUTPUT_PATH,
    s1=s1,
    s2=s2,
    mpf=mpf,
    dates=dates,
)

print(f"Saved dataset to: {OUTPUT_PATH}")
