In [1]:
# 02_generate_patches_labels.ipynb

import os
import numpy as np
import rasterio
from rasterio.windows import from_bounds
from rasterio.warp import transform_bounds
from tqdm import tqdm

# --------------------------
# Configuration
# --------------------------
# Input rasters are read from DATA_DIR; generated .npy arrays are written to OUT_DIR.
DATA_DIR, OUT_DIR = '../data/raw', '../data/processed'

PATCH_SIZE = 128                  # edge length of each square patch, in pixels
THRESHOLD = 305                   # Kelvin (305 K ~ 32℃)
DAYS = [196, 197, 198, 200, 205]  # day identifiers embedded in the input file names

# Bounding box around Washington DC, expressed in geographic (EPSG:4326) degrees.
dc_bounds_latlon = dict(left=-77.12, right=-76.90, bottom=38.80, top=39.00)

# Ensure the output directory is present before any results are saved.
os.makedirs(OUT_DIR, exist_ok=True)

# --------------------------
# Patch extraction helper
# --------------------------
def _extract_patches(data, patch_size, threshold, max_nan_frac=0.1):
    """Tile a 2-D array into non-overlapping square patches and label each one.

    Parameters
    ----------
    data : np.ndarray
        2-D float array in which invalid pixels are marked with NaN.
    patch_size : int
        Edge length of each square patch, in pixels.
    threshold : float
        A valid pixel counts as "hot" when its value is strictly greater
        than this.
    max_nan_frac : float, optional
        Patches whose NaN fraction exceeds this value are skipped.

    Returns
    -------
    patches : np.ndarray, float32, shape (N, patch_size, patch_size)
        Per-patch standardized values (zero mean, unit variance over the
        valid pixels). Invalid pixels are filled with 0, i.e. the patch mean
        after standardization, so the saved arrays never contain NaN.
    labels : np.ndarray, int64, shape (N,)
        1 when more than half of the patch's *valid* pixels exceed
        ``threshold``, otherwise 0.

    Both arrays keep a well-defined shape and dtype when N == 0.
    """
    rows, cols = data.shape
    patches = []
    labels = []

    # Step by a full patch so tiles do not overlap; partial tiles at the
    # bottom/right edges are dropped.
    for i in range(0, rows - patch_size + 1, patch_size):
        for j in range(0, cols - patch_size + 1, patch_size):
            patch = data[i:i + patch_size, j:j + patch_size]

            nan_mask = np.isnan(patch)
            if nan_mask.mean() > max_nan_frac:
                continue

            valid = ~nan_mask
            n_valid = np.count_nonzero(valid)
            if n_valid == 0:
                # Only reachable when max_nan_frac >= 1; nothing to normalize.
                continue

            mean = np.nanmean(patch)
            std = np.nanstd(patch)

            # `patch - mean` allocates a new array, so `data` is not modified.
            patch_norm = patch - mean
            if std > 0:
                patch_norm = patch_norm / std
            # else: a constant patch has std == 0; dividing would produce
            # NaN/inf, so the centered values (all zeros) are kept instead.

            # Replace missing pixels with the normalized mean (0).
            patch_norm[nan_mask] = 0.0
            patches.append(patch_norm.astype(np.float32, copy=False))

            # Use only valid pixels in the denominator so that missing data
            # is not silently counted as "below threshold".
            hot_frac = np.count_nonzero(patch[valid] > threshold) / n_valid
            labels.append(int(hot_frac > 0.5))

    if patches:
        patch_arr = np.stack(patches)
    else:
        patch_arr = np.empty((0, patch_size, patch_size), dtype=np.float32)
    label_arr = np.asarray(labels, dtype=np.int64)
    return patch_arr, label_arr


# --------------------------
# Process each .tif file
# --------------------------
for day in tqdm(DAYS):
    tif_path = os.path.join(DATA_DIR, f'gf_Day2020_{day}.tif')
    if not os.path.exists(tif_path):
        print(f"File not found: {tif_path}")
        continue

    with rasterio.open(tif_path) as src:
        # Convert the geographic bounds into the raster's own CRS
        # (argument order: left, bottom, right, top).
        dc_bounds_proj = transform_bounds('EPSG:4326', src.crs,
                                          dc_bounds_latlon['left'],
                                          dc_bounds_latlon['bottom'],
                                          dc_bounds_latlon['right'],
                                          dc_bounds_latlon['top'])

        window = from_bounds(*dc_bounds_proj, transform=src.transform)
        data = src.read(1, window=window).astype(np.float32)
        nodata = src.nodata

    # Mark invalid pixels: the dataset's declared nodata value (if any) and
    # any non-positive value.
    if nodata is not None:
        data[data == nodata] = np.nan
    data[data <= 0] = np.nan

    h, w = data.shape
    if h < PATCH_SIZE or w < PATCH_SIZE:
        # NOTE(review): the recorded run of this cell saved 0 patches for every
        # day; a window smaller than PATCH_SIZE is one likely cause -- confirm
        # against the raster resolution / bounds.
        print(f"Day {day}: window is {h}x{w} px, smaller than "
              f"PATCH_SIZE={PATCH_SIZE}; no patches can be extracted.")

    patches, labels = _extract_patches(data, PATCH_SIZE, THRESHOLD)

    # Save results as .npy (empty arrays are still written so that every
    # processed day has a pair of output files).
    np.save(os.path.join(OUT_DIR, f"patches_day{day}.npy"), patches)
    np.save(os.path.join(OUT_DIR, f"labels_day{day}.npy"), labels)
    print(f"Day {day}: {patches.shape[0]} patches saved. Heat islands: {labels.sum()}")

  0%|          | 0/5 [00:00<?, ?it/s]

Day 196: 0 patches saved. Heat islands: 0.0


100%|██████████| 5/5 [00:00<00:00, 74.86it/s]

Day 197: 0 patches saved. Heat islands: 0.0
Day 198: 0 patches saved. Heat islands: 0.0
Day 200: 0 patches saved. Heat islands: 0.0
Day 205: 0 patches saved. Heat islands: 0.0



