In [1]:
# ==========================================================
# Batch Preprocessing for 7-Day Urban Heat Island Dataset
# ==========================================================

import os
import numpy as np
import matplotlib.pyplot as plt
import rasterio
from rasterio.windows import from_bounds
from rasterio.warp import transform_bounds

# ------------------------------
# 1. Parameters
# ------------------------------
# Day identifiers substituted into the raw raster file names
# (../data/raw/gf_Day2020_<day>.tif); presumably day-of-year values — confirm.
DAYS = [187, 196, 205, 215, 212, 223, 240]

# Sliding-window geometry in raster pixels: window edge length and step.
PATCH_SIZE, STRIDE = 16, 8

# A window is kept only when its fraction of NaN pixels is <= NAN_THRESHOLD.
NAN_THRESHOLD = 0.3

# A valid pixel counts as "hot" when its value is > TEMP_THRESHOLD.
# NOTE(review): same units as the raster band; 303.0 looks like Kelvin — confirm.
TEMP_THRESHOLD = 303.0

# A window is labelled 1 (UHI) when its share of hot valid pixels is > HOT_RATIO.
HOT_RATIO = 0.5

# Destination folder for the generated .npy files; created up front if missing.
OUTPUT_DIR = '../data/processed'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Geographic bounding box for DMV region (lat/lon, EPSG:4326 degrees)
dmv_bounds_latlon = dict(
    left=-77.8,
    right=-76.0,
    bottom=38.2,
    top=39.8,
)

# ------------------------------
# 2. Define extraction function
# ------------------------------
def extract_patches_from_tif(day):
    """Crop one day's raster to the DMV box, cut it into labelled patches, save them.

    Reads band 1 of ``../data/raw/gf_Day2020_{day}.tif`` inside the window that
    corresponds to ``dmv_bounds_latlon`` (reprojected from EPSG:4326 to the
    raster's CRS), replaces values <= 0 with NaN, and slides a
    ``PATCH_SIZE`` x ``PATCH_SIZE`` window over the crop with step ``STRIDE``.

    A window is kept when its NaN fraction is <= ``NAN_THRESHOLD``. It is
    labelled 1 when more than ``HOT_RATIO`` of its non-NaN pixels exceed
    ``TEMP_THRESHOLD``, otherwise 0. Kept windows still contain their NaNs.

    Side effects: prints progress, and writes ``patches_day{day}_sw.npy``
    (float32, shape ``(N, PATCH_SIZE, PATCH_SIZE)``) and
    ``labels_day{day}_sw.npy`` (int64, shape ``(N,)``) into ``OUTPUT_DIR``.

    Parameters
    ----------
    day : value formatted into the raster file name and the output file names.

    Returns
    -------
    tuple of (int, int, int)
        ``(number of patches, number labelled 1, number labelled 0)``.

    NOTE(review): only values <= 0 are masked; the dataset's declared nodata
    value is not consulted — confirm no positive fill value is in use.
    """
    tif_path = f'../data/raw/gf_Day2020_{day}.tif'
    print(f'\n Processing Day {day}...')

    with rasterio.open(tif_path) as src:
        # Convert lat/lon bounds to raster CRS
        dmv_bounds_proj = transform_bounds('EPSG:4326', src.crs,
                                           dmv_bounds_latlon['left'],
                                           dmv_bounds_latlon['bottom'],
                                           dmv_bounds_latlon['right'],
                                           dmv_bounds_latlon['top'])
        # Crop region
        window = from_bounds(*dmv_bounds_proj, transform=src.transform)
        region = src.read(1, window=window).astype(np.float32)

    region[region <= 0] = np.nan  # Replace invalid values

    # Sliding window extraction
    H, W = region.shape
    patches, labels = [], []

    for i in range(0, H - PATCH_SIZE + 1, STRIDE):
        for j in range(0, W - PATCH_SIZE + 1, STRIDE):
            sub = region[i:i+PATCH_SIZE, j:j+PATCH_SIZE]
            nan_mask = np.isnan(sub)  # computed once, reused for filter and label
            if nan_mask.mean() > NAN_THRESHOLD:
                continue  # too many missing pixels to be useful
            valid = sub[~nan_mask]
            # Guard keeps the ratio defined even if NAN_THRESHOLD is raised to 1.0
            hot_ratio = (valid > TEMP_THRESHOLD).mean() if len(valid) else 0.0
            patches.append(sub)
            labels.append(1 if hot_ratio > HOT_RATIO else 0)

    if patches:
        patches = np.array(patches, dtype=np.float32)
    else:
        # np.array([]) would be shape (0,), which cannot be concatenated with
        # the 3-D arrays of other days; keep the patch dimensions explicit.
        patches = np.empty((0, PATCH_SIZE, PATCH_SIZE), dtype=np.float32)
    labels = np.array(labels, dtype=np.int64)

    # Save per-day files
    np.save(f"{OUTPUT_DIR}/patches_day{day}_sw.npy", patches)
    np.save(f"{OUTPUT_DIR}/labels_day{day}_sw.npy", labels)

    # Count once; plain ints keep the returned summary free of numpy scalars
    n_uhi = int((labels == 1).sum())
    n_non_uhi = int((labels == 0).sum())

    print(f"Day {day}: {len(patches)} patches | UHI: {n_uhi} | Non-UHI: {n_non_uhi}")
    return len(patches), n_uhi, n_non_uhi


In [2]:
# ------------------------------
# 3. Process all days
# ------------------------------
# One (day, total, uhi, non_uhi) row per processed day, in DAYS order.
summary = []

for day in DAYS:
    counts = extract_patches_from_tif(day)
    summary.append((day, *counts))

# Summary table
print("\n Summary of all days:")
print("Day | Total | UHI | Non-UHI")
for day, total, uhi, non_uhi in summary:
    print(f"{day} | {total} | {uhi} | {non_uhi}")



 Processing Day 187...
Day 187: 767 patches | UHI: 304 | Non-UHI: 463

 Processing Day 196...
Day 196: 767 patches | UHI: 275 | Non-UHI: 492

 Processing Day 205...
Day 205: 767 patches | UHI: 404 | Non-UHI: 363

 Processing Day 215...
Day 215: 767 patches | UHI: 343 | Non-UHI: 424

 Processing Day 212...
Day 212: 767 patches | UHI: 312 | Non-UHI: 455

 Processing Day 223...
Day 223: 767 patches | UHI: 360 | Non-UHI: 407

 Processing Day 240...
Day 240: 767 patches | UHI: 369 | Non-UHI: 398

 Summary of all days:
Day | Total | UHI | Non-UHI
187 | 767 | 304 | 463
196 | 767 | 275 | 492
205 | 767 | 404 | 363
215 | 767 | 343 | 424
212 | 767 | 312 | 455
223 | 767 | 360 | 407
240 | 767 | 369 | 398


In [3]:
import numpy as np

processed_dir = "../data/processed"

# --------------------------
# 1. Define training / val days
# --------------------------
train_days = [187, 196, 205, 212, 223, 240]
val_day = [215]

# A day listed in both splits would leak validation patches into training.
overlapping_days = sorted(set(train_days) & set(val_day))
if overlapping_days:
    raise ValueError(
        f"Days present in both training and validation splits: {overlapping_days}"
    )


def _load_days(kind, days):
    """Load '<kind>_day<d>_sw.npy' for each day in `days` and join them along axis 0."""
    return np.concatenate(
        [np.load(f"{processed_dir}/{kind}_day{d}_sw.npy") for d in days],
        axis=0
    )


# --------------------------
# 2. Load & merge training days
# --------------------------
X_train = _load_days("patches", train_days)
y_train = _load_days("labels", train_days)

print("Training set:", X_train.shape, y_train.shape)

# --------------------------
# 3. Load validation day
# --------------------------
# Driven by val_day (previously a hard-coded 215), so editing the split above
# actually changes which files are loaded.
X_val = _load_days("patches", val_day)
y_val = _load_days("labels", val_day)

print("Validation set:", X_val.shape, y_val.shape)

# --------------------------
# 4. Save final merged dataset
# --------------------------
np.save(f"{processed_dir}/patches_train.npy", X_train)
np.save(f"{processed_dir}/labels_train.npy", y_train)

np.save(f"{processed_dir}/patches_val.npy", X_val)
np.save(f"{processed_dir}/labels_val.npy", y_val)

print("Saved: patches_train.npy, labels_train.npy")
print("Saved: patches_val.npy, labels_val.npy")


Training set: (4602, 16, 16) (4602,)
Validation set: (767, 16, 16) (767,)
Saved: patches_train.npy, labels_train.npy
Saved: patches_val.npy, labels_val.npy
