# Wetland Classification Map Generator

Applies the trained **Random Forest model** to 64-band satellite embedding tiles and produces a **single-band classified GeoTIFF** of the Bow River Basin.

**Classes:**
| Value | Class |
|-------|-------|
| 0 | Background/Upland |
| 1 | Marsh |
| 2 | Swamp |
| 3 | Shallow Water |
| 4 | Fen |
| 5 | Bog |
| 255 | No Data |

**Memory-optimized:** Processes each tile in small row chunks to stay within Colab RAM limits.

**Output:** `bow_river_classification_rf.tif` on Google Drive.

In [None]:
# CELL 1: Setup
print("Setting up environment...")

import os
import gc
from google.colab import drive

# Mount Google Drive (skipped when the mount point is already present)
if os.path.exists('/content/drive'):
    print("Drive already mounted")
else:
    drive.mount('/content/drive')

# Install dependencies
!pip install -q rasterio tqdm joblib

import numpy as np
import rasterio
from rasterio.windows import Window
import joblib
from pathlib import Path
from tqdm import tqdm
from datetime import datetime

print("Setup complete!")

In [None]:
# CELL 2: Configuration
# Defines every path and tuning constant used by the later cells. Edit the
# values here; the cells below only read them.
print("="*60)
print("CONFIGURATION")
print("="*60)

# =====================
# PATHS
# =====================
# Directory on Drive that holds the embedding tiles and the labels raster.
embeddings_dir = Path("/content/drive/MyDrive/EarthEngine")
# Labels raster; Cell 3 reads the output size, CRS and transform from it.
labels_path = str(embeddings_dir / "bow_river_wetlands_10m_final.tif")

# Path to the trained RF model (.pkl), loaded with joblib in Cell 3.
# Option A: Upload to the Colab runtime (use the Files panel on the left)
# Option B: Path on Google Drive
model_path = "/content/drive/MyDrive/rf_wetland_model_v1_20260120_130828.pkl"

# Output classification map (created and filled by Cell 4).
output_path = "/content/drive/MyDrive/bow_river_classification_rf.tif"

# =====================
# MEMORY TUNING
# =====================
# How many rows of a tile to process at once.
# Lower = less RAM, slower. Higher = more RAM, faster.
# 128 rows x 3072 cols x 64 bands x 4 bytes = ~96 MB per chunk (safe for Colab)
# NOTE(review): the 4 bytes/value figure assumes float32 embeddings and
# 3072-pixel-wide tiles — confirm against the actual tile dtype and size.
CHUNK_ROWS = 128

# =====================
# VALID EMBEDDING TILES (87 tiles on an 8 x 11 grid, one grid cell excluded)
# =====================
# Tiles are named <prefix>-<row offset>-<col offset>.tif, where both offsets
# are zero-padded 10-digit pixel offsets that advance in steps of 3072.
# The allowlist is generated from the grid instead of being spelled out, so
# any gap in the grid is visible as an explicit exclusion.
#
# NOTE(review): this header used to say "88 verified tiles", but the list it
# introduced held 87 names: the tile at row 21504 / col 6144 was absent.
# The exclusion below reproduces that list exactly. Confirm whether that tile
# is genuinely invalid or was dropped by accident; if the latter, empty the
# exclusion set.
_TILE_PREFIX = 'bow_river_embeddings_2020_CORRECTED'
_TILE_STEP = 3072
_TILE_ROW_OFFSETS = range(0, 8 * _TILE_STEP, _TILE_STEP)    # 0 .. 21504
_TILE_COL_OFFSETS = range(0, 11 * _TILE_STEP, _TILE_STEP)   # 0 .. 30720
_EXCLUDED_TILE_OFFSETS = {(21504, 6144)}

# Row-major order (all columns of row 0, then row 3072, ...).
VALID_TILE_NAMES = [
    f'{_TILE_PREFIX}-{row:010d}-{col:010d}.tif'
    for row in _TILE_ROW_OFFSETS
    for col in _TILE_COL_OFFSETS
    if (row, col) not in _EXCLUDED_TILE_OFFSETS
]

# Class definitions: raster value -> human-readable class name
CLASS_NAMES = {
    0: 'Background/Upland',
    1: 'Marsh',
    2: 'Swamp',
    3: 'Shallow Water',
    4: 'Fen',
    5: 'Bog',
}
# Value stored in the output raster wherever no prediction is made
NODATA_VALUE = 255

# Build tile file paths from the allowlist
tile_files = [embeddings_dir / name for name in VALID_TILE_NAMES]

# Echo the configuration so a wrong path is obvious in the cell output
print(f"\nLabels:     {labels_path}")
print(f"Embeddings: {embeddings_dir}")
print(f"Model:      {model_path}")
print(f"Output:     {output_path}")
print(f"Tiles:      {len(tile_files)} valid tiles")
print(f"Chunk rows: {CHUNK_ROWS} (~{CHUNK_ROWS * 3072 * 64 * 4 / 1024**2:.0f} MB peak per chunk)")

# Fail fast on missing inputs. Explicit raises are used instead of `assert`
# because assertions are stripped when Python runs with -O.
if not os.path.exists(labels_path):
    raise FileNotFoundError(f"Labels not found: {labels_path}")
if not embeddings_dir.exists():
    raise FileNotFoundError(f"Embeddings dir not found: {embeddings_dir}")
if not os.path.exists(model_path):
    raise FileNotFoundError(
        f"Model not found: {model_path}\n  Upload it via Colab Files panel or copy to Drive."
    )

# Check that tiles exist. Each path is probed exactly once: existence checks
# on the Drive mount are slow, and a single probe keeps the "present" and
# "missing" lists consistent with each other.
present_tiles = []
missing = []
for tile_path in tile_files:
    if tile_path.exists():
        present_tiles.append(tile_path)
    else:
        missing.append(tile_path.name)

if missing:
    print(f"\nWARNING: {len(missing)} tiles not found on Drive:")
    for m in missing[:5]:
        print(f"  - {m}")
    if len(missing) > 5:
        print(f"  ... and {len(missing) - 5} more")
    tile_files = present_tiles
    print(f"  Proceeding with {len(tile_files)} available tiles")

# Without at least one tile the later cells cannot do anything useful, so
# stop here rather than reporting a validated configuration.
if not tile_files:
    raise FileNotFoundError(f"No embedding tiles found in {embeddings_dir}")

print("\nConfiguration validated!")

In [None]:
# CELL 3: Load Model & Reference Raster
# NOTE: The model (~3.5 GB) will use most of the available RAM.
# The chunked tile processing in Cell 4 keeps additional RAM usage low.
print("\n" + "="*60)
print("LOADING MODEL & REFERENCE DATA")
print("="*60)

# Load RF model
# SECURITY NOTE: joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only load model files that come from a trusted source.
print("\nLoading Random Forest model (3.5 GB — may take a few minutes)...")
rf_model = joblib.load(model_path)
print(f"  Trees: {rf_model.n_estimators}")
print(f"  Features: {rf_model.n_features_in_}")

# The tiles carry 64 bands and every band becomes one feature. A model fitted
# on a different feature count would raise on every tile during inference, so
# stop now instead of after the output raster has been created.
if rf_model.n_features_in_ != 64:
    raise ValueError(
        f"Model expects {rf_model.n_features_in_} features, but tiles provide 64 bands"
    )

# Force single-threaded prediction to reduce memory spikes during inference
rf_model.n_jobs = 1
print("  Set n_jobs=1 for memory-efficient inference")

# Read spatial metadata from labels raster; the output map reuses its grid
print("\nReading spatial reference from labels raster...")
with rasterio.open(labels_path) as labels_src:
    out_height = labels_src.height
    out_width = labels_src.width
    out_crs = labels_src.crs
    out_transform = labels_src.transform
    print(f"  Dimensions: {out_height} x {out_width}")
    print(f"  CRS: {out_crs}")
    print(f"  Resolution: {out_transform[0]:.1f}m")

# Verify first tile (guarding against an empty tile list, which would
# otherwise surface as an unhelpful IndexError)
if not tile_files:
    raise RuntimeError("No embedding tiles available — check the tile paths in Cell 2")
with rasterio.open(tile_files[0]) as test_src:
    print(f"\nFirst tile: {tile_files[0].name}")
    print(f"  Bands: {test_src.count}")
    print(f"  Size: {test_src.height} x {test_src.width}")
    if test_src.count != 64:
        raise ValueError(f"Expected 64 bands, got {test_src.count}")

gc.collect()
print("\nReady for inference!")

In [None]:
# CELL 4: Run Inference — MEMORY-EFFICIENT (chunked row-by-row)
#
# Instead of loading a full 3072x3072x64 tile (~2.3 GB) at once,
# we read CHUNK_ROWS rows at a time (~96 MB), predict, write, then discard.
# Peak RAM = model (3.5 GB) + one raw chunk (~96 MB) + the compacted copy of
# that chunk's valid pixels handed to the model (at most one more chunk).
# NOTE(review): the sizes above assume float32 embeddings — confirm.


def _parse_tile_offsets(tile_path):
    """Return (row_offset, col_offset) parsed from a tile filename.

    Filenames look like ``*-RRRRRRRRRR-CCCCCCCCCC.tif``; the last two
    dash-separated fields of the stem are the pixel offsets of the tile
    inside the output raster. Returns None when the name does not match.
    """
    parts = tile_path.stem.split('-')
    if len(parts) < 3:
        return None
    try:
        return int(parts[-2]), int(parts[-1])
    except ValueError:
        return None


def _classify_chunk(model, chunk_data, nodata_value):
    """Classify one chunk of embeddings.

    Args:
        model: fitted classifier exposing ``predict``.
        chunk_data: array of shape (bands, rows, cols).
        nodata_value: value written for pixels that cannot be predicted.

    Returns:
        (pred_2d, n_valid): a uint8 array of shape (rows, cols) and the
        number of pixels that received a prediction.
    """
    n_bands, n_rows, n_cols = chunk_data.shape
    n_pixels = n_rows * n_cols

    # (n_pixels, bands). This is a transposed VIEW of chunk_data, so the raw
    # chunk stays alive until this function returns.
    pixels = chunk_data.reshape(n_bands, n_pixels).T

    # Mask every non-finite pixel, not only NaN: an infinite value would make
    # predict() raise and the whole tile would be lost.
    valid_mask = np.isfinite(pixels).all(axis=1)
    n_valid = int(valid_mask.sum())

    predictions = np.full(n_pixels, nodata_value, dtype=np.uint8)
    if n_valid > 0:
        # Boolean indexing makes a compact copy of the valid rows only.
        predictions[valid_mask] = model.predict(pixels[valid_mask]).astype(np.uint8)

    return predictions.reshape(n_rows, n_cols), n_valid


print("\n" + "="*60)
print("RUNNING CLASSIFICATION INFERENCE (MEMORY-EFFICIENT)")
print("="*60)
print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Processing in chunks of {CHUNK_ROWS} rows")

# Create output GeoTIFF (initialized with nodata)
out_profile = {
    'driver': 'GTiff',
    'dtype': 'uint8',
    'width': out_width,
    'height': out_height,
    'count': 1,
    'crs': out_crs,
    'transform': out_transform,
    'nodata': NODATA_VALUE,
    'compress': 'lzw',
    'tiled': True,
    'blockxsize': 512,
    'blockysize': 512,
}

print(f"\nCreating output raster ({out_height}x{out_width})...")
# Create the file and fill with nodata in streaming chunks to avoid a big alloc
with rasterio.open(output_path, 'w', **out_profile) as dst:
    nodata_row = np.full((512, out_width), NODATA_VALUE, dtype=np.uint8)
    for row_start in range(0, out_height, 512):
        h = min(512, out_height - row_start)
        dst.write(nodata_row[:h], 1, window=Window(0, row_start, out_width, h))
del nodata_row
gc.collect()
print("Output raster created.")

# Track statistics
total_pixels_classified = 0
total_pixels_nodata = 0
class_counts = np.zeros(len(CLASS_NAMES), dtype=np.int64)
skipped_tiles = []

# Process each tile in row chunks
print(f"\nProcessing {len(tile_files)} tiles...")

with rasterio.open(output_path, 'r+') as dst:
    for tile_file in tqdm(tile_files, desc="Tiles", unit=" tiles"):
        offsets = _parse_tile_offsets(tile_file)
        if offsets is None:
            skipped_tiles.append(tile_file.name)
            continue
        row_offset, col_offset = offsets

        # Best-effort per tile: one unreadable tile must not abort the run.
        # Chunks written before a failure stay in the output raster.
        try:
            with rasterio.open(tile_file) as tile_src:
                if tile_src.count != 64:
                    skipped_tiles.append(tile_file.name)
                    continue

                # Clip tile to output raster bounds
                valid_h = min(tile_src.height, out_height - row_offset)
                valid_w = min(tile_src.width, out_width - col_offset)
                if valid_h <= 0 or valid_w <= 0:
                    skipped_tiles.append(tile_file.name)
                    continue

                # --- CHUNKED PROCESSING ---
                # Read CHUNK_ROWS rows at a time instead of the full tile
                for chunk_start in range(0, valid_h, CHUNK_ROWS):
                    chunk_h = min(CHUNK_ROWS, valid_h - chunk_start)

                    # Read chunk: shape (64, chunk_h, valid_w)
                    chunk_data = tile_src.read(
                        window=Window(0, chunk_start, valid_w, chunk_h)
                    )
                    pred_2d, n_valid = _classify_chunk(
                        rf_model, chunk_data, NODATA_VALUE
                    )
                    del chunk_data  # raw chunk no longer needed

                    # Write chunk to correct position in output
                    write_window = Window(
                        col_offset,
                        row_offset + chunk_start,
                        valid_w,
                        chunk_h
                    )
                    dst.write(pred_2d, 1, window=write_window)

                    # Stats
                    total_pixels_classified += n_valid
                    total_pixels_nodata += pred_2d.size - n_valid
                    class_counts += np.bincount(
                        pred_2d.ravel(), minlength=len(CLASS_NAMES)
                    )[:len(CLASS_NAMES)]

                    del pred_2d

        except Exception as e:
            print(f"\n  Error on {tile_file.name}: {e}")
            skipped_tiles.append(tile_file.name)

        gc.collect()  # Force GC between tiles

print(f"\nFinished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Pixels classified: {total_pixels_classified:,} | masked as nodata: {total_pixels_nodata:,}")
print("Inference complete!")

In [None]:
# CELL 5: Verify & Summarize
# Re-opens the written GeoTIFF, prints its metadata, and derives the class
# distribution from the file itself rather than from in-memory counters.
print("="*60)
print("CLASSIFICATION MAP SUMMARY")
print("="*60)

# File info
file_size_mb = os.path.getsize(output_path) / (1024**2)
print(f"\nOutput file: {output_path}")
print(f"File size:   {file_size_mb:.1f} MB")

# Verify with rasterio and compute class distribution from the actual file
print("\nReading output raster for verification...")
with rasterio.open(output_path) as src:
    print(f"Dimensions:  {src.height} x {src.width}")
    print(f"CRS:         {src.crs}")
    print(f"Resolution:  {src.transform[0]:.1f}m")
    print(f"Data type:   {src.dtypes[0]}")
    print(f"NoData:      {src.nodata}")

    # Histogram over all 256 possible uint8 values, accumulated in row
    # strips to avoid a RAM spike.
    print("\nComputing class distribution (scanning output)...")
    class_counts = np.zeros(256, dtype=np.int64)
    for row_start in range(0, src.height, 512):
        h = min(512, src.height - row_start)
        chunk = src.read(1, window=Window(0, row_start, src.width, h))
        class_counts += np.bincount(chunk.ravel(), minlength=256)

total_nodata = class_counts[NODATA_VALUE]
total_classified = class_counts[:6].sum()
total_pixels = total_classified + total_nodata
# Anything that is neither a known class nor nodata points to a model/class
# table mismatch and must not go unnoticed in a verification step.
total_unexpected = class_counts.sum() - total_pixels

print(f"\nPixels classified: {total_classified:,}")
print(f"Pixels nodata:     {total_nodata:,}")

if total_unexpected > 0:
    print(f"\nWARNING: {total_unexpected:,} pixels hold values outside the class table:")
    for value in np.flatnonzero(class_counts):
        value = int(value)
        if value not in CLASS_NAMES and value != NODATA_VALUE:
            print(f"  - value {value}: {class_counts[value]:,} pixels")

if total_classified > 0:
    print("\nClass Distribution:")
    print(f"  {'Class':<5} {'Name':<20} {'Count':>12} {'Percent':>8}")
    print(f"  {'-'*47}")
    for cls in range(6):
        pct = 100 * class_counts[cls] / total_classified
        print(f"  {cls:<5} {CLASS_NAMES[cls]:<20} {class_counts[cls]:>12,} {pct:>7.2f}%")

if skipped_tiles:
    print(f"\nSkipped {len(skipped_tiles)} tiles:")
    for t in skipped_tiles:
        print(f"  - {t}")

print("\n" + "="*60)
print("DONE! Next steps:")
print("="*60)
print(f"1. Download '{os.path.basename(output_path)}' from Google Drive")
print("2. Open in QGIS to verify the map visually")
print("3. Hand off to frontend team for web display")