In [1]:
import numpy as np
import geopandas as gpd
from scipy.spatial import cKDTree
from datetime import datetime 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
import pandas as pd
import contextily as cnx
import matplotlib.pyplot as plt
from datetime import date
import re
from rasterstats import zonal_stats
import rasterio
import os
from tqdm import tqdm
from shapely.geometry import box
import math

<h2> Obtaining fire dates and information for each Footprint <h2>

In [11]:
# Input Files
RASTER_FILE_PATH = 'MapBiomas Fire/2019-2024/Burned_monthly_AoD_2019_2024-0000053760-0000080640.tif'
GEDI_FILE_PATH = 'GEDI_Fire_SlideRule_footprints.gpkg'


# The raster path string is needed by the zonal_stats function inside the loop
raster_path = RASTER_FILE_PATH

# Match the output name with the  

match = re.search(r'(\d+-\d+)\.tif$', RASTER_FILE_PATH)

if match:
    # If found, the ID is the first captured group (group 1)
    RASTER_ID = match.group(1)
else:
    # Fallback in case the pattern isn't found
    RASTER_ID = "UnknownID"

# Output File
OUTPUT_FILENAME = f"GEDI_Fire_Dates_{RASTER_ID}.gpkg"

print(f"File will be saved as: {OUTPUT_FILENAME}")

# Set the folder where you want to save the final GeoPackage
OUTPUT_FOLDER = "GEDI_Fire_Dates"

# Use os.path.join for platform compatibility (best practice)
OUTPUT_GPKG_PATH = os.path.join(OUTPUT_FOLDER, OUTPUT_FILENAME)

File will be saved as: GEDI_Fire_Dates_0000053760-0000080640.gpkg


In [12]:
print("Starting Spatial Filtering")

try:
    # 1. Load both inputs
    fire_scars_src = rasterio.open(RASTER_FILE_PATH)
    gedi_all_footprints = gpd.read_file(GEDI_FILE_PATH)
    
    initial_count = len(gedi_all_footprints)
    print(f"Loaded {initial_count} footprints before filtering.")

    # 2. Get Raster Bounding Box (BBOX) and create the clipping geometry
    raster_bounds = fire_scars_src.bounds
    
    # Use shapely.box to create a valid Polygon geometry from the BoundingBox coordinates
    raster_polygon = box(
        raster_bounds.left, 
        raster_bounds.bottom, 
        raster_bounds.right, 
        raster_bounds.top
    )

    # Create the GeoSeries for clipping, setting its CRS to the raster's CRS
    raster_extent_poly = gpd.GeoSeries(
        [raster_polygon], 
        crs=fire_scars_src.crs
    )

    # 3. Perform the Clipping (Filtering)
    gedi_for_zonal_stats = (
    gedi_all_footprints.to_crs(fire_scars_src.crs)
    .clip(raster_extent_poly)
    )

    # Repair invalid geometries but keep attributes
    gedi_for_zonal_stats["geometry"] = gedi_for_zonal_stats.geometry.buffer(0)

    # Remove empty geometries
    gedi_for_zonal_stats = gedi_for_zonal_stats[~gedi_for_zonal_stats.geometry.is_empty]

    # Ensure CRS is set correctly
    gedi_for_zonal_stats = gedi_for_zonal_stats.set_crs(fire_scars_src.crs, allow_override=True)
    final_count = len(gedi_for_zonal_stats)

    print(f"Filtered down to {final_count} footprints inside the raster extent.")
    print(f"Successfully clipped {initial_count - final_count} footprints.")

finally:
    # Ensure the raster file handle is closed immediately after getting its bounds
    if not fire_scars_src.closed:
        fire_scars_src.close()
    print("Raster source file closed.")
    print("--------------------------------")

Starting Spatial Filtering
Loaded 1425683 footprints before filtering.
Filtered down to 0 footprints inside the raster extent.
Successfully clipped 1425683 footprints.
Raster source file closed.
--------------------------------


In [17]:
gedi_for_zonal_stats.head()

Unnamed: 0,solar_elevation,beam,flags,elevation,agbd,sensitivity,track,orbit,gedi_time,index_right,geometry
1254305,50.86795,6,134,127.868805,0.078555,0.981744,10991,11523,2020-12-24 17:37:52.316,360061,"POLYGON ((-47.96747 -5.57841, -47.96747 -5.578..."
1255062,-1.728935,6,134,126.508835,1.681483,0.983874,5062,5247,2019-11-15 21:12:32.416,359940,"POLYGON ((-47.98042 -5.57474, -47.98042 -5.574..."
1255063,-1.729386,6,134,126.740173,5.192692,0.989204,5062,5247,2019-11-15 21:12:32.424,359940,"POLYGON ((-47.98012 -5.57432, -47.98012 -5.574..."
1255064,-1.729836,6,134,120.966354,50.916496,0.990079,5062,5247,2019-11-15 21:12:32.432,359940,"POLYGON ((-47.97982 -5.5739, -47.97982 -5.5739..."
1255065,-1.730286,6,134,128.061768,17.863623,0.988968,5062,5247,2019-11-15 21:12:32.441,359940,"POLYGON ((-47.97952 -5.57348, -47.97952 -5.573..."


In [4]:
# Corrected function to extract fire dates with None handling

def extract_fire_dates(fire_scars_src, GEDI_shots_gdf): 
    with fire_scars_src as src: 
        gdf_polygons = GEDI_shots_gdf.copy()

        band_years = [int(re.search(r"(\d{4})", d).group(1)) for d in src.descriptions]

        all_fire_dates = []
        fire_counts = []

        print(f"Starting Zonal Stats for {len(gdf_polygons)} footprints...")
        
        for poly in tqdm(gdf_polygons.geometry, desc="Extracting Fire Info"):
            dates = []
            for b in range(1, src.count + 1):
                stats = zonal_stats(poly, raster_path, stats="majority", band=b)
                month = stats[0].get("majority", None)
                if month is not None and month > 0: 
                    year = band_years[b - 1]
                    dates.append(date(year, int(month), 1))
            all_fire_dates.append(dates)
            fire_counts.append(len(dates))
        
        gdf_polygons["fire_dates"] = all_fire_dates
        gdf_polygons["fire_count"] = fire_counts
        gdf_polygons["first_fire_date"] = [d[0] if d else None for d in all_fire_dates]
        gdf_polygons["last_fire_date"]  = [d[-1] if d else None for d in all_fire_dates]

        # QGIS-friendly conversions
        gdf_polygons["fire_dates_str"] = [
            ",".join(d.strftime("%Y-%m-%d") for d in dates) if dates else ""
            for dates in gdf_polygons["fire_dates"]
        ]
        import pandas as pd
        gdf_polygons["first_fire_date"] = pd.to_datetime(gdf_polygons["first_fire_date"])
        gdf_polygons["last_fire_date"]  = pd.to_datetime(gdf_polygons["last_fire_date"])

        # Optional: drop raw list column
        gdf_polygons = gdf_polygons.drop(columns=["fire_dates"])

        return gdf_polygons



In [8]:
# Configuration for batching
BATCH_SIZE = 50000  # adjust to suit memory/speed

total = len(gedi_for_zonal_stats)
num_batches = math.ceil(total / BATCH_SIZE)

print("\n" + "="*60)
print(f"Processing {total} footprints in {num_batches} batches of {BATCH_SIZE}...")
print(f"Tile ID: {RASTER_ID}")
print("="*60)

# Re-open the raster source for processing
fire_scars_src = rasterio.open(RASTER_FILE_PATH)

try:
    for i in range(num_batches):
        start = i * BATCH_SIZE
        end = min((i + 1) * BATCH_SIZE, total)
        
        # Build batch filename with tile ID + batch number
        batch_filename = f"GEDI_Fire_Dates_{RASTER_ID}_batch{i+1}.gpkg"
        batch_path = os.path.join(OUTPUT_FOLDER, batch_filename)

        # Safe restart: skip if this batch file already exists
        if os.path.exists(batch_path):
            print(f"Skipping batch {i+1}/{num_batches} (already exists): {batch_path}")
            continue

        print(f"\n--- Batch {i+1}/{num_batches}: footprints {start} to {end} ---")
        batch_gdf = gedi_for_zonal_stats.iloc[start:end]

        # Run extraction for this batch
        batch_result = extract_fire_dates(fire_scars_src, batch_gdf)

        # Save incrementally
        batch_result.to_file(batch_path, driver="GPKG")
        print(f"✅ Saved batch {i+1} to {batch_path}")

finally:
    if not fire_scars_src.closed:
        fire_scars_src.close()
    print("Raster source file closed.")



Processing 161550 footprints in 4 batches of 50000...
Tile ID: 0000053760-0000026880

--- Batch 1/4: footprints 0 to 50000 ---
Starting Zonal Stats for 50000 footprints...


Extracting Fire Info: 100%|██████████| 50000/50000 [31:16<00:00, 26.65it/s]


✅ Saved batch 1 to GEDI_Fire_Dates\GEDI_Fire_Dates_0000053760-0000026880_batch1.gpkg

--- Batch 2/4: footprints 50000 to 100000 ---
Starting Zonal Stats for 50000 footprints...


Extracting Fire Info: 100%|██████████| 50000/50000 [31:25<00:00, 26.52it/s]


✅ Saved batch 2 to GEDI_Fire_Dates\GEDI_Fire_Dates_0000053760-0000026880_batch2.gpkg

--- Batch 3/4: footprints 100000 to 150000 ---
Starting Zonal Stats for 50000 footprints...


Extracting Fire Info: 100%|██████████| 50000/50000 [31:58<00:00, 26.07it/s]


✅ Saved batch 3 to GEDI_Fire_Dates\GEDI_Fire_Dates_0000053760-0000026880_batch3.gpkg

--- Batch 4/4: footprints 150000 to 161550 ---
Starting Zonal Stats for 11550 footprints...


Extracting Fire Info: 100%|██████████| 11550/11550 [07:40<00:00, 25.11it/s]


✅ Saved batch 4 to GEDI_Fire_Dates\GEDI_Fire_Dates_0000053760-0000026880_batch4.gpkg
Raster source file closed.
