In [1]:
# activate autoreload: load the IPython `autoreload` extension and switch it to mode 2,
# so that edits to imported project modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

import os
import sys

# Detect Colab vs. local: the `google.colab` package is only importable inside a Colab runtime.
try:
    import google.colab  # noqa: F401 -- imported purely as a probe
    IN_COLAB = True
    print('Google Colab session!')
except ImportError:
    IN_COLAB = False
    print('Not a Google Colab session.')

if IN_COLAB:
    # On Colab the project lives on the mounted Google Drive.
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = '/content/drive/MyDrive/papers/2025b_relevance_2.0'
else:
    # Notebooks have no `__file__`, so the project root is taken to be the parent
    # of the current working directory (the folder the notebook is run from).
    PROJECT_ROOT = os.path.dirname(os.getcwd())

# Make project-local modules importable (added at most once).
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# set standardised paths
DATA_PATH: str = os.path.join(PROJECT_ROOT, 'data')
FIGURE_PATH: str = os.path.join(PROJECT_ROOT, 'figures')
print(f"PROJECT_ROOT = {PROJECT_ROOT}")
print(f"DATA_PATH    = {DATA_PATH}")
print(f"FIGURE_PATH    = {FIGURE_PATH}")

Not a Google Colab session.
PROJECT_ROOT = /mnt/c/Users/DavidHanny/OneDrive - IT U interdisciplinary transformation university austria/Documents/projects/papers/2025_GSAI_RES_LLM_Contextual_Predictions
DATA_PATH    = /mnt/c/Users/DavidHanny/OneDrive - IT U interdisciplinary transformation university austria/Documents/projects/papers/2025_GSAI_RES_LLM_Contextual_Predictions/data
FIGURE_PATH    = /mnt/c/Users/DavidHanny/OneDrive - IT U interdisciplinary transformation university austria/Documents/projects/papers/2025_GSAI_RES_LLM_Contextual_Predictions/figures


# **Ground truth data preparation**
In this notebook, we prepare the ground truth event delineations for the following two events:
- the January 2025 Southern California (SoCal) wildfires
- the September 2024 Central Europe floods

In [2]:
import datetime
import re

import numpy as np
import pandas as pd
import geopandas as gpd
import contextily as ctx
import matplotlib.pyplot as plt
from ipywidgets import interact, Dropdown
from shapely.ops import unary_union
from shapely.validation import make_valid
from tqdm.auto import tqdm
tqdm.pandas()

## **1. 2025 SoCal wildfires**
For the 2025 wildfires in Southern California, we obtained active fire detections from the NASA Fire Information for Resource Management System (FIRMS). It provides near real-time active fire detections using satellite observations from NASA's MODIS and VIIRS instruments.

In [4]:
# Folder holding the raw FIRMS shapefile export; the parquet copy is written next to it
firms_dir = os.path.join(DATA_PATH, 'raw', '2025_socal_wildfires', 'nasa_firms')
fire_delineations: gpd.GeoDataFrame = gpd.read_file(
    os.path.join(firms_dir, 'DL_FIRE_M-C61_615073', 'fire_archive_M-C61_615073.shp')
)

# Acquisition day as a datetime column
fire_delineations['ACQ_DATE'] = pd.to_datetime(fire_delineations['ACQ_DATE'])

# ACQ_TIME is left-padded with zeros to four characters and then read as HHMM -> "HH:MM"
acq_time_padded = fire_delineations['ACQ_TIME'].astype(str).str.zfill(4)
acq_clock = acq_time_padded.str.slice(0, 2) + ':' + acq_time_padded.str.slice(2)

# Glue day and clock time together and parse the result into one timestamp per row
acq_day = fire_delineations['ACQ_DATE'].dt.strftime('%Y-%m-%d')
fire_delineations['ACQ_DATETIME'] = pd.to_datetime(acq_day + ' ' + acq_clock, format='%Y-%m-%d %H:%M')

# Write the enriched table to parquet and show it as a plain DataFrame
fire_delineations.to_parquet(os.path.join(firms_dir, 'modis_fire_delineations.parquet'))
pd.DataFrame(fire_delineations)

Unnamed: 0,LATITUDE,LONGITUDE,BRIGHTNESS,SCAN,TRACK,ACQ_DATE,ACQ_TIME,SATELLITE,INSTRUMENT,CONFIDENCE,VERSION,BRIGHT_T31,FRP,DAYNIGHT,TYPE,geometry,ACQ_DATETIME
0,32.7404,-114.6500,319.4,1.2,1.1,2024-12-24,1038,Aqua,MODIS,98,61.03,278.5,27.1,N,0,POINT (-114.65 32.7404),2024-12-24 10:38:00
1,36.4977,-119.6244,301.7,1.4,1.2,2024-12-29,1815,Terra,MODIS,49,61.03,281.3,9.2,D,0,POINT (-119.6244 36.4977),2024-12-29 18:15:00
2,33.5558,-116.6458,309.2,1.2,1.1,2024-12-29,2139,Aqua,MODIS,60,61.03,298.1,7.1,D,0,POINT (-116.6458 33.5558),2024-12-29 21:39:00
3,32.6542,-114.8122,314.0,1.0,1.0,2024-12-29,2139,Aqua,MODIS,65,61.03,300.8,8.9,D,0,POINT (-114.8122 32.6542),2024-12-29 21:39:00
4,32.6190,-116.1323,312.7,1.1,1.1,2024-12-29,2139,Aqua,MODIS,63,61.03,299.4,7.6,D,0,POINT (-116.1323 32.619),2024-12-29 21:39:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,40.2805,-123.0807,320.3,3.4,1.7,2025-01-30,2128,Aqua,MODIS,80,61.03,280.1,116.4,D,0,POINT (-123.0807 40.2805),2025-01-30 21:28:00
920,40.2703,-123.1202,328.3,3.4,1.7,2025-01-30,2128,Aqua,MODIS,86,61.03,282.3,168.9,D,0,POINT (-123.1202 40.2703),2025-01-30 21:28:00
921,40.2712,-123.0997,317.0,4.5,2.0,2025-01-30,2305,Aqua,MODIS,67,61.03,278.1,161.8,D,0,POINT (-123.0997 40.2712),2025-01-30 23:05:00
922,40.2721,-123.1039,317.3,4.5,2.0,2025-01-30,2305,Aqua,MODIS,68,61.03,278.2,164.4,D,0,POINT (-123.1039 40.2721),2025-01-30 23:05:00


Perhaps we can also visualise this a bit.

In [None]:
# Prepare interactive controls: one dropdown entry per acquisition day present in the data
dates: list[pd.Timestamp] = sorted(fire_delineations['ACQ_DATE'].unique())

# Plotting function
@interact(
    date=Dropdown(options=dates, description='Day:'),
)
def plot_fire_delineations(date: pd.Timestamp):
    """Plot all fire detections acquired on ``date`` on top of a web basemap.

    Args:
        date: Acquisition day to display; rows are selected where ``ACQ_DATE == date``.
    """
    subset: gpd.GeoDataFrame = fire_delineations[fire_delineations['ACQ_DATE'] == date]
    fig, ax = plt.subplots()
    # Reproject to EPSG:3857 (Web Mercator) before adding the contextily basemap
    subset.to_crs(3857).plot(marker='s', color='dimgray', ax=ax)
    ctx.add_basemap(ax=ax)
    # Label the figure so a rendered output identifies the selected day on its own
    ax.set_title(f"Fire detections on {pd.Timestamp(date):%Y-%m-%d}")

## **2. 2024 Central Europe floods**
To obtain flood delineations for the Central Europe floods, we used the Copernicus Global Flood Monitoring System (GFM), obtaining the *observed flood extent* product for Austria, Czechia, Hungary, Poland, Romania and Slovakia.

The new GFM provides continuous monitoring of floods worldwide by immediately processing and analyzing all incoming Copernicus Sentinel-1 Synthetic Aperture Radar (SAR) satellite data. Because the system is fully automated, one of its strengths is the high timeliness of its products. In addition, the implementation of three independently developed state-of-the-art satellite flood mapping algorithms underpins the robustness and high quality of the derived flood and water extent maps.

<strike>
**Alternative option with less data**

https://emergency.copernicus.eu/news/floods-in-central-eastern-europe-september-2024/

For the September 2024 Central Europe floods, we might alternatively obtain delineations of the flooded areas from the Copernicus Emergency Management System (CEMS). The CEMS On-demand mapping component has received the following activations to monitor the flood extent and assess the damage:
1. EMSR766 Flood in Croatia
2. EMSR764 Flood in Lower Austria, Austria
3. EMSR763 Flood in Germany
4. EMSR762 Flood in Emilia-Romagna, Italy
5. EMSR761 Flood in Dresden, Germany
6. EMSR759 Flood on the Danube in Hungary, Austria, and Slovakia
7. EMSR758 Flood in Galati and Vaslui counties, Romania
8. EMSR757 Flood in March, Morava Basins, Slovakia
9. EMSR756 Flood in South West Poland
10. EMSR755 Flood in Brandenbourg, Germany
</strike>

We downloaded and merged the GFM data for all six countries. To generate a meaningful ground truth, we need to merge it all.

In [None]:
# Directory that is searched recursively for the downloaded GFM GeoJSON files
ROOT_DIR: str = os.path.join(DATA_PATH, 'raw', '2024_central_europe_floods', 'gfm', 'merged')

# Regex that splits a flood delineation file name into its named components:
# - equi7grid_cont => Continent (excluding Antarctica) and spatial resolution in the GFM's Equi7Grid global grid and tiling system.
# - equi7grid_tile_id => Unique identifier of tile in the GFM's Equi7Grid global grid and tiling system.
# - sensing_date => Date and time of sensing in compact form YYYYMMDDThhmmss (e.g. 20240929T044555).
# - polarisation => Sentinel-1 SAR polarisation scheme; VV = Vertical Transmit and Vertical Receive co-polarised SAR data.
# - equi7grid_cont_2 (+020M) => Continent (excluding Antarctica) and spatial resolution of 20m.
# - equi7grid_tile_id_2 => Second occurrence of the tile identifier in the file name.
# - file_date => Date the file is associated with, in compact form YYYYMMDD.
# Example string: EU_E054N012T3_ENSEMBLE_FLOOD_20240929T044555_VV_EU020M_E054N012T3_20240929.geojson
FLOOD_DEL_PATTERN = re.compile(
    r'^(?P<equi7grid_cont>[A-Za-z]+)_'
    r'(?P<equi7grid_tile_id>[A-Za-z0-9]+)_ENSEMBLE_FLOOD_'
    r'(?P<sensing_date>[A-Za-z0-9]+)_'
    r'(?P<polarisation>[A-Za-z]+)_'
    r'(?P<equi7grid_cont_2>[A-Z]{2}020M)_'
    r'(?P<equi7grid_tile_id_2>[A-Za-z0-9]+)_'
    r'(?P<file_date>[0-9]+)\.geojson$'
)

# Regex that splits a satellite footprint file name into its named components.
# Expected layout: <equi7grid_cont>_<equi7grid_tile_id>_sentinel-1_footprint_<unique_id>_<file_date>.geojson
# where unique_id and file_date consist of digits only.
FOOTPRINT_PATTERN = re.compile(
    r'^(?P<equi7grid_cont>[A-Za-z]+)_'
    r'(?P<equi7grid_tile_id>[A-Za-z0-9]+)_sentinel-1_footprint_'
    r'(?P<unique_id>[0-9]+)_'
    r'(?P<file_date>[0-9]+)\.geojson$'
)

def _read_geojson_with_metadata(path: str, metadata: dict) -> gpd.GeoDataFrame:
    """Read one GeoJSON file and attach every metadata entry as a constant column.

    Args:
        path: Location of the GeoJSON file.
        metadata: Mapping of column name to the value stored in every row.
            Columns are appended in the mapping's iteration order.

    Returns:
        The file's features with the metadata columns added.
    """
    gdf = gpd.read_file(path)
    for column, value in metadata.items():
        gdf[column] = value
    return gdf


# Collect GeoDataFrames
delineation_gdfs: list[gpd.GeoDataFrame] = []
footprint_gdfs: list[gpd.GeoDataFrame] = []

# Iterate over all folders and files in the data directory
for root, dirs, files in tqdm(os.walk(ROOT_DIR), desc='Searching for GeoJSON files', unit='folder'):
    # os.walk returns entries in arbitrary (filesystem) order; sorting the sub-folders
    # in place and the file names makes the traversal, and thus the row order of the
    # combined tables, reproducible across machines.
    dirs.sort()
    for fname in sorted(files):
        if not fname.lower().endswith('.geojson'):
            continue

        path: str = os.path.join(root, fname)

        # Try flood delineation pattern
        delineation_match = FLOOD_DEL_PATTERN.match(fname)
        if delineation_match:
            info = delineation_match.groupdict()
            # Dates are parsed before the file is read, so a malformed name fails fast
            delineation_gdfs.append(_read_geojson_with_metadata(path, {
                'equi7grid_cont': info['equi7grid_cont'],
                'equi7grid_cont_2': info['equi7grid_cont_2'],
                'equi7grid_tile_id': info['equi7grid_tile_id'],
                'equi7grid_tile_id_2': info['equi7grid_tile_id_2'],
                'polarisation': info['polarisation'],
                'sensing_date': pd.to_datetime(info['sensing_date'], format='%Y%m%dT%H%M%S'),
                'file_date': pd.to_datetime(info['file_date'], format='%Y%m%d'),
            }))
            continue

        # Try footprint pattern
        footprint_match = FOOTPRINT_PATTERN.match(fname)
        if footprint_match:
            info = footprint_match.groupdict()
            footprint_gdfs.append(_read_geojson_with_metadata(path, {
                'equi7grid_cont': info['equi7grid_cont'],
                'equi7grid_tile_id': info['equi7grid_tile_id'],
                'unique_id': info['unique_id'],
                'file_date': pd.to_datetime(info['file_date'], format='%Y%m%d'),
            }))
            continue

        # Unmatched
        print(f"Skipping (unmatched): {fname}")


def _combine_and_save(
    gdfs: list,
    filename: str,
    kind: str,
    empty_message: str,
) -> gpd.GeoDataFrame:
    """Concatenate per-file GeoDataFrames, write them to parquet and report the result.

    Args:
        gdfs: GeoDataFrames to stack; the CRS of the first one is used for the result.
        filename: Name of the parquet file created in the GFM raw-data folder.
        kind: Word used in the progress message (e.g. ``'delineation'``).
        empty_message: Message printed when ``gdfs`` is empty.

    Returns:
        The combined GeoDataFrame, or an empty GeoDataFrame when ``gdfs`` is empty
        (nothing is written in that case).
    """
    if not gdfs:
        print(empty_message)
        # Return an empty frame so the name is always bound for the rest of the cell
        return gpd.GeoDataFrame()

    combined = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=gdfs[0].crs)
    combined.to_parquet(
        os.path.join(DATA_PATH, 'raw', '2024_central_europe_floods', 'gfm', filename)
    )
    print(f"Combined {len(gdfs)} {kind} files into {len(combined)} features.")
    return combined


# Combine and save delineations
combined_delineations = _combine_and_save(
    delineation_gdfs,
    filename='combined_flood_delineations.parquet',
    kind='delineation',
    empty_message="No flood delineation GeoJSONs found or matched the pattern.",
)

# Combine and save footprints
combined_footprints = _combine_and_save(
    footprint_gdfs,
    filename='combined_footprints.parquet',
    kind='footprint',
    empty_message="No footprint GeoJSONs found or matched the pattern.",
)

pd.DataFrame(combined_delineations)

Searching for GeoJSON files: 31folder [00:56,  1.81s/folder]


Combined 1144 delineation files into 134885 features.
Combined 1146 footprint files into 1146 features.


Unnamed: 0,DN,geometry,equi7grid_cont,equi7grid_cont_2,equi7grid_tile_id,equi7grid_tile_id_2,polarisation,sensing_date,file_date
0,1.0,"POLYGON ((20.77297 43.71864, 20.77297 43.71843...",EU,EU020M,E054N009T3,E054N009T3,VV,2024-09-01 16:25:22,2024-09-01
1,1.0,"POLYGON ((21.40436 43.69692, 21.40436 43.69671...",EU,EU020M,E054N009T3,E054N009T3,VV,2024-09-01 16:25:22,2024-09-01
2,1.0,"POLYGON ((20.95005 43.63937, 20.95005 43.63916...",EU,EU020M,E054N009T3,E054N009T3,VV,2024-09-01 16:25:22,2024-09-01
3,1.0,"POLYGON ((21.01519 43.62988, 21.01519 43.62967...",EU,EU020M,E054N009T3,E054N009T3,VV,2024-09-01 16:25:22,2024-09-01
4,1.0,"POLYGON ((21.02953 43.63094, 21.02953 43.63073...",EU,EU020M,E054N009T3,E054N009T3,VV,2024-09-01 16:25:22,2024-09-01
...,...,...,...,...,...,...,...,...,...
134880,1.0,"POLYGON ((17.77175 52.99709, 17.77175 52.99685...",EU,EU020M,E054N021T3,E054N021T3,VV,2024-09-30 16:36:30,2024-09-30
134881,1.0,"POLYGON ((17.77079 52.99709, 17.77079 52.99685...",EU,EU020M,E054N021T3,E054N021T3,VV,2024-09-30 16:36:30,2024-09-30
134882,1.0,"POLYGON ((17.76984 52.99685, 17.76984 52.99661...",EU,EU020M,E054N021T3,E054N021T3,VV,2024-09-30 16:36:30,2024-09-30
134883,1.0,"POLYGON ((17.54106 52.9718, 17.54106 52.97132,...",EU,EU020M,E054N021T3,E054N021T3,VV,2024-09-30 16:36:30,2024-09-30


In [15]:
# Show the combined footprint table as a plain pandas DataFrame (last expression of the cell)
pd.DataFrame(combined_footprints)

Unnamed: 0,identifier,time_begin,time_end,geometry,equi7grid_cont,equi7grid_tile_id,unique_id,file_date
0,S1A_IW_GRDH_1SDV_20240901T162522_20240901T1625...,2024-09-01 16:25:22.869,2024-09-01 16:25:47.868,"POLYGON ((21.00955 42.4215, 24.18848 42.82938,...",EU,E054N009T3,4963189,2024-09-01
1,S1A_IW_GRDH_1SDV_20240901T162547_20240901T1626...,2024-09-01 16:25:47.869,2024-09-01 16:26:12.868,"POLYGON ((20.6005 43.92047, 23.86017 44.32815,...",EU,E054N009T3,4963320,2024-09-01
2,S1A_IW_GRDH_1SDV_20240901T162612_20240901T1626...,2024-09-01 16:26:12.869,2024-09-01 16:26:37.867,"POLYGON ((20.17851 45.41832, 23.52526 45.82621...",EU,E054N012T3,4963208,2024-09-01
3,S1A_IW_GRDH_1SDV_20240901T162637_20240901T1627...,2024-09-01 16:26:37.868,2024-09-01 16:27:02.867,"POLYGON ((19.74934 46.91588, 23.19021 47.32422...",EU,E054N012T3,4963307,2024-09-01
4,S1A_IW_GRDH_1SDV_20240901T162547_20240901T1626...,2024-09-01 16:25:47.869,2024-09-01 16:26:12.868,"POLYGON ((20.6005 43.92047, 23.86017 44.32815,...",EU,E054N012T3,4963321,2024-09-01
...,...,...,...,...,...,...,...,...
1141,S1A_IW_GRDH_1SDV_20240930T163540_20240930T1636...,2024-09-30 16:35:40.455,2024-09-30 16:36:05.454,"POLYGON ((16.83065 49.7877, 20.47293 50.19798,...",EU,E054N018T3,5112818,2024-09-30
1142,S1A_IW_GRDH_1SDV_20240930T163605_20240930T1636...,2024-09-30 16:36:05.455,2024-09-30 16:36:30.454,"POLYGON ((16.34537 51.28087, 20.10642 51.69337...",EU,E054N018T3,5112921,2024-09-30
1143,S1A_IW_GRDH_1SDV_20240930T163630_20240930T1636...,2024-09-30 16:36:30.455,2024-09-30 16:36:55.454,"POLYGON ((15.84316 52.77319, 19.73504 53.18857...",EU,E054N021T3,5112840,2024-09-30
1144,S1A_IW_GRDH_1SDV_20240930T163655_20240930T1637...,2024-09-30 16:36:55.455,2024-09-30 16:37:20.454,"POLYGON ((15.31659 54.26393, 19.34951 54.68284...",EU,E054N021T3,5112877,2024-09-30


To make things easier in the future, we can merge the delineations with footprints.

In [16]:
# Keep a copy of the footprint polygon in an ordinary column so that it is carried
# into the join result alongside the delineation geometry
combined_footprints['footprint_geometry'] = combined_footprints['geometry']

# Left spatial join: delineations are matched to intersecting footprints, with
# `on_attribute` restricting matches to the same continent, tile and file date
joined_gdf = gpd.sjoin(
    combined_delineations,
    combined_footprints,
    how='left',
    predicate='intersects',
    on_attribute=['equi7grid_cont', 'equi7grid_tile_id', 'file_date'],
)
delinations_with_footprints_gdf = joined_gdf.drop_duplicates().drop(columns=['index_right'])

# Compare row/column counts before and after the join
print(combined_delineations.shape)
print(delinations_with_footprints_gdf.shape)

# Store the joined table and display it
joined_parquet_path = os.path.join(
    DATA_PATH, 'raw', '2024_central_europe_floods', 'gfm', 'delineations_w_footprints.parquet'
)
delinations_with_footprints_gdf.to_parquet(joined_parquet_path)
pd.DataFrame(delinations_with_footprints_gdf)

(134885, 9)
(135046, 14)


Unnamed: 0,DN,geometry,equi7grid_cont,equi7grid_cont_2,equi7grid_tile_id,equi7grid_tile_id_2,polarisation,sensing_date,file_date,identifier,time_begin,time_end,unique_id,footprint_geometry
0,1.0,"POLYGON ((20.77297 43.71864, 20.77297 43.71843...",EU,EU020M,E054N009T3,E054N009T3,VV,2024-09-01 16:25:22,2024-09-01,S1A_IW_GRDH_1SDV_20240901T162522_20240901T1625...,2024-09-01 16:25:22.869,2024-09-01 16:25:47.868,4963189,"POLYGON ((21.00955 42.4215, 24.18848 42.82938,..."
1,1.0,"POLYGON ((21.40436 43.69692, 21.40436 43.69671...",EU,EU020M,E054N009T3,E054N009T3,VV,2024-09-01 16:25:22,2024-09-01,S1A_IW_GRDH_1SDV_20240901T162522_20240901T1625...,2024-09-01 16:25:22.869,2024-09-01 16:25:47.868,4963189,"POLYGON ((21.00955 42.4215, 24.18848 42.82938,..."
2,1.0,"POLYGON ((20.95005 43.63937, 20.95005 43.63916...",EU,EU020M,E054N009T3,E054N009T3,VV,2024-09-01 16:25:22,2024-09-01,S1A_IW_GRDH_1SDV_20240901T162522_20240901T1625...,2024-09-01 16:25:22.869,2024-09-01 16:25:47.868,4963189,"POLYGON ((21.00955 42.4215, 24.18848 42.82938,..."
3,1.0,"POLYGON ((21.01519 43.62988, 21.01519 43.62967...",EU,EU020M,E054N009T3,E054N009T3,VV,2024-09-01 16:25:22,2024-09-01,S1A_IW_GRDH_1SDV_20240901T162522_20240901T1625...,2024-09-01 16:25:22.869,2024-09-01 16:25:47.868,4963189,"POLYGON ((21.00955 42.4215, 24.18848 42.82938,..."
4,1.0,"POLYGON ((21.02953 43.63094, 21.02953 43.63073...",EU,EU020M,E054N009T3,E054N009T3,VV,2024-09-01 16:25:22,2024-09-01,S1A_IW_GRDH_1SDV_20240901T162522_20240901T1625...,2024-09-01 16:25:22.869,2024-09-01 16:25:47.868,4963189,"POLYGON ((21.00955 42.4215, 24.18848 42.82938,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134880,1.0,"POLYGON ((17.77175 52.99709, 17.77175 52.99685...",EU,EU020M,E054N021T3,E054N021T3,VV,2024-09-30 16:36:30,2024-09-30,S1A_IW_GRDH_1SDV_20240930T163630_20240930T1636...,2024-09-30 16:36:30.455,2024-09-30 16:36:55.454,5112840,"POLYGON ((15.84316 52.77319, 19.73504 53.18857..."
134881,1.0,"POLYGON ((17.77079 52.99709, 17.77079 52.99685...",EU,EU020M,E054N021T3,E054N021T3,VV,2024-09-30 16:36:30,2024-09-30,S1A_IW_GRDH_1SDV_20240930T163630_20240930T1636...,2024-09-30 16:36:30.455,2024-09-30 16:36:55.454,5112840,"POLYGON ((15.84316 52.77319, 19.73504 53.18857..."
134882,1.0,"POLYGON ((17.76984 52.99685, 17.76984 52.99661...",EU,EU020M,E054N021T3,E054N021T3,VV,2024-09-30 16:36:30,2024-09-30,S1A_IW_GRDH_1SDV_20240930T163630_20240930T1636...,2024-09-30 16:36:30.455,2024-09-30 16:36:55.454,5112840,"POLYGON ((15.84316 52.77319, 19.73504 53.18857..."
134883,1.0,"POLYGON ((17.54106 52.9718, 17.54106 52.97132,...",EU,EU020M,E054N021T3,E054N021T3,VV,2024-09-30 16:36:30,2024-09-30,S1A_IW_GRDH_1SDV_20240930T163630_20240930T1636...,2024-09-30 16:36:30.455,2024-09-30 16:36:55.454,5112840,"POLYGON ((15.84316 52.77319, 19.73504 53.18857..."


We should also visualise our ground truth events.

In [None]:
# Prepare interactive controls: one dropdown entry per calendar day with a sensing time
dates: list[datetime.date] = sorted(delinations_with_footprints_gdf['sensing_date'].dt.date.unique())

# Plotting function
@interact(
    date=Dropdown(options=dates, description='Day:'),
)
def plot_flood_delineations(date: datetime.date):
    """Plot the flood delineations sensed on ``date`` together with their satellite footprints.

    Args:
        date: Calendar day to display; rows are selected where the date part of
            ``sensing_date`` equals this value.
    """
    subset: gpd.GeoDataFrame = delinations_with_footprints_gdf[
        delinations_with_footprints_gdf['sensing_date'].dt.date == date
    ]
    # Every delineation row carries its footprint polygon, so the same outline repeats
    # many times; drawing each distinct footprint once gives the same picture faster.
    footprint_subset: gpd.GeoDataFrame = (
        subset
        .drop_duplicates(subset='footprint_geometry')
        .set_geometry('footprint_geometry', crs=4326)
    )
    fig, ax = plt.subplots()
    # Both layers are reprojected to EPSG:3857 (Web Mercator) before the basemap is added
    subset.to_crs(3857).plot(color='blue', edgecolor='blue', linewidth=2, ax=ax)
    footprint_subset.to_crs(3857).plot(color='None', edgecolor='k', linewidth=2, ax=ax)
    ctx.add_basemap(ax=ax)
    # Label the figure so a rendered output identifies the selected day on its own
    ax.set_title(f"Flood delineations and satellite footprints on {date}")

interactive(children=(Dropdown(description='Day:', options=(datetime.date(2024, 9, 1), datetime.date(2024, 9, …