In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import geopandas as gpd
import pandas as pd
import polars as pl

from utils import flags_to_label_df, flags_to_sample_df

In [3]:
# Root of the FER share; every dataset path below is built from it.
# Alternative root (presumably the share mounted under /mnt — confirm):
# server_root = "/mnt/share/FER"
server_root = "//digs110/FER"

# Homogenize Sample Data

The following columns will be compiled.

Two files:

samples.geoparquet (? file format)

- Unique Sample ID (int)
- Original Sample ID (int)
- Interpreter (str)
- Source (str)
- S2 Tile (str)
- Cluster (Polygon ID, storm event) (str)
- Cluster Description (str)
- Comment (str)
- Confidence (categorical)
- Point (geometry)

labels.pq

- Timestamp (datetime)
- original label (str)
- Label (categorical)

In [4]:
# Empty, typed GeoDataFrame that serves as the target schema for all sample tables.
# Column order here defines the column order of the concatenated result.
samples_schema = gpd.GeoDataFrame(
    {
        **{
            column: pd.Series(dtype=dtype)
            for column, dtype in {
                "sample_id": "uint16[pyarrow]",
                "original_sample_id": "int64[pyarrow]",
                "interpreter": "string[pyarrow]",
                "dataset": "string[pyarrow]",
                "source": "string[pyarrow]",
                "source_description": "string[pyarrow]",
                "s2_tile": "string[pyarrow]",
                "cluster_id": "string[pyarrow]",
                "cluster_description": "string[pyarrow]",
                "comment": "string[pyarrow]",
                # kept as a string: PyArrow doesn't yet support 'categorical' as a dtype
                "confidence": "string[pyarrow]",
            }.items()
        },
        "geometry": gpd.GeoSeries(crs="EPSG:4326"),
    },
    geometry="geometry",
)

In [5]:
# Empty, typed DataFrame that serves as the target schema for all label tables.
labels_schema = pd.DataFrame(
    {
        column: pd.Series(dtype=dtype)
        for column, dtype in {
            "sample_id": "uint16[pyarrow]",
            "original_sample_id": "int64[pyarrow]",
            "dataset": "string[pyarrow]",
            "label": "uint16[pyarrow]",
            "original_label": "string[pyarrow]",
            "start": "timestamp[s, tz=UTC][pyarrow]",
            "end": "timestamp[s, tz=UTC][pyarrow]",
        }.items()
    }
)

# Label mapping 

The labels can be mapped roughly onto the categorization used for the ICP forest data, going from coarse to fine categories.
If a more exact cause isn't known, the coarser category can be set instead.

<pre>
100 - Alive Vegetation  
    110 - Mature Forest  
    120 - Revegetation  
        121 - With Trees (after clear cut)  
        122 - Canopy closing (after thinning/defoliation)  
        123 - Without Trees (shrubs and grasses, no reforestation visible)  

200 - Disturbed  
    210 - Planned  
        211 - Clear Cut  
        212 - Thinning  
        213 - Forestry Mulching (Non Forest Vegetation Removal) 
    220 - Salvage  
        221 - After Biotic Disturbance  
        222 - After Abiotic Disturbance  
    230 - Biotic  
        231 - Bark Beetle (with decline)  
        232 - Gypsy Moth (temporary)  
    240 - Abiotic  
        241 - Drought  
        242 - Wildfire  
        243 - Wind  
        244 - Avalanche  
        245 - Flood  
</pre>

## Evoland + FNEWs

In [6]:
# Evoland: multi-temporal reference points (shapefile on the FER share)
evoland = Path(
    f"{server_root}/EvoLand/WP2_6_CFM/Referenzdaten/Database/PointMultitemp/evoland_ref_points_mt.shp"
)
evoland_multitemp = gpd.read_file(evoland)

In [7]:
# Keep one row per sample "id": geometry and sample ID are likely repeated
# across rows of the multi-temporal table.
# The explicit copy avoids a SettingWithCopyWarning when columns are added later.
samples_df = evoland_multitemp.drop_duplicates(subset="id").copy()


def combine_comments(row):
    """Join the usable ``comment1``..``comment3`` values of ``row`` with single spaces.

    Missing values (None / NaN) and entries that are empty or whitespace-only
    are skipped. Returns an empty string when no comment remains.
    """
    kept = []
    for column in ("comment1", "comment2", "comment3"):
        value = row[column]
        if pd.isna(value):
            continue
        text = str(value)
        # whitespace-only comments carry no information
        if text.strip():
            kept.append(text)
    return " ".join(kept)

In [8]:
# Derive the data source and its description from the country in "site",
# then merge the three original comment columns into a single one.
samples_df["source"] = samples_df["site"].replace(
    dict(Spain="EFFIS", Germany="Regional Forestry Departments", Sweden="EFFIS")
)
samples_df["source_description"] = samples_df["site"].replace(
    dict(
        Spain="Evoland Project, EFFIS Source of Wildfire Polygons, manual interpretation based on S2 timeseries",
        Germany="FNews Project, German Forestry Departmetns Source of Disturbance Types, manual interpretation based on S2 timeseries",
        Sweden="Evoland Project, EFFIS Source of Wildfire Polygons, manual interpretation based on S2 timeseries",
    )
)
samples_df["comment"] = samples_df.apply(combine_comments, axis="columns")

In [9]:
# Parse the re-interpreted Evoland flag files (JSON): the label table and the
# per-sample table (confidence, comment, interpreter) come from the same files.
flag_paths = [*Path("../data/evoland_flags_reinterpreted").glob("*.json")]
raw_evo_labels = flags_to_label_df(flag_paths)
raw_evo_samples = flags_to_sample_df(flag_paths)

In [10]:
# Only the columns of the original Evoland table needed for the merged samples
subset_samples = samples_df.loc[
    :, ["id", "poly_id", "tile", "source", "source_description", "geometry"]
]

In [11]:
filtered_raw_samples = (
    raw_evo_samples.filter(
        pl.col("confidence").is_in(["high", "medium"]),
        # If interpreter is still evo, the sample has not been reinterpreted
        # (this is the case for a few sample points which are exact duplicates)
        pl.col("interpreter").ne(pl.lit("evo")),
    )
    .with_columns(
        pl.col("original_sample_id").cast(pl.Int64),
        # blank the comment if it also occurs in the original evoland table
        comment=pl.when(
            pl.col("comment").is_in(samples_df["comment"].drop_duplicates().to_list())
        )
        .then(pl.lit(""))
        .otherwise(pl.col("comment")),
    )
)
filtered_raw_samples

confidence,comment,interpreter,original_sample_id
str,str,str,i64
"""high""","""border, thinning, then clear c…","""vij""",0
"""medium""","""unclear progression, edge""","""vij""",1
"""high""","""very low tcd, new plantation""","""vij""",10
"""high""","""""","""vij""",100
"""high""","""""","""vij""",101
…,…,…,…
"""high""","""""","""vij""",992
"""high""","""""","""vij""",993
"""high""","""""","""vij""",995
"""high""","""""","""vij""",996


In [12]:
# Attach the original Evoland attributes and geometry to the re-interpreted samples
evo_samples_together = pd.merge(
    filtered_raw_samples.to_pandas(),
    subset_samples,
    left_on="original_sample_id",
    right_on="id",
)
evo_samples_together

Unnamed: 0,confidence,comment,interpreter,original_sample_id,id,poly_id,tile,source,source_description,geometry
0,high,"border, thinning, then clear cut",vij,0,0,0.0,30SUF,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",POINT (-4.12212 36.74179)
1,medium,"unclear progression, edge",vij,1,1,1.0,30SUF,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",POINT (-4.12161 36.74231)
2,high,"very low tcd, new plantation",vij,10,10,11.0,30SUF,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",POINT (-4.13315 36.76793)
3,high,,vij,100,100,38.0,30SUF,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",POINT (-5.14862 36.51116)
4,high,,vij,101,101,38.0,30SUF,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",POINT (-5.15389 36.50781)
...,...,...,...,...,...,...,...,...,...,...
908,high,,vij,992,992,331.0,32UNC,Regional Forestry Departments,"FNews Project, German Forestry Departmetns Sou...",POINT (10.14152 51.73675)
909,high,,vij,993,993,325.0,32UNC,Regional Forestry Departments,"FNews Project, German Forestry Departmetns Sou...",POINT (9.94794 51.79978)
910,high,,vij,995,995,391.0,32UNC,Regional Forestry Departments,"FNews Project, German Forestry Departmetns Sou...",POINT (10.24565 51.62535)
911,high,,vij,996,996,325.0,32UNC,Regional Forestry Departments,"FNews Project, German Forestry Departmetns Sou...",POINT (9.94913 51.80094)


In [13]:
# Assemble the Evoland samples in the common sample schema.
# Scalars are broadcast to every row; sample_id is assigned later, after all
# datasets have been concatenated.
samples_evo = (
    gpd.GeoDataFrame(
        {
            "original_sample_id": evo_samples_together["id"],
            # Use the interpreter recorded in the flag files instead of a
            # hard-coded initial, as done for the other datasets.
            "interpreter": evo_samples_together["interpreter"],
            "source": evo_samples_together["source"],
            "source_description": evo_samples_together["source_description"],
            "dataset": "Evoland",
            "s2_tile": evo_samples_together["tile"],
            "cluster_id": evo_samples_together["poly_id"],
            "cluster_description": "Damage polygons",
            "comment": evo_samples_together["comment"],
            "confidence": evo_samples_together["confidence"],
            "geometry": evo_samples_together["geometry"],
        },
        crs=samples_df.crs,  # Retain the original Coordinate Reference System
    )
    # cast every non-geometry column to the dtype declared in the schema
    .astype(samples_schema.dtypes.drop(["sample_id", "geometry"]))
    .sort_values("original_sample_id")
)

In [14]:
# Inspect the assembled Evoland samples
samples_evo

Unnamed: 0,original_sample_id,interpreter,source,source_description,dataset,s2_tile,cluster_id,cluster_description,comment,confidence,geometry
0,0,vij,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",Evoland,30SUF,0.0,Damage polygons,"border, thinning, then clear cut",high,POINT (-4.12212 36.74179)
1,1,vij,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",Evoland,30SUF,1.0,Damage polygons,"unclear progression, edge",medium,POINT (-4.12161 36.74231)
299,2,vij,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",Evoland,30SUF,2.0,Damage polygons,plantation,high,POINT (-4.1192 36.74203)
396,3,vij,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",Evoland,30SUF,3.0,Damage polygons,plantation,high,POINT (-4.12845 36.75831)
474,4,vij,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",Evoland,30SUF,5.0,Damage polygons,clear cut,high,POINT (-4.12816 36.75908)
...,...,...,...,...,...,...,...,...,...,...,...
215,1263,vij,Regional Forestry Departments,"FNews Project, German Forestry Departmetns Sou...",Evoland,33UVS,421.0,Damage polygons,,high,POINT (14.08833 50.99596)
216,1264,vij,Regional Forestry Departments,"FNews Project, German Forestry Departmetns Sou...",Evoland,33UVS,420.0,Damage polygons,,high,POINT (14.10273 51.0011)
217,1265,vij,Regional Forestry Departments,"FNews Project, German Forestry Departmetns Sou...",Evoland,33UVS,,Damage polygons,,high,POINT (14.11205 51.00166)
218,1266,vij,Regional Forestry Departments,"FNews Project, German Forestry Departmetns Sou...",Evoland,33UVS,419.0,Damage polygons,,high,POINT (14.10425 51.00364)


In [15]:
# Maps the single-character flag codes of the interpretation files to the
# numeric label classes listed under "Label mapping".
hrvpp_label_mapping = {
    # Segment flags
    **{
        "0": 110,  # Stable → Mature Forest
        "1": 231,  # Biotic disturbance / bark-beetle → Bark Beetle (with decline)
        "2": 230,  # Biotic disturbance / other → Biotic (unspecific)
        "3": 120,  # Growth → Revegetation (with trees / canopy closing)
        "4": 123,  # Revegetation → Without Trees (shrubs and grasses, no reforestation)
        "b": 123,  # Stable / non-forest → Without Trees (assumes non-forest like grassland/shrubland)
    },
    # Event flags
    **{
        "5": 211,  # Harvest / clear-cut → Clear Cut
        "6": 212,  # Harvest / thinning or selective logging → Thinning
        "7": 243,  # Abiotic disturbance / windthrow → Wind
        "8": 242,  # Abiotic disturbance / wildfire → Wildfire
        "9": 240,  # Abiotic disturbance / other → Abiotic (unspecific)
        "a": 220,  # Harvest / salvage logging → Salvage (unspecific; needs to be split into after biotic / after abiotic)
        "c": 213,  # Vegetation removal → Forestry Mulching (Non Forest Vegetation Removal)
        "d": 999,  # L3A artifact → No match in primary classes (metadata/quality issue)
        "e": 200,  # Unknown disturbance → Disturbed (unspecific)
    },
}

In [16]:
# Translate the raw flag codes into numeric label classes, tag the dataset and
# cast to the label schema (sample_id is assigned after concatenation).
labels_evo = (
    raw_evo_labels.with_columns(
        label=pl.col("original_label").replace_strict(hrvpp_label_mapping),
        dataset=pl.lit("Evoland"),
    )
    .to_pandas()
    .astype(labels_schema.dtypes.drop(labels="sample_id"))
)
labels_evo

Unnamed: 0,original_label,start,end,original_sample_id,label,dataset
0,0,2016-11-10 00:00:00+00:00,2022-03-09 00:00:00+00:00,0,110,Evoland
1,6,2022-04-08 00:00:00+00:00,2022-04-08 23:59:59+00:00,0,212,Evoland
2,5,2024-04-02 00:00:00+00:00,2024-04-02 23:59:59+00:00,0,211,Evoland
3,0,2016-11-10 00:00:00+00:00,2023-05-03 00:00:00+00:00,1,110,Evoland
4,5,2023-06-22 00:00:00+00:00,2023-06-22 23:59:59+00:00,1,211,Evoland
...,...,...,...,...,...,...
3059,4,2021-05-21 00:00:00+00:00,2024-12-01 00:00:00+00:00,996,123,Evoland
3060,0,2015-08-06 00:00:00+00:00,2019-04-25 00:00:00+00:00,997,110,Evoland
3061,1,2019-10-07 00:00:00+00:00,2020-04-21 00:00:00+00:00,997,231,Evoland
3062,a,2020-04-24 00:00:00+00:00,2020-04-24 23:59:59+00:00,997,220,Evoland


## HRVPP Calibration

In [17]:
# HR-VPP2 stage-1 flag files (JSON). The directory is built from `server_root`
# like the other share paths, so changing the root (e.g. to a Linux mount)
# also takes effect here.
flag_paths_hrvpp = list(
    Path(
        server_root
        + "/HR-VPP2/Subversion/InterpretationVersioned/stage1/Flags_L3A_json"
    ).glob("*.json")
)
raw_hrvpp_labels = flags_to_label_df(flag_paths_hrvpp)
raw_hrvpp_samples = flags_to_sample_df(flag_paths_hrvpp)

Missing Segment
0058
{
    "flags": {
        "2016-02-17 00:00:00.000000": "0",
        "2021-11-25 00:00:00.000000": "c",
        "2024-12-31 00:00:00.000000": "0"
    },
    "confidence": "medium",
    "comment": "",
    "interpreter": "pum"
}
Missing Segment
0083
{
    "flags": {
        "2016-02-17 00:00:00.000000": "0",
        "2017-02-14 00:00:00.000000": "c",
        "2024-12-26 00:00:00.000000": "0"
    },
    "confidence": "medium",
    "comment": "",
    "interpreter": "pum"
}
Missing Segment
0098
{
    "flags": {
        "2016-03-11 00:00:00.000000": "0",
        "2020-03-10 00:00:00.000000": "c",
        "2024-12-29 00:00:00.000000": "0"
    },
    "confidence": "medium",
    "comment": "very low TCD",
    "interpreter": "pum"
}
Missing Segment
0114
{
    "flags": {
        "2016-03-11 00:00:00.000000": "0",
        "2022-08-27 00:00:00.000000": "c",
        "2024-12-29 00:00:00.000000": "0"
    },
    "confidence": "medium",
    "comment": "soil management Aug 2022",
   

In [18]:
# NOTE(review): this cell repeats the HR-VPP2 flag loading of the previous cell
# and parses all files a second time; it can most likely be deleted.
# The directory is built from `server_root` like the other share paths.
flag_paths_hrvpp = list(
    Path(
        server_root
        + "/HR-VPP2/Subversion/InterpretationVersioned/stage1/Flags_L3A_json"
    ).glob("*.json")
)
raw_hrvpp_labels = flags_to_label_df(flag_paths_hrvpp)
raw_hrvpp_samples = flags_to_sample_df(flag_paths_hrvpp)

Missing Segment
0058
{
    "flags": {
        "2016-02-17 00:00:00.000000": "0",
        "2021-11-25 00:00:00.000000": "c",
        "2024-12-31 00:00:00.000000": "0"
    },
    "confidence": "medium",
    "comment": "",
    "interpreter": "pum"
}
Missing Segment
0083
{
    "flags": {
        "2016-02-17 00:00:00.000000": "0",
        "2017-02-14 00:00:00.000000": "c",
        "2024-12-26 00:00:00.000000": "0"
    },
    "confidence": "medium",
    "comment": "",
    "interpreter": "pum"
}
Missing Segment
0098
{
    "flags": {
        "2016-03-11 00:00:00.000000": "0",
        "2020-03-10 00:00:00.000000": "c",
        "2024-12-29 00:00:00.000000": "0"
    },
    "confidence": "medium",
    "comment": "very low TCD",
    "interpreter": "pum"
}
Missing Segment
0114
{
    "flags": {
        "2016-03-11 00:00:00.000000": "0",
        "2022-08-27 00:00:00.000000": "c",
        "2024-12-29 00:00:00.000000": "0"
    },
    "confidence": "medium",
    "comment": "soil management Aug 2022",
   

In [19]:
# HR-VPP2 calibration sample points (merged stage-1 shapefile on the FER share)
hrvpp_path = Path(
    f"{server_root}/HR-VPP2/Calibration/Interpretation/stage1/HR-VPP2_VDTC_calib_samples_stage1_merged.shp"
)
hrvpp = gpd.read_file(hrvpp_path)

In [20]:
# Drop low-confidence interpretations, then attach tile, stratum and geometry
# from the shapefile (matched on the string form of the sample id).
together_hrvpp = pd.merge(
    raw_hrvpp_samples.filter(pl.col("confidence") != "low").to_pandas(),
    hrvpp[["id", "id_str", "tile", "stratum", "geometry"]],
    left_on="original_sample_id",
    right_on="id_str",
)
together_hrvpp

Unnamed: 0,confidence,comment,interpreter,original_sample_id,id,id_str,tile,stratum,geometry
0,medium,drought 2023?,pum,0001,1,0001,29SPC,1,POINT (-6.77759 38.22003)
1,high,low TCD,vij,0002,2,0002,29SPC,1,POINT (-6.64978 38.46161)
2,high,,pum,0003,3,0003,29SPC,1,POINT (-7.09658 38.57429)
3,high,road - low TCD,vij,0004,4,0004,29SPC,1,POINT (-6.61063 38.36791)
4,high,low TCD,vij,0005,5,0005,29SPC,1,POINT (-6.74202 38.24211)
...,...,...,...,...,...,...,...,...,...
2611,high,,vij,2723,2723,2723,35VMJ,4042,POINT (25.51873 62.05893)
2612,high,,vij,2724,2724,2724,35VMJ,4042,POINT (25.3021 62.20753)
2613,medium,widening of a power line,vij,2725,2725,2725,35VMJ,4042,POINT (26.07052 62.13655)
2614,high,,pum,2726,2726,2726,35VMJ,4042,POINT (26.87233 61.92974)


In [21]:
# Assemble the HR-VPP samples in the common sample schema.
# Scalars are broadcast to every row; sample_id is assigned after concatenation.
samples_hrvpp = (
    gpd.GeoDataFrame(
        dict(
            original_sample_id=together_hrvpp["id"],
            interpreter=together_hrvpp["interpreter"],
            dataset="HRVPP",
            source="Sentinel 2 Time-series",
            source_description="HR-VPP2 Project, manual interpretation based on S2 timeseries, ESRI wayback and ESA High Resolution Images",
            s2_tile=together_hrvpp["tile"],
            cluster_id=together_hrvpp["stratum"],
            cluster_description="Stratum of the sampling",
            comment=together_hrvpp["comment"],
            confidence=together_hrvpp["confidence"],
            geometry=together_hrvpp["geometry"],
        ),
        crs=hrvpp.crs,  # keep the CRS of the source shapefile
    )
    # cast every non-geometry column to the dtype declared in the schema
    .astype(samples_schema.dtypes.drop(labels=["sample_id", "geometry"]))
    .sort_values(by="original_sample_id")
)

In [22]:
# Inspect the assembled HR-VPP samples
samples_hrvpp

Unnamed: 0,original_sample_id,interpreter,dataset,source,source_description,s2_tile,cluster_id,cluster_description,comment,confidence,geometry
0,1,pum,HRVPP,Sentinel 2 Time-series,"HR-VPP2 Project, manual interpretation based o...",29SPC,1,Stratum of the sampling,drought 2023?,medium,POINT (-6.77759 38.22003)
1,2,vij,HRVPP,Sentinel 2 Time-series,"HR-VPP2 Project, manual interpretation based o...",29SPC,1,Stratum of the sampling,low TCD,high,POINT (-6.64978 38.46161)
2,3,pum,HRVPP,Sentinel 2 Time-series,"HR-VPP2 Project, manual interpretation based o...",29SPC,1,Stratum of the sampling,,high,POINT (-7.09658 38.57429)
3,4,vij,HRVPP,Sentinel 2 Time-series,"HR-VPP2 Project, manual interpretation based o...",29SPC,1,Stratum of the sampling,road - low TCD,high,POINT (-6.61063 38.36791)
4,5,vij,HRVPP,Sentinel 2 Time-series,"HR-VPP2 Project, manual interpretation based o...",29SPC,1,Stratum of the sampling,low TCD,high,POINT (-6.74202 38.24211)
...,...,...,...,...,...,...,...,...,...,...,...
2611,2723,vij,HRVPP,Sentinel 2 Time-series,"HR-VPP2 Project, manual interpretation based o...",35VMJ,4042,Stratum of the sampling,,high,POINT (25.51873 62.05893)
2612,2724,vij,HRVPP,Sentinel 2 Time-series,"HR-VPP2 Project, manual interpretation based o...",35VMJ,4042,Stratum of the sampling,,high,POINT (25.3021 62.20753)
2613,2725,vij,HRVPP,Sentinel 2 Time-series,"HR-VPP2 Project, manual interpretation based o...",35VMJ,4042,Stratum of the sampling,widening of a power line,medium,POINT (26.07052 62.13655)
2614,2726,pum,HRVPP,Sentinel 2 Time-series,"HR-VPP2 Project, manual interpretation based o...",35VMJ,4042,Stratum of the sampling,,high,POINT (26.87233 61.92974)


In [23]:
# Translate the raw flag codes into numeric label classes, tag the dataset and
# cast to the label schema (sample_id is assigned after concatenation).
labels_hrvpp = (
    raw_hrvpp_labels.with_columns(
        label=pl.col("original_label").replace_strict(hrvpp_label_mapping),
        dataset=pl.lit("HRVPP"),
    )
    .to_pandas()
    .astype(labels_schema.dtypes.drop(labels="sample_id"))
)
labels_hrvpp

Unnamed: 0,original_label,start,end,original_sample_id,label,dataset
0,0,2016-03-01 00:00:00+00:00,2025-01-01 00:00:00+00:00,1,110,HRVPP
1,0,2016-03-01 00:00:00+00:00,2025-01-01 00:00:00+00:00,2,110,HRVPP
2,0,2016-03-01 00:00:00+00:00,2025-01-01 00:00:00+00:00,3,110,HRVPP
3,b,2016-03-01 00:00:00+00:00,2025-01-01 00:00:00+00:00,4,123,HRVPP
4,0,2016-03-01 00:00:00+00:00,2025-01-01 00:00:00+00:00,5,110,HRVPP
...,...,...,...,...,...,...
4080,4,2023-07-16 00:00:00+00:00,2024-11-01 00:00:00+00:00,2726,123,HRVPP
4081,0,2016-05-01 00:00:00+00:00,2018-05-01 00:00:00+00:00,2727,110,HRVPP
4082,5,2018-05-16 00:00:00+00:00,2018-05-16 23:59:59+00:00,2727,211,HRVPP
4083,4,2019-07-01 00:00:00+00:00,2024-11-01 00:00:00+00:00,2727,123,HRVPP


## Windthrow samples

Same format as HRVPP

In [24]:
# Windthrow sample points together with their event attributes (GeoJSON)
wt = gpd.read_file("../data/windthrow_flags.geojson")
# Show which attribute columns are available
wt.columns

Index(['Id_poly', 'EventDate', 'StormName', 'EventType', 'Country', 'Area',
       'Perimeter', 'Damage_deg', 'Methods', 'Dataprovid', 'Source', 'id',
       'EventId', 'max_inscri', 'centroid', 'data_loc', 'confidence',
       'comment', 'interpret1', 'segments', 'events', 'geometry'],
      dtype='object')

In [25]:
# Parse the windthrow flag files (JSON); the sample id is cast to Int32 in both
# tables before it is used as a merge key.
flag_paths_wt = [*Path("../data/windthrow_flags").glob("*.json")]
raw_wt_labels = flags_to_label_df(flag_paths_wt).with_columns(
    pl.col("original_sample_id").cast(pl.Int32)
)
raw_wt_samples = flags_to_sample_df(flag_paths_wt).with_columns(
    pl.col("original_sample_id").cast(pl.Int32)
)

Missing Segment
11237
{
    "flags": {
        "2016-12-02 00:00:00.000000": "0"
    },
    "confidence": "low",
    "comment": null,
    "interpreter": "vij"
}
Missing Segment
11241
{
    "flags": {
        "2016-12-02 00:00:00.000000": "0"
    },
    "confidence": "low",
    "comment": null,
    "interpreter": "vij"
}
Missing Segment
11245
{
    "flags": {
        "2016-12-05 00:00:00.000000": "0"
    },
    "confidence": "low",
    "comment": null,
    "interpreter": "vij"
}
Missing Segment
11260
{
    "flags": {
        "2016-12-02 00:00:00.000000": "0"
    },
    "confidence": "low",
    "comment": "no wt",
    "interpreter": "vij"
}
Missing Segment
11275
{
    "flags": {
        "2016-12-02 00:00:00.000000": "0"
    },
    "confidence": "low",
    "comment": null,
    "interpreter": "vij"
}
Missing Segment
11311
{
    "flags": {
        "2016-12-02 00:00:00.000000": "0"
    },
    "confidence": "low",
    "comment": "likely no wt",
    "interpreter": "vij"
}
Missing Segment
11326

In [27]:
# Drop low-confidence interpretations, then attach source, event id and
# geometry from the windthrow GeoJSON.
together_wt = pd.merge(
    raw_wt_samples.filter(pl.col("confidence") != "low").to_pandas(),
    wt[["id", "Source", "EventId", "geometry"]],
    left_on="original_sample_id",
    right_on="id",
)
together_wt

Unnamed: 0,confidence,comment,interpreter,original_sample_id,id,Source,EventId,geometry
0,high,,vij,10017,10017,http://foresta.sisef.org/contents/?id=efor3070...,IT20181028,POINT (11.20557 45.98172)
1,high,,vij,10046,10046,http://foresta.sisef.org/contents/?id=efor3070...,IT20181028,POINT (11.25928 46.02815)
2,medium,border,vij,1015,1015,https://www.mdpi.com/2072-4292/11/2/115,DE20171110,POINT (11.51864 53.19573)
3,high,,vij,102,102,http://skogsdataportalen.skogsstyrelsen.se/Sko...,SE20181028,POINT (13.16201 56.49697)
4,high,unclear salvage,vij,10394,10394,http://foresta.sisef.org/contents/?id=efor3070...,IT20181028,POINT (11.77777 46.19341)
...,...,...,...,...,...,...,...,...
289,high,no apparent wt,vij,90,90,http://skogsdataportalen.skogsstyrelsen.se/Sko...,SE20181028,POINT (11.49436 58.57324)
290,high,unclear salvage,vij,9503,9503,http://foresta.sisef.org/contents/?id=efor3070...,IT20181028,POINT (11.50066 46.09481)
291,high,,vij,9504,9504,http://foresta.sisef.org/contents/?id=efor3070...,IT20181028,POINT (11.52082 46.09586)
292,high,unclear salvage,vij,9617,9617,http://foresta.sisef.org/contents/?id=efor3070...,IT20181028,POINT (11.50803 46.15393)


In [28]:
# Assemble the windthrow samples in the common sample schema.
# Scalars are broadcast to every row; sample_id is assigned after concatenation.
# No S2 tile is available for this dataset, so "s2_tile" is left out here.
samples_wt = (
    gpd.GeoDataFrame(
        {
            "original_sample_id": together_wt["id"],
            "interpreter": together_wt["interpreter"],
            "dataset": "Windthrow",
            "source": "FORWIND + Copernicus Emergency Service",
            "source_description": together_wt["Source"],
            "cluster_id": together_wt["EventId"],
            "cluster_description": "Id of the Event, given as ISO2 + Date of storm",
            "comment": together_wt["comment"],
            "confidence": together_wt["confidence"],
            "geometry": together_wt[
                "geometry"
            ],  # Geometry is already in shapely format from geopandas.read_file
        },
        # Use the CRS of the windthrow data itself; the geometries come from
        # `wt`, not from the HR-VPP shapefile.
        crs=wt.crs,
    )
    # cast every non-geometry column to the dtype declared in the schema
    .astype(samples_schema.dtypes.drop(["sample_id", "s2_tile", "geometry"]))
    .sort_values("original_sample_id")
)

In [29]:
# Inspect the assembled windthrow samples
samples_wt

Unnamed: 0,original_sample_id,interpreter,dataset,source,source_description,cluster_id,cluster_description,comment,confidence,geometry
215,60,vij,Windthrow,FORWIND + Copernicus Emergency Service,http://skogsdataportalen.skogsstyrelsen.se/Sko...,SE20181028,"Id of the Event, given as ISO2 + Date of storm",no apparent wt,high,POINT (14.00079 58.07212)
217,61,vij,Windthrow,FORWIND + Copernicus Emergency Service,http://skogsdataportalen.skogsstyrelsen.se/Sko...,SE20181028,"Id of the Event, given as ISO2 + Date of storm",no apparent wt,high,POINT (13.99821 58.07167)
289,90,vij,Windthrow,FORWIND + Copernicus Emergency Service,http://skogsdataportalen.skogsstyrelsen.se/Sko...,SE20181028,"Id of the Event, given as ISO2 + Date of storm",no apparent wt,high,POINT (11.49436 58.57324)
3,102,vij,Windthrow,FORWIND + Copernicus Emergency Service,http://skogsdataportalen.skogsstyrelsen.se/Sko...,SE20181028,"Id of the Event, given as ISO2 + Date of storm",,high,POINT (13.16201 56.49697)
6,105,vij,Windthrow,FORWIND + Copernicus Emergency Service,http://skogsdataportalen.skogsstyrelsen.se/Sko...,SE20181028,"Id of the Event, given as ISO2 + Date of storm",,high,POINT (14.13445 56.28274)
...,...,...,...,...,...,...,...,...,...,...
148,16066,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,SI20200205,"Id of the Event, given as ISO2 + Date of storm",,high,POINT (14.39175 46.2299)
149,16067,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,SI20200205,"Id of the Event, given as ISO2 + Date of storm",unclear salvage,high,POINT (14.4004 46.2291)
150,16070,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,SI20200205,"Id of the Event, given as ISO2 + Date of storm",,high,POINT (14.39046 46.24575)
151,16071,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,SI20200205,"Id of the Event, given as ISO2 + Date of storm",unclear salvage,high,POINT (14.39247 46.2441)


In [30]:
# Translate the raw flag codes into numeric label classes, tag the dataset and
# cast to the label schema (sample_id is assigned after concatenation).
labels_wt = (
    raw_wt_labels.with_columns(
        label=pl.col("original_label").replace_strict(hrvpp_label_mapping),
        dataset=pl.lit("Windthrow"),
    )
    .to_pandas()
    .astype(labels_schema.dtypes.drop(labels="sample_id"))
)
labels_wt

Unnamed: 0,original_label,start,end,original_sample_id,label,dataset
0,0,2016-11-28 00:00:00+00:00,2021-06-03 00:00:00+00:00,100,110,Windthrow
1,5,2021-06-05 00:00:00+00:00,2021-06-05 23:59:59+00:00,100,211,Windthrow
2,4,2022-08-12 00:00:00+00:00,2024-11-21 00:00:00+00:00,100,123,Windthrow
3,0,2016-11-15 00:00:00+00:00,2018-10-24 00:00:00+00:00,10017,110,Windthrow
4,7,2018-11-15 00:00:00+00:00,2018-11-15 23:59:59+00:00,10017,243,Windthrow
...,...,...,...,...,...,...
1034,4,2019-09-16 00:00:00+00:00,2024-12-28 00:00:00+00:00,9617,123,Windthrow
1035,0,2016-11-15 00:00:00+00:00,2018-10-26 00:00:00+00:00,9704,110,Windthrow
1036,7,2018-10-31 00:00:00+00:00,2018-10-31 23:59:59+00:00,9704,243,Windthrow
1037,a,2021-05-28 00:00:00+00:00,2021-05-28 23:59:59+00:00,9704,220,Windthrow


# Concatenate all data

In this step we also remove all samples with confidence `low`

In [31]:
# Stack all datasets below the (empty) schema frame, number the rows to get a
# new unique sample_id, normalise the confidence spelling and drop "low".
samples = (
    pd.concat(
        [samples_schema, samples_evo, samples_hrvpp, samples_wt],
        ignore_index=True,
    )
    .assign(
        # set new unique sample_id
        sample_id=lambda frame: pd.Series(range(len(frame)), dtype="uint16[pyarrow]"),
        confidence=lambda frame: frame["confidence"].str.lower(),
    )
    .query("confidence!='low'")
)
samples

Unnamed: 0,sample_id,original_sample_id,interpreter,dataset,source,source_description,s2_tile,cluster_id,cluster_description,comment,confidence,geometry
0,0,0,vij,Evoland,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",30SUF,0.0,Damage polygons,"border, thinning, then clear cut",high,POINT (-4.12212 36.74179)
1,1,1,vij,Evoland,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",30SUF,1.0,Damage polygons,"unclear progression, edge",medium,POINT (-4.12161 36.74231)
2,2,2,vij,Evoland,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",30SUF,2.0,Damage polygons,plantation,high,POINT (-4.1192 36.74203)
3,3,3,vij,Evoland,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",30SUF,3.0,Damage polygons,plantation,high,POINT (-4.12845 36.75831)
4,4,4,vij,Evoland,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",30SUF,5.0,Damage polygons,clear cut,high,POINT (-4.12816 36.75908)
...,...,...,...,...,...,...,...,...,...,...,...,...
3818,3818,16066,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,,SI20200205,"Id of the Event, given as ISO2 + Date of storm",,high,POINT (14.39175 46.2299)
3819,3819,16067,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,,SI20200205,"Id of the Event, given as ISO2 + Date of storm",unclear salvage,high,POINT (14.4004 46.2291)
3820,3820,16070,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,,SI20200205,"Id of the Event, given as ISO2 + Date of storm",,high,POINT (14.39046 46.24575)
3821,3821,16071,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,,SI20200205,"Id of the Event, given as ISO2 + Date of storm",unclear salvage,high,POINT (14.39247 46.2441)


In [32]:
# Stack all label tables, then replace the empty sample_id column by the new
# ids from `samples`. The inner join also drops labels whose sample was removed.
labels = pd.merge(
    pd.concat([labels_schema, labels_evo, labels_hrvpp, labels_wt]).drop(
        columns="sample_id"
    ),
    samples[["original_sample_id", "dataset", "sample_id"]],
    how="inner",
    on=["original_sample_id", "dataset"],
)

labels

Unnamed: 0,original_sample_id,dataset,label,original_label,start,end,sample_id
0,0,Evoland,110,0,2016-11-10 00:00:00+00:00,2022-03-09 00:00:00+00:00,0
1,0,Evoland,212,6,2022-04-08 00:00:00+00:00,2022-04-08 23:59:59+00:00,0
2,0,Evoland,211,5,2024-04-02 00:00:00+00:00,2024-04-02 23:59:59+00:00,0
3,1,Evoland,110,0,2016-11-10 00:00:00+00:00,2023-05-03 00:00:00+00:00,1
4,1,Evoland,211,5,2023-06-22 00:00:00+00:00,2023-06-22 23:59:59+00:00,1
...,...,...,...,...,...,...,...
7908,9617,Windthrow,123,4,2019-09-16 00:00:00+00:00,2024-12-28 00:00:00+00:00,3686
7909,9704,Windthrow,110,0,2016-11-15 00:00:00+00:00,2018-10-26 00:00:00+00:00,3687
7910,9704,Windthrow,243,7,2018-10-31 00:00:00+00:00,2018-10-31 23:59:59+00:00,3687
7911,9704,Windthrow,220,a,2021-05-28 00:00:00+00:00,2021-05-28 23:59:59+00:00,3687


In [33]:
labels = (
    # Drop the "d" class (data artifact that is only relevant to our specific
    # data source and might become irrelevant later)
    labels.query("original_label!='d'")
    # Chronological order within each sample, with a fresh 0..n-1 index
    .sort_values(by=["sample_id", "start"], ignore_index=True)
)

# Start date of the following label of the same sample (missing for the last one)
labels["start_next_label"] = labels.groupby("sample_id")["start"].shift(periods=-1)
labels

Unnamed: 0,original_sample_id,dataset,label,original_label,start,end,sample_id,start_next_label
0,0,Evoland,110,0,2016-11-10 00:00:00+00:00,2022-03-09 00:00:00+00:00,0,2022-04-08 00:00:00+00:00
1,0,Evoland,212,6,2022-04-08 00:00:00+00:00,2022-04-08 23:59:59+00:00,0,2024-04-02 00:00:00+00:00
2,0,Evoland,211,5,2024-04-02 00:00:00+00:00,2024-04-02 23:59:59+00:00,0,
3,1,Evoland,110,0,2016-11-10 00:00:00+00:00,2023-05-03 00:00:00+00:00,1,2023-06-22 00:00:00+00:00
4,1,Evoland,211,5,2023-06-22 00:00:00+00:00,2023-06-22 23:59:59+00:00,1,
...,...,...,...,...,...,...,...,...
7908,16071,Windthrow,243,7,2020-03-11 00:00:00+00:00,2020-03-11 23:59:59+00:00,3821,2020-03-16 00:00:00+00:00
7909,16071,Windthrow,120,3,2020-03-16 00:00:00+00:00,2024-12-30 00:00:00+00:00,3821,
7910,16076,Windthrow,110,0,2017-08-04 00:00:00+00:00,2020-01-21 00:00:00+00:00,3822,2020-03-11 00:00:00+00:00
7911,16076,Windthrow,243,7,2020-03-11 00:00:00+00:00,2020-03-11 23:59:59+00:00,3822,2020-06-29 00:00:00+00:00


## Assign appropriate salvage class

If the salvage class can be narrowed down (i.e. a biotic or abiotic disturbance occurs earlier in the segmentation), we do this here.

In [34]:
# Narrow the unspecific salvage label 220 within each sample_id group to:
# -  221 (after biotic disturbance) if the group has a label starting with 23 (biotic)
# -  222 (after abiotic disturbance) if the group has a label starting with 24 (abiotic)
# -  leave as 220 if the group has neither, or both (cause is ambiguous)
def determine_new_label(group):
    """Return the labels of one sample as strings, with salvage (220) narrowed.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single sample; only the "label" column is read.

    Returns
    -------
    pandas.Series
        The labels converted to ``str``, on the same index as ``group``. Every
        "220" becomes "221" when the group contains a 23x label but no 24x
        label, and "222" when it contains a 24x label but no 23x label.
        Otherwise the labels are returned unchanged.
    """
    codes = group["label"].astype(str)
    # Exact match, so the rows that are rewritten are the same rows that
    # triggered the rewrite.
    is_salvage = codes == "220"
    if not is_salvage.any():
        return codes

    # The whole group is inspected, irrespective of the order of the labels.
    has_biotic = codes.str.startswith("23").any()
    has_abiotic = codes.str.startswith("24").any()

    narrowed = codes.copy()
    if has_biotic and not has_abiotic:
        narrowed.loc[is_salvage] = "221"
    elif has_abiotic and not has_biotic:
        narrowed.loc[is_salvage] = "222"
    return narrowed


# Run determine_new_label once per sample and write the result back into "label".
labels["label"] = (
    labels.groupby("sample_id")
    # include_groups=False: the grouping column is not passed to the function
    .apply(determine_new_label, include_groups=False)
    # drop the index level at position 0 (presumably the sample_id group key
    # added by apply — confirm) so the result aligns with the rows of `labels`
    .reset_index(level=0, drop=True)
)
# determine_new_label returns strings; restore the schema dtype
labels["label"] = labels["label"].astype("uint16[pyarrow]")

In [35]:
# Check the column dtypes before writing
labels.dtypes

original_sample_id                   int64[pyarrow]
dataset                             string[pyarrow]
label                               uint16[pyarrow]
original_label                      string[pyarrow]
start                 timestamp[s, tz=UTC][pyarrow]
end                   timestamp[s, tz=UTC][pyarrow]
sample_id                           uint16[pyarrow]
start_next_label      timestamp[s, tz=UTC][pyarrow]
dtype: object

# Write data

In [36]:
# Write both homogenized tables next to the input data.
# NOTE(review): `samples` is a GeoDataFrame, so this presumably produces a
# GeoParquet file — confirm against the file names listed in the introduction.
samples.to_parquet("../data/samples.parquet")
labels.to_parquet("../data/labels.parquet")

In [37]:
# Plain-text mapping (dataset, original sample id) -> new sample id, written as
# CSV so that changes of the id assignment can be tracked with git
samples.loc[:, ["sample_id", "original_sample_id", "dataset"]].to_csv(
    "../data/id_mapping.csv",
    index=False,
)