In [1]:
from pathlib import Path

import geopandas as gpd
import pandas as pd

from utils import segments_and_events_to_long

%load_ext autoreload
%autoreload 2

In [2]:
# Root of the FER share; the Evoland and HR-VPP input paths below are built from it.
# Alternative root (commented out):
# server_root = "/mnt/share/FER"
server_root = "//digs110/FER"

# Homogenize Sample Data

The following columns will be compiled.

Two files:

samples.geoparquet (? file format)

- Unique Sample ID (int)
- Original Sample ID (int)
- Interpreter (str)
- Source (str)
- S2 Tile (str)
- Cluster (Polygon ID, storm event) (str)
- Cluster Description (str)
- Comment (str)
- Confidence (categorical)
- Point (geometry)

labels.pq

- Timestamp (datetime)
- original label (str)
- Label (categorical)

In [3]:
# Empty, typed template of the harmonised samples table. The per-dataset sample
# frames are cast to these dtypes and later concatenated onto this template.
samples_schema = gpd.GeoDataFrame(
    {
        "sample_id": pd.Series(dtype="uint64[pyarrow]"),
        "original_sample_id": pd.Series(dtype="int64[pyarrow]"),
        # Every descriptive column is a PyArrow-backed string. "confidence" is
        # stored as a string as well, because PyArrow doesn't yet support
        # 'categorical' as a dtype.
        **{
            text_column: pd.Series(dtype="string[pyarrow]")
            for text_column in (
                "interpreter",
                "dataset",
                "source",
                "source_description",
                "s2_tile",
                "cluster_id",
                "cluster_description",
                "comment",
                "confidence",
            )
        },
        "geometry": gpd.GeoSeries(crs="EPSG:4326"),
    },
    geometry="geometry",
)

In [4]:
labels_schema = pd.DataFrame(
    {
        "sample_id": pd.Series(dtype="uint64[pyarrow]"),
        "original_sample_id": pd.Series(dtype="int64[pyarrow]"),
        "dataset": pd.Series(dtype="string[pyarrow]"),
        "label": pd.Series(dtype="string[pyarrow]"),
        "original_label": pd.Series(dtype="string[pyarrow]"),
        "start": pd.Series(dtype="timestamp[s, tz=UTC][pyarrow]"),
        "end": pd.Series(dtype="timestamp[s, tz=UTC][pyarrow]"),
    }
)

# Label mapping 

The labels can be mapped roughly based on the categorization of ICP forest data. Going from coarse to fine categorization.
If more exact reasons aren't known, the coarser category can be set.

<pre>
100 - Alive Vegetation  
    110 - Mature Forest  
    120 - Revegetation  
        121 - With Trees (after clear cut)  
        122 - Canopy closing (after thinning/defoliation)  
        123 - Without Trees (shrubs and grasses, no reforestation visible)  

200 - Disturbed  
    210 - Planned  
        211 - Clear Cut  
        212 - Thinning  
        213 - Forestry Mulching (Non Forest Vegetation Removal) 
    220 - Salvage  
        221 - After Biotic Disturbance  
        222 - After Abiotic Disturbance  
    230 - Biotic  
        231 - Bark Beetle (with decline)  
        232 - Gypsy Moth (temporary)  
    240 - Abiotic  
        241 - Drought  
        242 - Wildfire  
        244 - Wind  
        245 - Avalanche  
        246 - Flood  
</pre>

## Evoland + FNEWs

In [5]:
# Evoland multi-temporal reference points (shapefile on the FER share)
evoland = Path(
    f"{server_root}/EvoLand/WP2_6_CFM/Referenzdaten/Database/PointMultitemp/evoland_ref_points_mt.shp"
)
evoland_multitemp = gpd.read_file(evoland)

In [None]:
# Exploratory check: show the `id` and `state` columns for the rows with id 0.
evoland_multitemp[["id", "state"]].query("id==0")

Unnamed: 0,id,state
0,0,
1,0,
2,0,
3,0,
4,0,
...,...,...
275,0,
276,0,
277,0,
278,0,


: 

In [6]:
# One row per sample: the multi-temporal table repeats the sample ID (and most
# likely its geometry) across rows, so keep only the first row of every id.
# The explicit .copy() avoids a SettingWithCopyWarning on later modifications.
samples_df = evoland_multitemp.drop_duplicates(subset=["id"], keep="first").copy()


def combine_comments(row):
    """Merge the three free-text comment fields of a row into one string.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys "comment1", "comment2" and "comment3".

    Returns
    -------
    str
        The non-missing, non-blank comments joined by ", " in column order,
        or an empty string if none of them carries any text.
    """
    first, second, third = row["comment1"], row["comment2"], row["comment3"]
    kept = []
    for value in (first, second, third):
        # Skip None / NaN entries ...
        if pd.isna(value):
            continue
        text = str(value)
        # ... and entries that contain only whitespace.
        if text.strip():
            kept.append(text)
    return ", ".join(kept)

In [7]:
# Derive the data source and its description from the site (country) of each sample.
source = samples_df["site"].replace(
    dict(
        Spain="EFFIS",
        Germany="Regional Forestry Departments",
        Sweden="EFFIS",
    )
)
# NOTE(review): "Departmetns" below is a typo in the stored description; it is
# left unchanged here so that the written output stays identical.
source_description = samples_df["site"].replace(
    dict(
        Spain="Evoland Project, EFFIS Source of Wildfire Polygons, manual interpretation based on S2 timeseries",
        Germany="FNews Project, German Forestry Departmetns Source of Disturbance Types, manual interpretation based on S2 timeseries",
        Sweden="Evoland Project, EFFIS Source of Wildfire Polygons, manual interpretation based on S2 timeseries",
    )
)

In [8]:
# Create the new GeoDataFrame for samples
# Initialize with None/default values where appropriate
# Scalar entries (interpreter, dataset, ...) are the same for every Evoland sample;
# the remaining columns are taken from the de-duplicated samples_df.
# The result is cast to the shared samples schema; sample_id is not part of this
# frame and geometry is left out of the cast.
samples_evo = gpd.GeoDataFrame(
    {
        "original_sample_id": samples_df["id"],
        "interpreter": "pum",
        "source": source,
        "source_description": source_description,
        "dataset": "Evoland",
        "s2_tile": samples_df["tile"],
        # NOTE(review): poly_id seems to be read as float, so the string cast
        # produces ids such as "0.0" (visible in the concatenated samples
        # output) - confirm whether that is intended.
        "cluster_id": samples_df["poly_id"],
        "cluster_description": "Damage polygons",
        "comment": samples_df.apply(combine_comments, axis=1),
        "confidence": "HIGH",
        "geometry": samples_df[
            "geometry"
        ],  # Geometry is already in shapely format from geopandas.read_file
    },
    crs=samples_df.crs,  # Retain the original Coordinate Reference System
).astype(samples_schema.dtypes.drop(["sample_id", "geometry"]))

In [9]:
# Maps the original (German) Evoland `state` values to the numeric label codes
# of the label taxonomy described above. 999 is used for unknown/unclear states.
evoland_label_mapping = {
    "gesund": 110,
    "brand": 242,
    "raeumung": 220,  # needs to be handled more specifically, to refine abiotic, biotic
    "entnahme": 211,  # clear cut
    "kaefer": 231,
    "sturm": 244,
    "erholung": 122,  # erholung only happens after schwammspinner and does not have an upper bound -> so canopy closing
    "teilraeumung": 220,
    "sturm-teil": 244,
    "durchforstung": 212,
    "kaefer-teil": 231,
    "fraesen": 213,
    "unknown": 999,
    "trockenstress": 241,
    "schwammspinner": 232,
    "verjuengung": 121,  # regeneration -> revegetation with trees (after clear cut)
    "sturm_snow": 244,
    "sturm-snow": 244,
    "unklar": 999,
    "befallsmax": 231,
}

In [10]:
# keep only the observation dates that actually carry a label (state is not None)
evoland_cleaned = evoland_multitemp.dropna(axis="index", subset=["state"])

In [11]:
# Long-format Evoland labels: one row per sample and observation date.
labels_evo_long = pd.DataFrame(
    dict(
        original_sample_id=evoland_cleaned["id"],
        # Parse the compact timestamp strings into datetimes
        timestamp=pd.to_datetime(evoland_cleaned["datetime"], format="%Y%m%dT%H%M%S"),
        # 'state' is assumed to be the original label
        original_label=evoland_cleaned["state"],
        # Translate the original state into the harmonised label code
        label=evoland_cleaned["state"]
        .replace(evoland_label_mapping)
        .infer_objects(copy=False),
    )
)

  "label": evoland_cleaned["state"].replace(evoland_label_mapping).infer_objects(copy=False)


In [12]:
# Tag every label row with its dataset name (used later as part of the merge key).
labels_evo_long = labels_evo_long.assign(dataset="Evoland")

In [13]:
# Preview the long-format Evoland labels.
labels_evo_long

Unnamed: 0,original_sample_id,timestamp,original_label,label,dataset
561,2,2019-01-09 11:11:03,gesund,110,Evoland
562,2,2019-01-14 11:11:07,gesund,110,Evoland
563,2,2019-01-24 11:11:08,gesund,110,Evoland
564,2,2019-01-29 11:11:05,gesund,110,Evoland
565,2,2019-02-03 11:11:08,gesund,110,Evoland
...,...,...,...,...,...
182690,1267,2021-09-05 10:16:33,kaefer-teil,231,Evoland
182691,1267,2021-09-10 10:16:37,kaefer-teil,231,Evoland
182692,1267,2021-10-10 10:16:42,kaefer-teil,231,Evoland
182693,1267,2021-10-15 10:16:38,kaefer-teil,231,Evoland


In [14]:
# bring them to start end format as done for the hrvpp calibration set
#
# A segment is a run of *consecutive* observations of one sample that share the
# same label. Grouping by (sample, label) alone would merge non-contiguous
# occurrences of a label (e.g. 110 -> 212 -> 110) into a single min/max span
# that overlaps the segments in between, so the grouping is done per run.
_evo_sorted = labels_evo_long.sort_values(
    ["original_sample_id", "timestamp"], kind="stable"
)
# A new run starts whenever the sample or the label differs from the previous row.
_evo_run_start = _evo_sorted["original_sample_id"].ne(
    _evo_sorted["original_sample_id"].shift()
) | _evo_sorted["label"].ne(_evo_sorted["label"].shift())

labels_evo = (
    _evo_sorted.assign(_run_id=_evo_run_start.cumsum())
    .groupby("_run_id")
    .agg(
        original_sample_id=("original_sample_id", "first"),
        label=("label", "first"),
        original_label=("original_label", "first"),
        dataset=("dataset", "first"),
        start=("timestamp", "min"),
        end=("timestamp", "max"),
    )
    .reset_index(drop=True)
).astype(labels_schema.dtypes.drop("sample_id"))
labels_evo

Unnamed: 0,original_sample_id,label,original_label,dataset,start,end
0,2,110,gesund,Evoland,2019-01-09 11:11:03+00:00,2022-04-01 00:00:00+00:00
1,2,212,durchforstung,Evoland,2022-04-08 11:11:05+00:00,2022-12-29 11:11:07+00:00
2,3,110,gesund,Evoland,2019-01-09 11:11:03+00:00,2022-05-01 00:00:00+00:00
3,3,212,durchforstung,Evoland,2022-05-08 11:11:05+00:00,2022-12-29 11:11:07+00:00
4,4,110,gesund,Evoland,2019-01-09 11:11:03+00:00,2020-05-03 11:11:11+00:00
...,...,...,...,...,...,...
2043,1265,231,kaefer-teil,Evoland,2019-09-01 10:16:37+00:00,2021-10-30 10:16:42+00:00
2044,1266,110,gesund,Evoland,2015-07-04 10:13:37+00:00,2019-09-11 10:16:35+00:00
2045,1266,231,kaefer-teil,Evoland,2019-09-21 10:16:37+00:00,2021-10-30 10:16:42+00:00
2046,1267,110,gesund,Evoland,2015-07-04 10:13:37+00:00,2020-06-17 10:16:43+00:00


## HRVPP Calibration

In [23]:
# HR-VPP2 stage-1 calibration samples (merged shapefile on the FER share)
hrvpp_path = Path(
    f"{server_root}/HR-VPP2/Calibration/Interpretation/stage1/HR-VPP2_VDTC_calib_samples_stage1_merged.shp"
)
hrvpp = gpd.read_file(hrvpp_path)

In [17]:
# Create the new GeoDataFrame for samples
# Initialize with None/default values where appropriate
# Scalar entries (dataset, source, ...) are the same for every HRVPP sample; the
# remaining columns are taken from the hrvpp frame.
# The result is cast to the shared samples schema; sample_id is not part of this
# frame and geometry is left out of the cast.
samples_hrvpp = gpd.GeoDataFrame(
    {
        "original_sample_id": hrvpp["id"],
        "interpreter": hrvpp["interpret1"],
        "dataset": "HRVPP",
        "source": "Sentinel 2 Time-series",
        "source_description": "HR-VPP2 Project, manual interpretation based on S2 timeseries, ESRI wayback and ESA High Resolution Images",
        "s2_tile": hrvpp["tile"],
        "cluster_id": hrvpp["stratum"],
        "cluster_description": "Stratum of the sampling",
        "comment": hrvpp["comment"],
        "confidence": hrvpp["confidence"],
        "geometry": hrvpp[
            "geometry"
        ],  # Geometry is already in shapely format from geopandas.read_file
    },
    crs=hrvpp.crs,  # Retain the original Coordinate Reference System
).astype(samples_schema.dtypes.drop(["sample_id", "geometry"]))

In [18]:
# Maps the single-character HRVPP segment/event codes to the label codes of the
# taxonomy described above (as strings). The same mapping is reused for the
# windthrow samples further down.
hrvpp_label_mapping = {
    "0": "110",  # Stable → Mature Forest
    "1": "231",  # Biotic disturbance / bark-beetle → Bark Beetle (with decline)
    "2": "230",  # Biotic disturbance / other → Biotic (unspecific)
    "3": "120",  # Growth → Revegetation (with trees / canopy closing)
    "4": "123",  # Revegetation → Without Trees (shrubs and grasses, no reforestation)
    "b": "123",  # Stable / non-forest → Without Trees (assumes non-forest like grassland/shrubland)
    # Event flags
    "5": "211",  # Harvest / clear-cut → Clear Cut
    "6": "212",  # Harvest / thinning or selective logging → Thinning
    "7": "244",  # Abiotic disturbance / windthrow → Wind
    "8": "242",  # Abiotic disturbance / wildfire → Wildfire
    "9": "240",  # Abiotic disturbance / other → Abiotic (unspecific)
    "a": "220",  # Harvest / salvage logging → After Natural Disturbance (Needs to be split into after abiotic and biotic)
    "c": "213",  # Vegetation removal → Forestry Mulching (Non Forest Vegetation Removal)
    "d": "999",  # L3A artifact → No match in primary classes (metadata/quality issue)
}

In [19]:
# Expand the HRVPP segments/events into one row per labelled time span, order
# the rows per sample chronologically, translate the original codes into the
# harmonised labels and cast everything to the shared labels schema.
labels_hrvpp = (
    segments_and_events_to_long(hrvpp)
    .sort_values(by=["original_sample_id", "start"])
    .reset_index(drop=True)
    .assign(
        label=lambda frame: frame["original_label"].replace(hrvpp_label_mapping),
        dataset="HRVPP",
    )
    .astype(labels_schema.dtypes.drop("sample_id"))
)
labels_hrvpp

Missing segment date
232
Missing segment date
256
Missing segment date
284
Missing segment date
784
Missing segment date
830
Missing segment date
833
Missing segment date
1184
Missing segment date
1254
Missing segment date
1405
Missing segment date
1486
Missing segment date
1512
Missing segment date
1520
Missing segment date
1539
Missing segment date
1572
Missing segment date
1700
Missing segment date
1976
Missing segment date
1980
Missing segment date
2210
Missing segment date
2240
Missing segment date
2334
Missing segment date
2421


Unnamed: 0,original_sample_id,original_label,start,end,label,dataset
0,1,0,2016-03-01 00:00:00+00:00,2025-01-01 00:00:00+00:00,110,HRVPP
1,2,0,2016-03-01 00:00:00+00:00,2025-01-01 00:00:00+00:00,110,HRVPP
2,3,0,2016-03-01 00:00:00+00:00,2025-01-01 00:00:00+00:00,110,HRVPP
3,4,b,2016-03-01 00:00:00+00:00,2025-01-01 00:00:00+00:00,123,HRVPP
4,5,0,2016-03-01 00:00:00+00:00,2025-01-01 00:00:00+00:00,110,HRVPP
...,...,...,...,...,...,...
3985,2726,4,2023-07-16 00:00:00+00:00,2024-11-01 00:00:00+00:00,123,HRVPP
3986,2727,0,2016-05-01 00:00:00+00:00,2018-05-01 00:00:00+00:00,110,HRVPP
3987,2727,5,2018-05-16 00:00:00+00:00,2018-05-16 23:59:59+00:00,211,HRVPP
3988,2727,4,2019-07-01 00:00:00+00:00,2024-11-01 00:00:00+00:00,123,HRVPP


## Windthrow samples

Same format as HRVPP

In [26]:
# Load the windthrow samples and list the available columns.
wt = gpd.read_file("../windthrow_flags.geojson")
wt.columns

Index(['Id_poly', 'EventDate', 'StormName', 'EventType', 'Country', 'Area',
       'Perimeter', 'Damage_deg', 'Methods', 'Dataprovid', 'Source', 'id',
       'EventId', 'max_inscri', 'centroid', 'data_loc', 'confidence',
       'comment', 'interpret1', 'segments', 'events', 'geometry'],
      dtype='object')

In [21]:
# Inspect the per-sample "Source" values; they are used as source_description below.
wt["Source"]

0      http://skogsdataportalen.skogsstyrelsen.se/Sko...
1      http://skogsdataportalen.skogsstyrelsen.se/Sko...
2      http://skogsdataportalen.skogsstyrelsen.se/Sko...
3      http://skogsdataportalen.skogsstyrelsen.se/Sko...
4      http://skogsdataportalen.skogsstyrelsen.se/Sko...
                             ...                        
567    https://mapping.emergency.copernicus.eu/activa...
568    https://mapping.emergency.copernicus.eu/activa...
569    https://mapping.emergency.copernicus.eu/activa...
570    https://mapping.emergency.copernicus.eu/activa...
571    https://mapping.emergency.copernicus.eu/activa...
Name: Source, Length: 572, dtype: object

In [22]:
# Create the new GeoDataFrame for the windthrow samples
# Scalar entries (dataset, source, ...) are the same for every windthrow sample;
# the remaining columns are taken from the wt frame.
# No "s2_tile" column is supplied for this dataset, so it is excluded from the
# dtype cast as well (together with sample_id and geometry).
samples_wt = gpd.GeoDataFrame(
    {
        "original_sample_id": wt["id"],
        "interpreter": wt["interpret1"],
        "dataset": "Windthrow",
        "source": "FORWIND + Copernicus Emergency Service",
        "source_description": wt["Source"],
        "cluster_id": wt["EventId"],
        "cluster_description": "Id of the Event, given as ISO2 + Date of storm",
        "comment": wt["comment"],
        "confidence": wt["confidence"],
        "geometry": wt[
            "geometry"
        ],  # Geometry is already in shapely format from geopandas.read_file
    },
    # Retain the CRS of the windthrow file itself. This previously used
    # hrvpp.crs, which tied the cell to an unrelated dataset.
    crs=wt.crs,
).astype(samples_schema.dtypes.drop(["sample_id", "s2_tile", "geometry"]))

In [23]:
# Preview the harmonised windthrow samples.
samples_wt

Unnamed: 0,original_sample_id,interpreter,dataset,source,source_description,cluster_id,cluster_description,comment,confidence,geometry
0,60,vij,Windthrow,FORWIND + Copernicus Emergency Service,http://skogsdataportalen.skogsstyrelsen.se/Sko...,SE20181028,"Id of the Event, given as ISO2 + Date of storm",no apparent wt,HIGH,POINT (14.00079 58.07212)
1,61,vij,Windthrow,FORWIND + Copernicus Emergency Service,http://skogsdataportalen.skogsstyrelsen.se/Sko...,SE20181028,"Id of the Event, given as ISO2 + Date of storm",no apparent wt,HIGH,POINT (13.99821 58.07167)
2,87,vij,Windthrow,FORWIND + Copernicus Emergency Service,http://skogsdataportalen.skogsstyrelsen.se/Sko...,SE20181028,"Id of the Event, given as ISO2 + Date of storm",border - regrowth,LOW,POINT (13.87461 58.58656)
3,90,vij,Windthrow,FORWIND + Copernicus Emergency Service,http://skogsdataportalen.skogsstyrelsen.se/Sko...,SE20181028,"Id of the Event, given as ISO2 + Date of storm",no apparent wt,HIGH,POINT (11.49436 58.57324)
4,100,vij,Windthrow,FORWIND + Copernicus Emergency Service,http://skogsdataportalen.skogsstyrelsen.se/Sko...,SE20181028,"Id of the Event, given as ISO2 + Date of storm",,LOW,POINT (12.56487 57.15269)
...,...,...,...,...,...,...,...,...,...,...
567,13903,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,LV20220807,"Id of the Event, given as ISO2 + Date of storm",unclear,LOW,POINT (26.30478 56.87101)
568,14629,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,LV20220807,"Id of the Event, given as ISO2 + Date of storm",border,MEDIUM,POINT (26.38622 57.29761)
569,14038,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,LV20220807,"Id of the Event, given as ISO2 + Date of storm",unclear,LOW,POINT (26.02159 57.21371)
570,14885,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,LV20220807,"Id of the Event, given as ISO2 + Date of storm",unclear,LOW,POINT (27.15899 56.26346)


In [24]:
# Labels
# The windthrow file uses the same segment/event coding as HRVPP, so it goes
# through the same steps: expand to one row per labelled time span, order the
# rows per sample chronologically, translate the codes with the HRVPP mapping
# and cast to the shared labels schema.
labels_wt = (
    segments_and_events_to_long(wt)
    .sort_values(by=["original_sample_id", "start"])
    .reset_index(drop=True)
    .assign(
        label=lambda frame: frame["original_label"].replace(hrvpp_label_mapping),
        dataset="Windthrow",
    )
    .astype(labels_schema.dtypes.drop("sample_id"))
)
labels_wt

Missing segment date
773
Missing segment date
814
Missing segment date
787
Missing segment date
554
Missing segment date
433
Missing segment date
673
Missing segment date
587
Missing segment date
733
Missing segment date
655
Missing segment date
544
Missing segment date
559
Missing segment date
844
Missing segment date
2099
Missing segment date
1250
Missing segment date
1416
Missing segment date
1369
Missing segment date
1697
Missing segment date
1610
Missing segment date
1960
Missing segment date
7466
Missing segment date
4883
Missing segment date
6247
Missing segment date
6721
Missing segment date
7606
Missing segment date
11311
Missing segment date
11275
Missing segment date
11746
Missing segment date
11593
Missing segment date
11371
Missing segment date
11477
Missing segment date
11623
Missing segment date
11488
Missing segment date
11260
Missing segment date
11695
Missing segment date
11241
Missing segment date
11365
Missing segment date
11237
Missing segment date
11553
Missing se

Unnamed: 0,original_sample_id,original_label,start,end,label,dataset
0,60,0,2016-11-28 00:00:00+00:00,2024-12-16 00:00:00+00:00,110,Windthrow
1,61,0,2016-11-28 00:00:00+00:00,2024-12-16 00:00:00+00:00,110,Windthrow
2,87,0,2016-11-28 00:00:00+00:00,2024-11-19 00:00:00+00:00,110,Windthrow
3,90,0,2016-12-11 00:00:00+00:00,2024-12-14 00:00:00+00:00,110,Windthrow
4,100,0,2016-11-28 00:00:00+00:00,2021-06-03 00:00:00+00:00,110,Windthrow
...,...,...,...,...,...,...
1046,16072,0,2017-08-04 00:00:00+00:00,2020-01-26 00:00:00+00:00,110,Windthrow
1047,16072,7,2020-03-11 00:00:00+00:00,2020-03-11 23:59:59+00:00,244,Windthrow
1048,16076,0,2017-08-04 00:00:00+00:00,2020-01-21 00:00:00+00:00,110,Windthrow
1049,16076,7,2020-03-11 00:00:00+00:00,2020-03-11 23:59:59+00:00,244,Windthrow


In [25]:
# Inspect the original (unmapped) label codes of the windthrow labels.
labels_wt["original_label"]

0       0
1       0
2       0
3       0
4       0
       ..
1046    0
1047    7
1048    0
1049    7
1050    4
Name: original_label, Length: 1051, dtype: string

# Concatenate all data

In this step we also remove all samples with confidence `low`

In [26]:
# Stack all sample tables on top of the (empty) schema frame, then:
#  - set new unique sample_id (running number over the concatenated rows,
#    assigned BEFORE the confidence filter, so ids of removed rows stay unused)
#  - normalise the confidence values to lower case
#  - drop every sample whose confidence is 'low'
samples = (
    pd.concat([samples_schema, samples_evo, samples_hrvpp, samples_wt])
    .reset_index(drop=True)
    .assign(
        sample_id=lambda frame: pd.Series(range(len(frame)), dtype="uint64[pyarrow]"),
        confidence=lambda frame: frame["confidence"].str.lower(),
    )
    .query("confidence!='low'")
)
samples

Unnamed: 0,sample_id,original_sample_id,interpreter,dataset,source,source_description,s2_tile,cluster_id,cluster_description,comment,confidence,geometry
0,0,0,pum,Evoland,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",30SUF,0.0,Damage polygons,leichte Durchforstung 2021,high,POINT (-4.12212 36.74179)
1,1,1,pum,Evoland,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",30SUF,1.0,Damage polygons,"Durchforstung_2021, kein Change erkennbar",high,POINT (-4.12161 36.74231)
2,2,2,pum,Evoland,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",30SUF,2.0,Damage polygons,"Durchforstung_2021, kein Change erkennbar",high,POINT (-4.1192 36.74203)
3,3,3,pum,Evoland,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",30SUF,3.0,Damage polygons,"Durchforstung 2021, unsicher",high,POINT (-4.12845 36.75831)
4,4,4,pum,Evoland,EFFIS,"Evoland Project, EFFIS Source of Wildfire Poly...",30SUF,5.0,Damage polygons,"Durchforstung 2021, starke Durchforstung",high,POINT (-4.12816 36.75908)
...,...,...,...,...,...,...,...,...,...,...,...,...
4295,4295,14414,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,,LV20220807,"Id of the Event, given as ISO2 + Date of storm",,high,POINT (26.1231 56.51315)
4296,4296,14172,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,,LV20220807,"Id of the Event, given as ISO2 + Date of storm",,high,POINT (26.27588 57.06756)
4298,4298,15954,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,,LV20220807,"Id of the Event, given as ISO2 + Date of storm",might have bark beet,medium,POINT (23.44004 56.5117)
4300,4300,14519,vij,Windthrow,FORWIND + Copernicus Emergency Service,https://mapping.emergency.copernicus.eu/activa...,,LV20220807,"Id of the Event, given as ISO2 + Date of storm",small scale,medium,POINT (26.21896 56.99972)


In [27]:
# Stack all label tables on top of the (empty) schema frame and set the new
# sample_ids: the placeholder sample_id column is replaced by the ids assigned
# in `samples`, matched on (original_sample_id, dataset). The inner join also
# removes the labels of samples that are no longer present in `samples`.
labels = (
    pd.concat([labels_schema, labels_evo, labels_hrvpp, labels_wt])
    .drop(columns="sample_id")
    .merge(
        samples[["original_sample_id", "dataset", "sample_id"]],
        how="inner",
        on=["original_sample_id", "dataset"],
    )
)

labels

Unnamed: 0,original_sample_id,dataset,label,original_label,start,end,sample_id
0,2,Evoland,110,gesund,2019-01-09 11:11:03+00:00,2022-04-01 00:00:00+00:00,2
1,2,Evoland,212,durchforstung,2022-04-08 11:11:05+00:00,2022-12-29 11:11:07+00:00,2
2,3,Evoland,110,gesund,2019-01-09 11:11:03+00:00,2022-05-01 00:00:00+00:00,3
3,3,Evoland,212,durchforstung,2022-05-08 11:11:05+00:00,2022-12-29 11:11:07+00:00,3
4,4,Evoland,110,gesund,2019-01-09 11:11:03+00:00,2020-05-03 11:11:11+00:00,4
...,...,...,...,...,...,...,...
6836,16071,Windthrow,244,7,2020-03-11 00:00:00+00:00,2020-03-11 23:59:59+00:00,4172
6837,16071,Windthrow,120,3,2020-03-16 00:00:00+00:00,2024-12-30 00:00:00+00:00,4172
6838,16076,Windthrow,110,0,2017-08-04 00:00:00+00:00,2020-01-21 00:00:00+00:00,4203
6839,16076,Windthrow,244,7,2020-03-11 00:00:00+00:00,2020-03-11 23:59:59+00:00,4203


In [28]:
# 1. Throw out d class in original_label (data artifact only relevant to our
#    specific data source, might get irrelevant later etc)
# 2. Sort by sample_id and start date to ensure chronological order
# 3. Per sample_id, shift the start column by one row to get the start date of
#    the following label (missing for the last label of a sample)
labels = (
    labels.query("original_label!='d'")
    .sort_values(["sample_id", "start"])
    .reset_index(drop=True)
    .assign(
        start_next_label=lambda frame: frame.groupby("sample_id")["start"].shift(-1)
    )
)
labels

Unnamed: 0,original_sample_id,dataset,label,original_label,start,end,sample_id,start_next_label
0,2,Evoland,110,gesund,2019-01-09 11:11:03+00:00,2022-04-01 00:00:00+00:00,2,2022-04-08 11:11:05+00:00
1,2,Evoland,212,durchforstung,2022-04-08 11:11:05+00:00,2022-12-29 11:11:07+00:00,2,
2,3,Evoland,110,gesund,2019-01-09 11:11:03+00:00,2022-05-01 00:00:00+00:00,3,2022-05-08 11:11:05+00:00
3,3,Evoland,212,durchforstung,2022-05-08 11:11:05+00:00,2022-12-29 11:11:07+00:00,3,
4,4,Evoland,110,gesund,2019-01-09 11:11:03+00:00,2020-05-03 11:11:11+00:00,4,2020-05-08 11:11:06+00:00
...,...,...,...,...,...,...,...,...
6813,14519,Windthrow,110,0,2017-05-30 00:00:00+00:00,2023-07-28 00:00:00+00:00,4300,2024-04-20 00:00:00+00:00
6814,14519,Windthrow,220,a,2024-04-20 00:00:00+00:00,2024-04-20 23:59:59+00:00,4300,
6815,14629,Windthrow,110,0,2017-03-16 00:00:00+00:00,2023-08-07 00:00:00+00:00,4302,2023-08-17 00:00:00+00:00
6816,14629,Windthrow,244,7,2023-08-17 00:00:00+00:00,2023-08-17 23:59:59+00:00,4302,2023-09-26 00:00:00+00:00


## Assign appropriate salvage class

If the salvage class can be narrowed down (because a biotic or abiotic disturbance occurs earlier in the segmentation), we do that here.

In [29]:
# Refine the generic salvage label 220 within a unique sample_id to:
# -  221 (after biotic disturbance) if the group has a label starting with 23 (biotic)
# -  222 (after abiotic disturbance) if the group has a label starting with 24 (abiotic)
# -  Leave as 220 if the group has neither, or both, 23x and 24x labels
def determine_new_label(group):
    """Return the labels of one sample group with salvage (220) refined.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single sample; must contain a "label" column.

    Returns
    -------
    pandas.Series
        The labels converted to ``str`` and indexed like ``group``. Labels
        starting with "220" become "221" when the group contains a 23x label
        but no 24x label, and "222" when it contains a 24x label but no 23x
        label. In every other case the labels are returned unchanged.

    NOTE(review): the whole group is inspected, so a disturbance that comes
    after the salvage entry counts as well - confirm this is intended.
    """
    as_text = group["label"].astype(str)

    # Nothing to refine without an exact "220" entry.
    if not as_text.eq("220").any():
        return as_text

    biotic_seen = as_text.str.startswith("23").any()
    abiotic_seen = as_text.str.startswith("24").any()

    refined = as_text.copy()
    # Only an unambiguous cause (exactly one of biotic / abiotic) narrows the class.
    if bool(biotic_seen) != bool(abiotic_seen):
        refined.loc[as_text.str.startswith("220")] = "221" if biotic_seen else "222"
    return refined


# Apply the salvage refinement per sample_id. The first index level of the
# apply result is dropped again with reset_index(level=0, drop=True) before the
# values are written back to the "label" column.
labels["label"] = (
    labels.groupby("sample_id")
    .apply(determine_new_label, include_groups=False)
    .reset_index(level=0, drop=True)
)

# Write data

In [30]:
# Persist the harmonised samples and labels tables as Parquet files one
# directory above the notebook.
samples.to_parquet("../samples.parquet")
labels.to_parquet("../labels.parquet")

In [31]:
# write a csv map from dataset, original sample id to new sample id to track changes with git
samples.loc[:, ["sample_id", "original_sample_id", "dataset"]].to_csv(
    path_or_buf="../id_mapping.csv",
    index=False,
)