In [1]:
import pandas as pd
import geopandas as gpd
from sklearn.metrics import (
    confusion_matrix, cohen_kappa_score,
    accuracy_score, precision_score, recall_score, f1_score
)


def save_confusion_matrix(gdf, field_true, field_pred, csv_path):
    """
    Compute a confusion matrix and summary metrics using a fixed label mapping,
    write both to a single CSV file, and return them.

    Rows are excluded from the evaluation when either column is null/NaN or
    holds a value that is not a key of the hardcoded mapping.

    Parameters
    ----------
    gdf : DataFrame-like
        Table containing both label columns.
    field_true : str
        Name of the column holding the reference labels.
    field_pred : str
        Name of the column holding the predicted labels.
    csv_path : str or path-like
        Destination file; it is overwritten.

    Returns
    -------
    dict
        "confusion_matrix": DataFrame (rows = true, columns = pred),
        "metrics": DataFrame with a single "value" column,
        "labels": list of class names that occur in the data, in mapping order.

    Raises
    ------
    ValueError
        If no row is left after removing null and unmapped labels.
    """

    # --- Hardcoded mapping: numeric class id -> class name ---
    mapping = {
        1: "Forest land",
        2: "Cropland",
        3: "Grassland",
        4: "Wetlands",
        5: "Settlements",
        6: "Other Land"
    }

    # --- Make numeric labels consistent (1.0 -> 1, "3" -> 3) ---
    def normalize_label(val):
        if pd.isna(val):
            return None
        try:
            as_float = float(val)
        except (TypeError, ValueError, OverflowError):
            # Not numeric: keep it as stripped text (it will not match the mapping).
            return str(val).strip()
        return int(as_float) if as_float.is_integer() else as_float

    # --- Drop rows where either label is null/NaN ---
    mask = ~(gdf[field_true].isna() | gdf[field_pred].isna())
    df = gdf.loc[mask, [field_true, field_pred]].copy()

    # --- Normalize, then translate ids to class names ---
    for column in (field_true, field_pred):
        df[column] = df[column].apply(normalize_label).map(mapping)

    # Values missing from the mapping turned into NaN above; remove those rows
    # so the metrics never see a mix of class names and NaN.
    df = df.dropna(subset=[field_true, field_pred])
    if df.empty:
        raise ValueError(
            "No rows left to evaluate after removing null and unmapped labels."
        )

    y_true = df[field_true]
    y_pred = df[field_pred]

    # --- Labels from the mapping that actually occur, in mapping-key order ---
    present = set(y_true) | set(y_pred)
    labels = [mapping[key] for key in sorted(mapping) if mapping[key] in present]

    # --- Confusion matrix ---
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_df = pd.DataFrame(cm, index=labels, columns=labels)
    cm_df.index.name = "true"
    cm_df.columns.name = "pred"

    # --- Metrics (precision/recall/F1 are support-weighted averages) ---
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average="weighted", zero_division=0),
        "recall": recall_score(y_true, y_pred, average="weighted", zero_division=0),
        "f1_score": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "kappa": cohen_kappa_score(y_true, y_pred)
    }
    metrics_df = pd.DataFrame.from_dict(metrics, orient="index", columns=["value"])

    # --- Write CSV ---
    # newline="" is what pandas asks for when to_csv receives an open handle;
    # without it Windows text mode doubles the carriage return of every row.
    with open(csv_path, "w", encoding="utf-8", newline="") as f:
        f.write("Confusion Matrix\n")
        cm_df.to_csv(f)
        f.write("\nMetrics\n")
        metrics_df.to_csv(f)

    return {
        "confusion_matrix": cm_df,
        "metrics": metrics_df,
        "labels": labels
    }


In [8]:
# Inputs for this run: layer to evaluate, the two label columns, and the report file.
gdf = gpd.read_file(
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\worldcover\arealstatistik_mapped_2021_center_points.gpkg"
)
field_true = "IPCC_AS_Id"  # column passed as the reference (true) labels
field_pred = "IPCC_WC_Id"  # column passed as the predicted labels
csv_path = (
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\worldcover\confusion_matrix_WC_2021_center_points.csv"
)

# Compute confusion matrix and metrics; the call also writes them to csv_path.
result = save_confusion_matrix(
    gdf=gdf,
    field_true=field_true,
    field_pred=field_pred,
    csv_path=csv_path,
)

# Print both result tables, one after the other.
print(result["confusion_matrix"], result["metrics"], sep="\n")


pred         Forest land  Cropland  Grassland  Wetlands  Settlements  \
true                                                                   
Forest land      1299666      1525      88151      8905        23992   
Cropland           17201    252861     111940        38         2010   
Grassland         181980     24080    1124557       838        24042   
Wetlands            1080        15        831    140162          194   
Settlements        54315      5757      34312       342       105215   
Other Land         13421        25      76937      1823          146   

pred         Other Land  
true                     
Forest land        5621  
Cropland            343  
Grassland         51033  
Wetlands           1153  
Settlements        1304  
Other Land       473256  
              value
accuracy   0.822392
precision  0.822787
recall     0.822392
f1_score   0.819677
kappa      0.754341


In [9]:
# Inputs for this run: layer to evaluate, the two label columns, and the report file.
gdf = gpd.read_file(
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\worldcover\arealstatistik_mapped_2020_center_points.gpkg"
)
field_true = "IPCC_AS_Id"  # column passed as the reference (true) labels
field_pred = "IPCC_WC_Id"  # column passed as the predicted labels
csv_path = (
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\worldcover\confusion_matrix_WC_2020_center_points.csv"
)

# Compute confusion matrix and metrics; the call also writes them to csv_path.
result = save_confusion_matrix(
    gdf=gdf,
    field_true=field_true,
    field_pred=field_pred,
    csv_path=csv_path,
)

# Print both result tables, one after the other.
print(result["confusion_matrix"], result["metrics"], sep="\n")


pred         Forest land  Cropland  Grassland  Wetlands  Settlements  \
true                                                                   
Forest land      1292705      1228      99662      8343        15958   
Cropland           20926    223938     136028        13         1304   
Grassland         173275     18542    1118996       810        18202   
Wetlands            1090         9        970    140193          223   
Settlements        58543      4446      42031       282        90740   
Other Land         10308        13     105097      1797          164   

pred         Other Land  
true                     
Forest land        9964  
Cropland           2184  
Grassland         76705  
Wetlands            950  
Settlements        5203  
Other Land       448229  
              value
accuracy   0.802796
precision  0.804903
recall     0.802796
f1_score   0.798252
kappa      0.725821


In [None]:
# Inputs for this run: layer to evaluate, the two label columns, and the report file.
gdf = gpd.read_file(
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\worldcover\arealstatistik_mapped_2021_Max_Area.gpkg"
)
field_true = "IPCC_AS_Id"  # column passed as the reference (true) labels
field_pred = "IPCC_WC_Id"  # column passed as the predicted labels
csv_path = (
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\worldcover\confusion_matrix_WC_2021_Max_Area.csv"
)

# Compute confusion matrix and metrics; the call also writes them to csv_path.
result = save_confusion_matrix(
    gdf=gdf,
    field_true=field_true,
    field_pred=field_pred,
    csv_path=csv_path,
)

# Print both result tables, one after the other.
print(result["confusion_matrix"], result["metrics"], sep="\n")


In [None]:
# Inputs for this run: layer to evaluate, the two label columns, and the report file.
gdf = gpd.read_file(
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\worldcover\arealstatistik_mapped_2020_Max_Area.gpkg"
)
field_true = "IPCC_AS_Id"  # column passed as the reference (true) labels
field_pred = "IPCC_WC_Id"  # column passed as the predicted labels
csv_path = (
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\worldcover\confusion_matrix_WC_2020_Max_Area.csv"
)

# Compute confusion matrix and metrics; the call also writes them to csv_path.
result = save_confusion_matrix(
    gdf=gdf,
    field_true=field_true,
    field_pred=field_pred,
    csv_path=csv_path,
)

# Print both result tables, one after the other.
print(result["confusion_matrix"], result["metrics"], sep="\n")


In [None]:
# Inputs for this run: layer to evaluate, the two label columns, and the report file.
gdf = gpd.read_file(
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\av\AV_As_Center_Pixel.gpkg"
)
field_true = "IPCC_AS_Id"  # column passed as the reference (true) labels
field_pred = "IPCC_AV_Id"  # column passed as the predicted labels
csv_path = (
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\av\confusion_matrix_AV_Center_Pixel.csv"
)

# Compute confusion matrix and metrics; the call also writes them to csv_path.
result = save_confusion_matrix(
    gdf=gdf,
    field_true=field_true,
    field_pred=field_pred,
    csv_path=csv_path,
)

# Print both result tables, one after the other.
print(result["confusion_matrix"], result["metrics"], sep="\n")


pred         Forest land  Cropland  Grassland  Wetlands  Settlements  \
true                                                                   
Forest land         6087       262         77        50           73   
Cropland               5      3066          8         0           25   
Grassland             34      6515        254         0           58   
Wetlands              35        44        435      3273            4   
Settlements           65      1295        133        10         1954   
Other Land             2         3          1         0            0   

pred         Other Land  
true                     
Forest land           7  
Cropland              2  
Grassland             7  
Wetlands              2  
Settlements          88  
Other Land            0  
              value
accuracy   0.612968
precision  0.677793
recall     0.612968
f1_score   0.584610
kappa      0.528665


In [None]:
# Inputs for this run: layer to evaluate, the two label columns, and the report file.
gdf = gpd.read_file(
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\av\AV_As_Maximal_Area.gpkg"
)
field_true = "IPCC_AS_Id"  # column passed as the reference (true) labels
field_pred = "IPCC_AV_Id"  # column passed as the predicted labels
csv_path = (
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\av\confusion_matrix_AV_Maximal_Area.csv"
)

# Compute confusion matrix and metrics; the call also writes them to csv_path.
result = save_confusion_matrix(
    gdf=gdf,
    field_true=field_true,
    field_pred=field_pred,
    csv_path=csv_path,
)

# Print both result tables, one after the other.
print(result["confusion_matrix"], result["metrics"], sep="\n")


In [10]:
# Inputs for this run: layer to evaluate, the two label columns, and the report file.
gdf = gpd.read_file(
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\corine\areal_corine_ipcc_2012_center.gpkg"
)
field_true = "IPCC_AS_Id"  # column passed as the reference (true) labels
field_pred = "IPCC_CORINE_Id"  # column passed as the predicted labels
csv_path = (
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\corine\confusion_matrix_corine_2012_Center_pixel.csv"
)

# Compute confusion matrix and metrics; the call also writes them to csv_path.
result = save_confusion_matrix(
    gdf=gdf,
    field_true=field_true,
    field_pred=field_pred,
    csv_path=csv_path,
)

# Print both result tables, one after the other.
print(result["confusion_matrix"], result["metrics"], sep="\n")


pred         Forest land  Cropland  Grassland  Wetlands  Settlements  \
true                                                                   
Forest land      1065883     63796     136390      1865        16539   
Cropland           15682    365605      40119       447        27927   
Grassland         166219    194205     732520      1722        40939   
Wetlands           14732      8124      12867    142732         4954   
Settlements        24277     57527      35101      1839       189776   
Other Land         20609       567      36760       136          312   

pred         Other Land  
true                     
Forest land        8429  
Cropland             13  
Grassland        172051  
Wetlands           5204  
Settlements         488  
Other Land       522715  
              value
accuracy   0.731213
precision  0.744702
recall     0.731213
f1_score   0.729092
kappa      0.653422


In [11]:
# Inputs for this run: layer to evaluate, the two label columns, and the report file.
gdf = gpd.read_file(
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\corine\areal_corine_ipcc_2018_center.gpkg"
)
field_true = "IPCC_AS_Id"  # column passed as the reference (true) labels
field_pred = "IPCC_CORINE_Id"  # column passed as the predicted labels
csv_path = (
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\corine\confusion_matrix_corine_2018_Center_pixel.csv"
)

# Compute confusion matrix and metrics; the call also writes them to csv_path.
result = save_confusion_matrix(
    gdf=gdf,
    field_true=field_true,
    field_pred=field_pred,
    csv_path=csv_path,
)

# Print both result tables, one after the other.
print(result["confusion_matrix"], result["metrics"], sep="\n")


pred         Forest land  Cropland  Grassland  Wetlands  Settlements  \
true                                                                   
Forest land      1074182    130804      80117      1919        16552   
Cropland           15118    394245         10       394        22526   
Grassland         158477    496025     426261      1691        36697   
Wetlands           14971     14013       7299    142802         4987   
Settlements        25250     95432       3729      1856       200387   
Other Land         19393      1176      33625       121          248   

pred         Other Land  
true                     
Forest land        9788  
Cropland             12  
Grassland        177715  
Wetlands           5424  
Settlements         502  
Other Land       515323  
              value
accuracy   0.666784
precision  0.741592
recall     0.666784
f1_score   0.660602
kappa      0.582401


In [14]:
# Inputs for this run: layer to evaluate, the two label columns, and the report file.
gdf = gpd.read_file(
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\corine\areal_corine_ipcc_2012_max_area.gpkg"
)
field_true = "IPCC_AS_Id"  # column passed as the reference (true) labels
field_pred = "IPCC_CORINE_Id"  # column passed as the predicted labels
csv_path = (
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\corine\confusion_matrix_corine_2012_max_area.csv"
)

# Compute confusion matrix and metrics; the call also writes them to csv_path.
result = save_confusion_matrix(
    gdf=gdf,
    field_true=field_true,
    field_pred=field_pred,
    csv_path=csv_path,
)

# Print both result tables, one after the other.
print(result["confusion_matrix"], result["metrics"], sep="\n")


pred         Forest land  Cropland  Grassland  Wetlands  Settlements  \
true                                                                   
Forest land      1107794     48943     113724      1149        13851   
Cropland            7117    376790      40970       168        24739   
Grassland         138121    202538     756161      1223        39203   
Wetlands           13806      7470      12618    145318         4168   
Settlements        20299     53989      34858       721       198656   
Other Land         19872       573      34757        70          323   

pred         Other Land  
true                     
Forest land        7441  
Cropland              9  
Grassland        170410  
Wetlands           5233  
Settlements         485  
Other Land       525504  
              value
accuracy   0.753250
precision  0.767866
recall     0.753250
f1_score   0.751417
kappa      0.681859


In [15]:
# Inputs for this run: layer to evaluate, the two label columns, and the report file.
gdf = gpd.read_file(
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\corine\areal_corine_ipcc_2018_max_area.gpkg"
)
field_true = "IPCC_AS_Id"  # column passed as the reference (true) labels
field_pred = "IPCC_CORINE_Id"  # column passed as the predicted labels
csv_path = (
    r"C:\Users\st1179523\Documents\GitHub\landcover_analysis\data\analysis\corine\confusion_matrix_corine_2018_max_area.csv"
)

# Compute confusion matrix and metrics; the call also writes them to csv_path.
result = save_confusion_matrix(
    gdf=gdf,
    field_true=field_true,
    field_pred=field_pred,
    csv_path=csv_path,
)

# Print both result tables, one after the other.
print(result["confusion_matrix"], result["metrics"], sep="\n")


pred         Forest land  Cropland  Grassland  Wetlands  Settlements  \
true                                                                   
Forest land      1115808     50144     123645      1207        13817   
Cropland            6827    366965      39088       141        19276   
Grassland         130453    206610     747803      1192        34566   
Wetlands           14061      7622      12775    145379         4206   
Settlements        21145     57853      37226       721       209708   
Other Land         18707       495      32371        58          254   

pred         Other Land  
true                     
Forest land        8741  
Cropland              8  
Grassland        176242  
Wetlands           5453  
Settlements         503  
Other Land       518001  
              value
accuracy   0.751662
precision  0.768473
recall     0.751662
f1_score   0.750357
kappa      0.679788
