**(Dec.3) This code has been refactored for post-processing only and no longer supports evaluation**
* For polygonization please refer to `post_processing_stage_1_1129.py`, `post_processing_stage_2_0829.py`, `post_processing_stage_3_1017.py`, `post_processing_stage_4_0827.py`

* This code includes grouping adjacent polygons and dropping small objects

* Output is `.csv` file for econometric analysis and `.gpkg` for visualization using QGIS

    *Note: For evaluation using GT labels(3 images), refer to <br> (1)`/shared/data/climateplus2025/Post_Processing_with_Evaluation_1024_Nov20` and then <br>(2) `/shared/data/climateplus2025/Postprocessing_for_poster_3_images_1024_Nov20`*

**Load data and align CRS**

In [None]:
import geopandas as gpd
import pandas as pd
import glob
import os
from shapely.ops import unary_union
from shapely.geometry import Polygon, MultiPolygon
import numpy as np
import matplotlib.pyplot as plt

# Load all prediction outputs (after post-processing stage 3) from our model
pred_dir = "/shared/data/climateplus2025/Postprocessing_EntireDataset_CapeTown_Image_2018_2023_Mask2Former_1024_Nov29/2021/output_stage_3"
# glob returns paths in arbitrary (filesystem-dependent) order; sort so that the
# concatenated row order, and every index derived from it, is reproducible.
gpkg_files = sorted(glob.glob(os.path.join(pred_dir, "*.gpkg")))
print(f"Found {len(gpkg_files)} prediction GPKG files")

# Fail early with a clear message instead of an IndexError on prd_list[0] below.
if not gpkg_files:
    raise FileNotFoundError(f"No prediction GPKG files found in {pred_dir}")

prd_list = [gpd.read_file(f) for f in gpkg_files]

# Align CRS: the first file's CRS is the target; any file that declares a
# different CRS is reprojected so all geometries share one coordinate system.
target_crs = prd_list[0].crs
prd_list = [
    g if g.crs is None or target_crs is None or g.crs == target_crs else g.to_crs(target_crs)
    for g in prd_list
]

prd = gpd.GeoDataFrame(
    pd.concat(prd_list, ignore_index=True),
    geometry="geometry",
    crs=target_crs,  # keep CRS from first file
)

# display(gt.head(3))
# sample(n) raises ValueError when n exceeds the row count, so cap the preview.
display(prd.sample(min(3, len(prd))))

print("=== CRS diagnostics ===")
# print(f"GT CRS   : {gt.crs}")
print(f"Pred CRS : {prd.crs}")

Found 1573 prediction GPKG files


**Data Preparation: Cleaning noise** (Prediction Only)

* Removes non-polygon geometries (points, lines, noisy collections) and tiny slivers, retaining only Polygon/MultiPolygon geometries for the post-processing steps below.

In [None]:
# Relative area tolerance. clean_predictions() turns it into an absolute
# threshold: eps_area = eps_factor * (largest area_m2 among the predictions).
# No ground-truth (GT) data is involved.
EPS_FACTOR = 1e-6

# --------------------------------------------------------------
# 1. Prediction-Only Cleaning Function
#    - Remove tiny polygons (area_m2 not larger than eps_area)
#    - Clean multipolygons (drop parts not larger than eps_area)
#    - Within each (image_id, label) group, when two predictions overlap
#      by more than eps_area, drop the smaller of the two
# --------------------------------------------------------------

from shapely.geometry import Polygon, MultiPolygon
from shapely.ops import unary_union

def clean_predictions(pred_gdf, eps_factor=1e-6):
    """
    Clean prediction geometries only (no ground truth involved).

    Steps:
      A. Drop rows whose ``area_m2`` is not larger than
         ``eps_area = eps_factor * max(area_m2)`` (slivers).
      B. For MultiPolygons, drop parts whose area is not larger than
         ``eps_area`` and re-union the remaining parts. Rows whose geometry
         is missing, empty, too small, or not a Polygon/MultiPolygon are
         removed.
      C. Within each (image_id, label) group, whenever two predictions
         overlap by more than ``eps_area``, the one with the smaller
         geometry area is removed (on a tie, the later one is removed).

    Parameters
    ----------
    pred_gdf : DataFrame / GeoDataFrame
        Must contain the columns ``area_m2``, ``geometry``, ``image_id``,
        ``label`` and ``prediction_id``. The input object is not modified.
    eps_factor : float
        Relative tolerance used to derive ``eps_area``.

    Returns
    -------
    DataFrame / GeoDataFrame
        Surviving rows with all metadata columns preserved and a fresh
        0..n-1 index. Rows come back ordered by (image_id, label) group.
        An empty frame with the same columns is returned when nothing
        survives.

    Notes
    -----
    Rows with a missing ``image_id`` or ``label`` are not returned, because
    pandas ``groupby`` skips NaN keys by default.
    """

    # --------------------------------------------
    # A. Remove extremely small polygons (slivers)
    # --------------------------------------------
    # Dynamic threshold, relative to the largest prediction
    eps_area = eps_factor * pred_gdf["area_m2"].max()
    pred_gdf = pred_gdf[pred_gdf["area_m2"] > eps_area].copy()

    # --------------------------------------------
    # B. Clean multipolygon fragments
    # --------------------------------------------
    def clean_geom(g):
        if g is None or g.is_empty:
            return None

        if isinstance(g, MultiPolygon):
            parts = [p for p in g.geoms if p.area > eps_area]
            return unary_union(parts) if parts else None

        if isinstance(g, Polygon) and g.area > eps_area:
            return g

        # Any other geometry type (points, lines, collections) is discarded
        return None

    pred_gdf["geometry"] = pred_gdf["geometry"].apply(clean_geom)
    pred_gdf = pred_gdf[pred_gdf["geometry"].notnull()].copy()

    # --------------------------------------------
    # C. Remove overlapping duplicates
    # --------------------------------------------
    cleaned_rows = []
    for _, group in pred_gdf.groupby(["image_id", "label"]):
        group = group.reset_index(drop=True)

        # Pull the values out once instead of calling .loc inside the
        # O(n^2) pair loop. No geometry is None here (filtered in step B).
        geoms = group["geometry"].tolist()
        areas = [g.area for g in geoms]
        pred_ids = group["prediction_id"].tolist()

        remove_ids = set()
        for i, g1 in enumerate(geoms):
            for j in range(i + 1, len(geoms)):
                inter = g1.intersection(geoms[j])
                if inter.is_empty:
                    continue

                # An overlap larger than eps_area means the two predictions
                # cover the same object: keep the larger one.
                # NOTE(review): earlier comments called this "tiny overlaps",
                # but the condition fires on overlaps ABOVE eps_area.
                if inter.area > eps_area:
                    if areas[i] < areas[j]:
                        remove_ids.add(pred_ids[i])
                    else:
                        remove_ids.add(pred_ids[j])

        cleaned_rows.append(group[~group["prediction_id"].isin(remove_ids)])

    # pd.concat([]) raises ValueError; return an empty frame that still has
    # the expected columns when every row was filtered out.
    if not cleaned_rows:
        return pred_gdf.reset_index(drop=True)

    return pd.concat(cleaned_rows, ignore_index=True)

# --------------------------------------------------------------
# 2. Run cleaning
# --------------------------------------------------------------

prd_cleaned = clean_predictions(prd, eps_factor=EPS_FACTOR)

print("Original prd size :", prd.shape)
print("Cleaned  prd size :", prd_cleaned.shape)
# DataFrame.sample(n) raises ValueError when n exceeds the number of rows,
# so cap the preview size for small (or fully filtered) results.
display(prd_cleaned.sample(min(10, len(prd_cleaned))))


Original prd size : (208370, 9)
Cleaned  prd size : (208370, 9)


Unnamed: 0,prediction_id,image_id,label,area_m2,polygon_centroid_pixel_y_x_,polygon_centroid_CRS_X_Y_,polygon_centroid_GPS_lat_lon_,vertex_count,geometry
189931,i_2022_RGB_8cm_W56D_15_pred_da43df1a,i_2022_RGB_8cm_W56D_15,PV_pool,0.0096,"[5562,12027]","[-50037.8,-3767445.0]","[-34.03291608,18.45817015]",7,"POLYGON ((-50037.8 -3767445.08, -50037.88 -376..."
191870,i_2022_RGB_8cm_W56D_24_pred_3a3fca6c,i_2022_RGB_8cm_W56D_24,PV_pool,15.0336,"[10709,7378]","[-51409.72,-3769856.76]","[-34.05459131,18.44317296]",176,"POLYGON ((-51408.92 -3769853.96, -51409 -37698..."
208221,i_2022_RGB_8cm_W67D_24_pred_77314c1f,i_2022_RGB_8cm_W67D_24,PV_pool,17.1616,"[3695,6957]","[-61443.4,-3779295.64]","[-34.13913836,18.33383612]",185,"POLYGON ((-61442.12 -3779292.6, -61442.2 -3779..."
197876,i_2022_RGB_8cm_W57B_5_pred_cbbaeefe,i_2022_RGB_8cm_W57B_5,PV_normal,1.3952,"[10456,7680]","[-50385.56,-3770836.52]","[-34.06347341,18.45420874]",62,"POLYGON ((-50385.4 -3770835.64, -50385.48 -377..."
161482,i_2022_RGB_8cm_W46D_3_pred_3014655f,i_2022_RGB_8cm_W46D_3,PV_heater,1.824,"[4667,11232]","[-42101.4,-3765373.4]","[-34.01458927,18.54420598]",66,"POLYGON ((-42101.32 -3765372.36, -42101.4 -376..."
130914,i_2022_RGB_8cm_W45C_23_pred_62e00c37,i_2022_RGB_8cm_W45C_23,PV_heater,1.9808,"[5334,1092]","[-47912.6,-3759426.76]","[-33.96073132,18.48162107]",67,"POLYGON ((-47913.24 -3759426.04, -47913.24 -37..."
203519,i_2022_RGB_8cm_W57D_18_pred_7f38a493,i_2022_RGB_8cm_W57D_18,PV_normal,5.584,"[7033,6614]","[-52470.84,-3778562.68]","[-34.13302022,18.43115552]",123,"POLYGON ((-52469.32 -3778560.92, -52469.4 -377..."
50688,i_2022_RGB_8cm_W31C_6_pred_aba8e38e,i_2022_RGB_8cm_W31C_6,PV_normal,10.6944,"[12178,5595]","[-39552.36,-3716974.28]","[-33.57835297,18.57396862]",156,"POLYGON ((-39551.24 -3716972.36, -39551.32 -37..."
522,i_2022_RGB_8cm_W07C_13_pred_98fc597d,i_2022_RGB_8cm_W07C_13,PV_pool,0.9984,"[1030,6126]","[-7509.88,-3777082.44]","[-34.12096632,18.9185953]",72,"POLYGON ((-7509.56 -3777081.96, -7509.56 -3777..."
174336,i_2022_RGB_8cm_W55C_18_pred_a23256ab,i_2022_RGB_8cm_W55C_18,PV_normal,13.0528,"[9151,11036]","[-57117.08,-3758732.12]","[-33.95400957,18.38208534]",159,"POLYGON ((-57117 -3758729.72, -57117.08 -37587..."


**Step2. Group neighboring Polygons**

- !!!(Dec.3) For merged polygons, the prediction ID, centroid, and geometry are newly computed !!!

- A threshold of 2 pixels (≈16 cm) was found to be optimal.
- When sampling polygons that were split along map boundaries, their gaps were typically around 2 pixels.
- If polygons farther apart than this threshold are merged, it can lead to unwanted merging of unrelated predicted polygons.

In [None]:
# --------------------------------------------------------------
# 3. Wrap the cleaned predictions as a GeoDataFrame
#    (same CRS as the raw predictions, fresh 0..n-1 index)
# --------------------------------------------------------------

prd_cleaned_gdf = gpd.GeoDataFrame(prd_cleaned, geometry="geometry", crs=prd.crs)
prd_cleaned_gdf = prd_cleaned_gdf.reset_index(drop=True)

print("prd_cleaned_gdf created:", prd_cleaned_gdf.shape)
display(prd_cleaned_gdf.head())


prd_cleaned_gdf created: (208370, 9)


Unnamed: 0,prediction_id,image_id,label,area_m2,polygon_centroid_pixel_y_x_,polygon_centroid_CRS_X_Y_,polygon_centroid_GPS_lat_lon_,vertex_count,geometry
0,i_2022_RGB_8cm_E07A_1_pred_b622f7c3,i_2022_RGB_8cm_E07A_1,PV_heater,1.2384,"[4238,3883]","[310.68,-3770339.08]","[-34.06020065,19.00336527]",47,"POLYGON ((310.52 -3770338.36, 310.44 -3770338...."
1,i_2022_RGB_8cm_E07A_11_pred_4fbe4790,i_2022_RGB_8cm_E07A_11,PV_heater,0.0992,"[12464,8596]","[687.72,-3772997.16]","[-34.08416361,19.00745144]",15,"POLYGON ((687.8 -3772996.92, 687.72 -3772997, ..."
2,i_2022_RGB_8cm_E07A_11_pred_1d42af25,i_2022_RGB_8cm_E07A_11,PV_heater,1.6288,"[12452,9872]","[789.8,-3772996.2]","[-34.08415489,19.00855747]",55,"POLYGON ((789.64 -3772995.48, 789.56 -3772995...."
3,i_2022_RGB_8cm_E07A_11_pred_d278ea03,i_2022_RGB_8cm_E07A_11,PV_heater,0.0096,"[4849,10878]","[870.28,-3772387.96]","[-34.07867142,19.00942887]",5,"POLYGON ((870.28 -3772387.88, 870.2 -3772387.9..."
4,i_2022_RGB_8cm_E07A_6_pred_228caff1,i_2022_RGB_8cm_E07A_6,PV_heater,0.2016,"[10285,3565]","[285.24,-3771822.84]","[-34.07357707,19.00309019]",21,"POLYGON ((285.24 -3771822.52, 285.16 -3771822...."


**Merge Same-Class Neighboring Polygons**
 - If 'True', merge them; if 'False', keep them separate.

In [None]:
# ===============================================
# 0. Imports
# ===============================================
import geopandas as gpd
import pandas as pd
from shapely.ops import unary_union
from tqdm.auto import tqdm
import numpy as np

# Maximum gap between two same-class polygons for them to count as neighbours.
# Used as the default `distance_thr` of find_neighboring_polygons_pred().
DIST_THRESHOLD = 0.16  # 2 pixels × 0.08m/pixel


# ===============================================
# FIX A: Convert centroid pixel values to float tuples
# ===============================================
def fix_pixel_centroid(val):
    """Coerce a pixel-centroid value into a ``(float, float)`` tuple.

    Accepted inputs:
      * a list or tuple: its first two items are converted with ``float``;
      * a string such as ``"(123.4, 567.8)"`` or ``"[5562,12027]"``: all
        round and square brackets are removed and the text is split on
        commas; exactly two parts are required.

    Anything else (``None``, numbers, malformed strings, sequences that are
    too short or hold non-numeric items) yields ``(nan, nan)`` rather than
    raising.

    Returns
    -------
    tuple[float, float]
        The parsed pair, or ``(np.nan, np.nan)`` when parsing fails.
    """
    nan_pair = (np.nan, np.nan)

    if isinstance(val, (list, tuple)):
        pair = val
    elif isinstance(val, str):
        # Strip every bracket character in one pass, then split "a,b"
        pair = val.translate(str.maketrans("", "", "()[]")).split(",")
        if len(pair) != 2:
            return nan_pair
    else:
        return nan_pair

    try:
        return (float(pair[0]), float(pair[1]))
    # Only conversion failures are treated as "unparseable". A bare `except:`
    # would also swallow KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, IndexError, OverflowError):
        return nan_pair


# ===============================================
# 1. Neighbor Search (Prediction Only)
# ===============================================
def find_neighboring_polygons_pred(
    df: gpd.GeoDataFrame,
    distance_thr: float = DIST_THRESHOLD,
    tol: float = 1e-6,
):
    """Find pairs of same-image, same-label polygons that lie close together.

    Polygons are only compared within the same (image_id, label) group.
    A pair is reported when the distance between the two geometries is at
    most ``distance_thr`` (plus a small numerical tolerance ``tol``).

    Why the tolerance: the threshold equals the typical gap being searched
    for (2 pixels), so with a strict ``dist < distance_thr`` test,
    floating-point noise in the coordinates decides whether a pair exactly
    2 pixels apart is found or missed.

    Parameters
    ----------
    df : GeoDataFrame
        Needs the columns ``geometry``, ``image_id``, ``label`` and
        ``prediction_id``. The index is reset internally; the reported
        ``idx1`` / ``idx2`` are positions in that 0..n-1 index.
    distance_thr : float
        Maximum gap between neighbours, in CRS units
        (presumably metres; confirm against the CRS).
    tol : float
        Numerical slack added to ``distance_thr``, in the same units.

    Returns
    -------
    pandas.DataFrame
        One row per neighbouring pair, with columns ``image_id``,
        ``label``, ``idx1``, ``idx2``, ``pred1``, ``pred2`` and ``dist_m``
        (distance rounded to 4 decimals). The columns are present even
        when no pair is found.

    Notes
    -----
    Rows with a missing ``image_id`` or ``label`` are never compared,
    because pandas ``groupby`` skips NaN keys by default.
    """
    df = gpd.GeoDataFrame(df, geometry="geometry", crs=df.crs)
    df = df.reset_index(drop=True)

    max_dist = distance_thr + tol
    columns = ["image_id", "label", "idx1", "idx2", "pred1", "pred2", "dist_m"]

    results = []
    grouped = list(df.groupby(["image_id", "label"]))

    for (img, lbl), group in tqdm(grouped, desc="Neighbor Search Groups"):

        # Keep the position in `df` as "orig_idx" before renumbering the group
        group = group.reset_index().rename(columns={"index": "orig_idx"})
        gdf = gpd.GeoDataFrame(group, geometry="geometry", crs=df.crs)
        sindex = gdf.sindex

        # Extract plain lists once; positional access is all the loops need
        geoms = gdf.geometry.tolist()
        orig_idx = gdf["orig_idx"].tolist()
        pred_ids = gdf["prediction_id"].tolist()

        for i, geom1 in enumerate(geoms):
            if geom1 is None or geom1.is_empty:
                continue

            # Candidate search only. A buffer approximates round corners with
            # straight segments and is therefore slightly smaller than the
            # true buffer; querying with its bounding box yields a superset
            # of candidates. The exact distance test below decides.
            cand_idx = sindex.query(
                geom1.buffer(max_dist).envelope,
                predicate="intersects"
            )

            for cid in cand_idx:
                # Report each unordered pair once, and skip the polygon itself
                if cid <= i:
                    continue

                geom2 = geoms[cid]
                if geom2 is None or geom2.is_empty:
                    continue

                dist = geom1.distance(geom2)

                if dist <= max_dist:
                    results.append({
                        "image_id": img,
                        "label": lbl,
                        "idx1": orig_idx[i],
                        "idx2": orig_idx[cid],
                        "pred1": pred_ids[i],
                        "pred2": pred_ids[cid],
                        "dist_m": float(round(dist, 4)),
                    })

    return pd.DataFrame(results, columns=columns)



# ===============================================
# 2. Merge Neighboring Polygons (Prediction Only)
# ===============================================
def _connected_components(edges):
    """Return the connected components of an undirected graph.

    ``edges`` is a list of ``(a, b)`` node pairs. Each component is returned
    as a sorted list of nodes; components are ordered by their first
    appearance in ``edges``.
    """
    parent = {}

    def find(node):
        parent.setdefault(node, node)
        root = node
        while parent[root] != root:
            root = parent[root]
        # Path compression: point every visited node straight at the root
        while parent[node] != root:
            parent[node], node = root, parent[node]
        return root

    for a, b in edges:
        root_a, root_b = find(a), find(b)
        if root_a != root_b:
            parent[root_b] = root_a

    components = {}  # root -> members; dicts keep insertion order
    for a, b in edges:
        for node in (a, b):
            components.setdefault(find(node), set()).add(node)

    return [sorted(members) for members in components.values()]


def merge_neighbors_pred_only(prd_gdf, neighbors_df):
    """Merge polygons that are linked (directly or transitively) as neighbours.

    Parameters
    ----------
    prd_gdf : GeoDataFrame
        Predictions. The index is reset internally, so ``idx1`` / ``idx2``
        in ``neighbors_df`` must refer to 0..n-1 row positions of this
        frame. The column ``polygon_centroid_pixel_y_x_`` must already hold
        numeric ``(y, x)`` pairs. The input object is not modified.
    neighbors_df : pandas.DataFrame
        Neighbour pairs with at least the columns ``idx1`` and ``idx2``.

    Returns
    -------
    GeoDataFrame
        Merged rows first (one per connected group), followed by all
        untouched rows in their original order. Two columns are added:
        ``source_pred_ids`` (the ``";;"``-joined source prediction ids) and
        ``merge_size`` (number of source polygons; 1 for untouched rows).
        When ``neighbors_df`` is empty, the re-indexed input is returned
        with those two columns added.

    Notes
    -----
    For merged rows:
      * ``geometry`` is the union of the sources; ``area_m2`` and
        ``polygon_centroid_CRS_X_Y_`` are recomputed from it;
      * ``polygon_centroid_GPS_lat_lon_`` is that centroid reprojected to
        EPSG:4326, stored as ``(lat, lon)``;
      * ``polygon_centroid_pixel_y_x_`` is the plain (unweighted) mean of
        the source pixel centroids;
      * ``image_id`` / ``label`` are taken from the first source row;
      * centroid columns are tuples, while untouched rows keep whatever
        representation the input had;
      * columns not listed here (e.g. ``vertex_count``) are not set and
        end up as NaN after concatenation.
    Source rows inside a group are processed in ascending index order, so
    the merged ``prediction_id`` is deterministic.
    """
    prd_gdf = prd_gdf.reset_index(drop=True)

    if neighbors_df.empty:
        print("No neighbors found → returning original df.")
        prd_gdf["source_pred_ids"] = prd_gdf["prediction_id"].astype(str)
        prd_gdf["merge_size"] = 1
        return prd_gdf

    edges = neighbors_df[["idx1", "idx2"]].values.tolist()
    merge_groups = _connected_components(edges)

    merged_rows = []
    centroids = []
    merged_indices = set()

    for members in merge_groups:

        subset = prd_gdf.loc[members]

        merged_geom = unary_union(subset.geometry.tolist())
        centroid = merged_geom.centroid
        centroids.append(centroid)

        # Unweighted mean of the source pixel centroids (NaN entries ignored)
        pix_list = subset["polygon_centroid_pixel_y_x_"].tolist()
        pix_y = float(np.nanmean([p[0] for p in pix_list]))
        pix_x = float(np.nanmean([p[1] for p in pix_list]))

        merged_pred_id = ";;".join(subset["prediction_id"].astype(str).tolist())

        merged_rows.append({
            "prediction_id": merged_pred_id,
            "image_id": subset.iloc[0]["image_id"],
            "label": subset.iloc[0]["label"],
            "area_m2": merged_geom.area,
            "polygon_centroid_pixel_y_x_": (pix_y, pix_x),
            "polygon_centroid_CRS_X_Y_": (centroid.x, centroid.y),
            # Placeholder keeps the column order; filled in after the loop
            "polygon_centroid_GPS_lat_lon_": None,
            "geometry": merged_geom,
            "source_pred_ids": merged_pred_id,
            "merge_size": len(members),
        })

        merged_indices.update(members)

    # Reproject all merged centroids in one call instead of once per group
    centroids_latlon = gpd.GeoSeries(centroids, crs=prd_gdf.crs).to_crs(4326)
    for row, point in zip(merged_rows, centroids_latlon):
        row["polygon_centroid_GPS_lat_lon_"] = (point.y, point.x)

    # Boolean mask keeps the untouched rows in their original order
    unmerged = prd_gdf.loc[~prd_gdf.index.isin(merged_indices)].copy()
    unmerged["source_pred_ids"] = unmerged["prediction_id"].astype(str)
    unmerged["merge_size"] = 1

    final_df = pd.concat(
        [pd.DataFrame(merged_rows), unmerged],
        ignore_index=True
    )

    return gpd.GeoDataFrame(final_df, geometry="geometry", crs=prd_gdf.crs)



# ===============================================
# 3. Run Pipeline
# ===============================================

# Step 1: normalise the pixel-centroid column to (float, float) tuples
# BEFORE merging, since the merge step averages these values.
pixel_centroids = prd_cleaned_gdf["polygon_centroid_pixel_y_x_"]
prd_cleaned_gdf["polygon_centroid_pixel_y_x_"] = pixel_centroids.apply(fix_pixel_centroid)

# Step 2: find neighbouring same-class polygon pairs
neighbors_df = find_neighboring_polygons_pred(prd_cleaned_gdf)
print("Neighbors found:", len(neighbors_df))
display(neighbors_df.head())

# Step 3: merge every group of neighbouring polygons into one row
final_gdf_grouping = merge_neighbors_pred_only(prd_cleaned_gdf, neighbors_df)
display(final_gdf_grouping.head())


  from .autonotebook import tqdm as notebook_tqdm
Neighbor Search Groups: 100%|██████████| 4454/4454 [00:47<00:00, 93.96it/s] 

Neighbors found: 410





Unnamed: 0,image_id,label,idx1,idx2,pred1,pred2,dist_m
0,i_2022_RGB_8cm_W07B_16,PV_heater,269,270,i_2022_RGB_8cm_W07B_16_pred_1f216efa,i_2022_RGB_8cm_W07B_16_pred_5002ff51,0.16
1,i_2022_RGB_8cm_W07C_12,PV_heater,377,378,i_2022_RGB_8cm_W07C_12_pred_a1855ce2,i_2022_RGB_8cm_W07C_12_pred_6171b170,0.16
2,i_2022_RGB_8cm_W07C_13,PV_heater,435,436,i_2022_RGB_8cm_W07C_13_pred_dec122a2,i_2022_RGB_8cm_W07C_13_pred_8c5691a6,0.16
3,i_2022_RGB_8cm_W07C_13,PV_heater,441,442,i_2022_RGB_8cm_W07C_13_pred_c68a1980,i_2022_RGB_8cm_W07C_13_pred_811ce58e,0.16
4,i_2022_RGB_8cm_W07C_7,PV_normal,790,791,i_2022_RGB_8cm_W07C_7_pred_88e75c59,i_2022_RGB_8cm_W07C_7_pred_5500625c,0.16


Unnamed: 0,prediction_id,image_id,label,area_m2,polygon_centroid_pixel_y_x_,polygon_centroid_CRS_X_Y_,polygon_centroid_GPS_lat_lon_,geometry,source_pred_ids,merge_size,vertex_count
0,i_2022_RGB_8cm_W07B_16_pred_1f216efa;;i_2022_R...,i_2022_RGB_8cm_W07B_16,PV_heater,0.1856,"(9759.5, 4128.5)","(-4669.666206896574, -3773780.84)","(-34.09121843887314, 18.9494000159533)","MULTIPOLYGON (((-4669.64 -3773780.84, -4669.56...",i_2022_RGB_8cm_W07B_16_pred_1f216efa;;i_2022_R...,2,
1,i_2022_RGB_8cm_W07C_12_pred_a1855ce2;;i_2022_R...,i_2022_RGB_8cm_W07C_12,PV_heater,0.512,"(1794.5, 10490.5)","(-8160.624499999944, -3777143.629)","(-34.121513069845, 18.911540865394816)","MULTIPOLYGON (((-8160.6 -3777143.32, -8160.52 ...",i_2022_RGB_8cm_W07C_12_pred_a1855ce2;;i_2022_R...,2,
2,i_2022_RGB_8cm_W07C_13_pred_dec122a2;;i_2022_R...,i_2022_RGB_8cm_W07C_13,PV_heater,0.2272,"(8824.0, 6380.0)","(-7489.8296713615655, -3777706.286009389)","(-34.12659051311745, 18.918807258834658)","MULTIPOLYGON (((-7489.8 -3777706.44, -7489.88 ...",i_2022_RGB_8cm_W07C_13_pred_dec122a2;;i_2022_R...,2,
3,i_2022_RGB_8cm_W07C_13_pred_c68a1980;;i_2022_R...,i_2022_RGB_8cm_W07C_13,PV_heater,0.2176,"(8122.0, 7011.5)","(-7439.0372549018975, -3777649.938823529)","(-34.126082899382205, 18.919358350724824)","MULTIPOLYGON (((-7438.84 -3777649.88, -7438.84...",i_2022_RGB_8cm_W07C_13_pred_c68a1980;;i_2022_R...,2,
4,i_2022_RGB_8cm_W07C_7_pred_88e75c59;;i_2022_RG...,i_2022_RGB_8cm_W07C_7,PV_normal,1.0784,"(8130.0, 1187.0)","(-8905.369851632011, -3776650.4649258135)","(-34.11706105426442, 18.903473072818173)","MULTIPOLYGON (((-8905.56 -3776649.8, -8905.48 ...",i_2022_RGB_8cm_W07C_7_pred_88e75c59;;i_2022_RG...,2,


**Drop small objects**

- Use the area of each polygon (`area_m2`), then drop polygons smaller than 1.7 m²

In [None]:
# Minimum polygon area to keep
AREA_THRESHOLD = 1.7  # m²
# AREA_THRESHOLD = 0.816  # m²  # (alternative threshold)

polygon_areas = final_gdf_grouping["area_m2"]

# Tallies: everything, those below the threshold, and the difference
total_count = len(final_gdf_grouping)
small_count = (polygon_areas < AREA_THRESHOLD).sum()
remaining_count = total_count - small_count

print("=== Area-based Filtering Summary ===")
print(f"Total polygons before filtering : {total_count}")
print(f"Polygons < {AREA_THRESHOLD} m²     : {small_count}")
print(f"Polygons ≥ {AREA_THRESHOLD} m²     : {remaining_count}")

# Keep only polygons at or above the threshold
keep_mask = polygon_areas >= AREA_THRESHOLD
remained_polygons = final_gdf_grouping[keep_mask].copy()

display(remained_polygons.head(10))

=== Area-based Filtering Summary ===
Total polygons before filtering : 207960
Polygons < 1.7 m²     : 82484
Polygons ≥ 1.7 m²     : 125476


Unnamed: 0,prediction_id,image_id,label,area_m2,polygon_centroid_pixel_y_x_,polygon_centroid_CRS_X_Y_,polygon_centroid_GPS_lat_lon_,geometry,source_pred_ids,merge_size,vertex_count
6,i_2022_RGB_8cm_W12A_21_pred_0c35aa9c;;i_2022_R...,i_2022_RGB_8cm_W12A_21,PV_normal,139.536,"(7483.5, 4833.5)","(-19612.268518747875, -3724603.0627389764)","(-33.647685654821196, 18.788580075140565)","MULTIPOLYGON (((-19612.44 -3724594.12, -19612....",i_2022_RGB_8cm_W12A_21_pred_0c35aa9c;;i_2022_R...,2,
7,i_2022_RGB_8cm_W16C_20_pred_a792fa67;;i_2022_R...,i_2022_RGB_8cm_W16C_20,PV_pool,14.5568,"(9482.0, 4929.5)","(-15605.72829486334, -3768757.351936686)","(-34.04582487604924, 18.83098830531666)","MULTIPOLYGON (((-15605.24 -3768759.64, -15605....",i_2022_RGB_8cm_W16C_20_pred_a792fa67;;i_2022_R...,2,
8,i_2022_RGB_8cm_W16C_24_pred_1e8c4072;;i_2022_R...,i_2022_RGB_8cm_W16C_24,PV_heater,2.72,"(9447.5, 11644.5)","(-16068.011545098068, -3769755.918588236)","(-34.05482016667615, 18.82596335554246)","MULTIPOLYGON (((-16067.56 -3769755.08, -16067....",i_2022_RGB_8cm_W16C_24_pred_1e8c4072;;i_2022_R...,2,
9,i_2022_RGB_8cm_W16C_24_pred_14290527;;i_2022_R...,i_2022_RGB_8cm_W16C_24,PV_pool,22.6432,"(5502.5, 4461.0)","(-16644.287348784612, -3769439.402204636)","(-34.05195771607081, 18.81972763516447)","MULTIPOLYGON (((-16641.88 -3769441, -16641.8 -...",i_2022_RGB_8cm_W16C_24_pred_14290527;;i_2022_R...,2,
11,i_2022_RGB_8cm_W16D_22_pred_19e575a7;;i_2022_R...,i_2022_RGB_8cm_W16D_22,PV_heater,5.6448,"(3577.0, 7765.5)","(-13378.73664399092, -3769286.2264550263)","(-34.05062358302191, 18.855098677739434)","MULTIPOLYGON (((-13380.68 -3769285.56, -13380....",i_2022_RGB_8cm_W16D_22_pred_19e575a7;;i_2022_R...,2,
12,i_2022_RGB_8cm_W16D_22_pred_ef3e4fd8;;i_2022_R...,i_2022_RGB_8cm_W16D_22,PV_normal,14.976,"(11447.5, 1062.5)","(-13915.998723646733, -3769916.069732194)","(-34.05629475156223, 18.849269711912132)","MULTIPOLYGON (((-13916.04 -3769913.16, -13915....",i_2022_RGB_8cm_W16D_22_pred_ef3e4fd8;;i_2022_R...,2,
13,i_2022_RGB_8cm_W17A_18_pred_6376dc62;;i_2022_R...,i_2022_RGB_8cm_W17A_18,PV_normal,32.5696,"(12161.0, 9477.0)","(-17239.258211829456, -3773972.8072548574)","(-34.09281746189735, 18.81319391643389)","MULTIPOLYGON (((-17246.92 -3773974.36, -17246....",i_2022_RGB_8cm_W17A_18_pred_6376dc62;;i_2022_R...,2,
15,i_2022_RGB_8cm_W17A_9_pred_61aca7b4;;i_2022_RG...,i_2022_RGB_8cm_W17A_9,PV_normal,188.9312,"(6004.0, 7913.0)","(-16371.882781795739, -3771478.829958622)","(-34.07034781471983, 18.822639709211934)","MULTIPOLYGON (((-16362.76 -3771481.16, -16362....",i_2022_RGB_8cm_W17A_9_pred_61aca7b4;;i_2022_RG...,2,
18,i_2022_RGB_8cm_W17B_13_pred_ed3112ad;;i_2022_R...,i_2022_RGB_8cm_W17B_13,PV_normal,10.784,"(3531.0, 7927.5)","(-12363.412249258143, -3772283.1479248214)","(-34.077653880806466, 18.86605283885253)","MULTIPOLYGON (((-12367.56 -3772281.64, -12367....",i_2022_RGB_8cm_W17B_13_pred_ed3112ad;;i_2022_R...,2,
19,i_2022_RGB_8cm_W17B_17_pred_9af845e9;;i_2022_R...,i_2022_RGB_8cm_W17B_17,PV_normal,23.1232,"(9496.0, 3157.0)","(-13747.832165328891, -3773761.5090100523)","(-34.090964275921934, 18.85103051940416)","MULTIPOLYGON (((-13747 -3773758.44, -13747 -37...",i_2022_RGB_8cm_W17B_17_pred_9af845e9;;i_2022_R...,2,


**Save output**
- `.gpkg` for visualization
- `.csv` for subsequent analysis


In [None]:
import os

# ----------------------------------------------------
# 1. Output folder (created if missing) and target paths
# ----------------------------------------------------
output_dir = "/shared/data/climateplus2025/Postprocessing_EntireDataset_CapeTown_Image_2018_2023_Mask2Former_1024_Nov29/2021/output_post_processing_polygonization_grouping_drop_small_objects"
os.makedirs(output_dir, exist_ok=True)

csv_path = os.path.join(output_dir, "prediction_merged_2021_final.csv")
gpkg_path = os.path.join(output_dir, "prediction_merged_2021_final_visualization.gpkg")
print("Saving files to:", output_dir)

# ----------------------------------------------------
# 2. CSV: every column except the geometry
# ----------------------------------------------------
attributes_only = remained_polygons.drop(columns="geometry")
attributes_only.to_csv(csv_path, index=False)
print(f"CSV saved → {csv_path}")

# ----------------------------------------------------
# 3. GPKG: the full GeoDataFrame, geometry included
# ----------------------------------------------------
remained_polygons.to_file(gpkg_path, driver="GPKG")
print(f"GPKG saved → {gpkg_path}")

# Quick look at what was written
display(remained_polygons.head(5))

Saving files to: /shared/data/climateplus2025/Postprocessing_EntireDataset_CapeTown_Image_2018_2023_Mask2Former_1024_Nov29/2022/output_post_processing_polygonization_grouping_drop_small_objects
CSV saved → /shared/data/climateplus2025/Postprocessing_EntireDataset_CapeTown_Image_2018_2023_Mask2Former_1024_Nov29/2022/output_post_processing_polygonization_grouping_drop_small_objects/prediction_merged_2022_final.csv
GPKG saved → /shared/data/climateplus2025/Postprocessing_EntireDataset_CapeTown_Image_2018_2023_Mask2Former_1024_Nov29/2022/output_post_processing_polygonization_grouping_drop_small_objects/prediction_merged_2022_final_visualization.gpkg


Unnamed: 0,prediction_id,image_id,label,area_m2,polygon_centroid_pixel_y_x_,polygon_centroid_CRS_X_Y_,polygon_centroid_GPS_lat_lon_,geometry,source_pred_ids,merge_size,vertex_count
6,i_2022_RGB_8cm_W12A_21_pred_0c35aa9c;;i_2022_R...,i_2022_RGB_8cm_W12A_21,PV_normal,139.536,"(7483.5, 4833.5)","(-19612.268518747875, -3724603.0627389764)","(-33.647685654821196, 18.788580075140565)","MULTIPOLYGON (((-19612.44 -3724594.12, -19612....",i_2022_RGB_8cm_W12A_21_pred_0c35aa9c;;i_2022_R...,2,
7,i_2022_RGB_8cm_W16C_20_pred_a792fa67;;i_2022_R...,i_2022_RGB_8cm_W16C_20,PV_pool,14.5568,"(9482.0, 4929.5)","(-15605.72829486334, -3768757.351936686)","(-34.04582487604924, 18.83098830531666)","MULTIPOLYGON (((-15605.24 -3768759.64, -15605....",i_2022_RGB_8cm_W16C_20_pred_a792fa67;;i_2022_R...,2,
8,i_2022_RGB_8cm_W16C_24_pred_1e8c4072;;i_2022_R...,i_2022_RGB_8cm_W16C_24,PV_heater,2.72,"(9447.5, 11644.5)","(-16068.011545098068, -3769755.918588236)","(-34.05482016667615, 18.82596335554246)","MULTIPOLYGON (((-16067.56 -3769755.08, -16067....",i_2022_RGB_8cm_W16C_24_pred_1e8c4072;;i_2022_R...,2,
9,i_2022_RGB_8cm_W16C_24_pred_14290527;;i_2022_R...,i_2022_RGB_8cm_W16C_24,PV_pool,22.6432,"(5502.5, 4461.0)","(-16644.287348784612, -3769439.402204636)","(-34.05195771607081, 18.81972763516447)","MULTIPOLYGON (((-16641.88 -3769441, -16641.8 -...",i_2022_RGB_8cm_W16C_24_pred_14290527;;i_2022_R...,2,
11,i_2022_RGB_8cm_W16D_22_pred_19e575a7;;i_2022_R...,i_2022_RGB_8cm_W16D_22,PV_heater,5.6448,"(3577.0, 7765.5)","(-13378.73664399092, -3769286.2264550263)","(-34.05062358302191, 18.855098677739434)","MULTIPOLYGON (((-13380.68 -3769285.56, -13380....",i_2022_RGB_8cm_W16D_22_pred_19e575a7;;i_2022_R...,2,
