In [0]:
# 05_feature_water_bodies_v2.py
from pyspark.sql import functions as F
from graphframes import GraphFrame
from utils.site_lock import acquire_site_lock, release_site_lock

# ==================================================
# Unity Catalog context
# ==================================================
# Select catalog `main` and schema `demo` for the session; the table names
# used further down are unqualified.
for _context_stmt in ("USE CATALOG main", "USE SCHEMA demo"):
    spark.sql(_context_stmt)

# ==================================================
# CONFIG
# ==================================================
# Run-time parameters come from the Spark conf; the second argument is the
# fallback used when the key is not set.
SITE_ID = spark.conf.get("pipeline.siteId", "wellington_cbd")
JOB_RUN_ID = spark.conf.get("spark.databricks.job.runId", "manual")

# Source table holding the per-cell surface statistics.
CELLS_TABLE = "surface_cells_v2"

# Destination Delta table and the storage path it is written to.
OUTPUT_TABLE = "features_water_bodies_v2"
OUTPUT_PATH = "abfss://processed@trimblegeospatialdemo.dfs.core.windows.net/features_water_bodies_v2"

# A cell is treated as water when its waterPointRatio is >= this value.
WATER_CELL_THRESHOLD = 0.6

# ==================================================
# Acquire site lock
# ==================================================
# Done before entering the try block: if acquire_site_lock raises, the
# release in `finally` is never reached.
_lock_request = {
    "spark": spark,
    "site_id": SITE_ID,
    "locked_by": JOB_RUN_ID,
    "ttl_minutes": 120,
}
acquire_site_lock(**_lock_request)

try:
    # ==================================================
    # 1) Read water cells only
    # ==================================================
    # Keep this site's cells whose water ratio meets the threshold and project
    # only the columns the later steps read.
    df_cells = (
        spark.table(CELLS_TABLE)
             .filter(F.col("siteId") == SITE_ID)
             .filter(F.col("waterPointRatio") >= F.lit(WATER_CELL_THRESHOLD))
             .select(
                 "siteId",
                 "tileId",
                 "tileX", "tileY",          # assumed present from tiling stage
                 "cellX", "cellY",
                 "cellSizeM",
                 "minZ", "meanZ", "maxZ"
             )
    )

    # Emptiness is checked through the DataFrame API (head(1) returns a list
    # with at most one Row). Going through `.rdd` is avoided because RDD APIs
    # are not available on Unity Catalog standard/shared access-mode compute.
    if not df_cells.head(1):
        raise RuntimeError("No water cells found for site")

    # ==================================================
    # 2) Compute global cell coordinates
    # ==================================================
    # Per axis: global = tile index * stride + cell index. The stride is a
    # fixed multiplier, not the number of cells in a tile.
    #
    # NOTE(review): if cellX/cellY are tile-local indices, two cells on either
    # side of a tile boundary get global coordinates that differ by exactly 1
    # only when a tile is 100000 cells wide. Otherwise the "differ by 1"
    # adjacency test in step 3 will not connect water bodies across tiles.
    # Confirm the tile dimensions and replace the stride with cells-per-tile
    # if cross-tile connectivity is intended.
    tile_stride = F.lit(100000)

    df_cells_global = (
        df_cells
        .withColumn("globalCellX", F.col("tileX") * tile_stride + F.col("cellX"))
        .withColumn("globalCellY", F.col("tileY") * tile_stride + F.col("cellY"))
        # String key "<globalCellX>_<globalCellY>" used as the graph vertex id.
        .withColumn(
            "vertexId",
            F.concat_ws("_", F.col("globalCellX"), F.col("globalCellY"))
        )
    )

    # ==================================================
    # 3) Build adjacency edges (4-neighbour)
    # ==================================================
    # Every cell proposes the coordinates of its four axis-aligned neighbours;
    # an equi-join against the cell set keeps only proposals that land on
    # another water cell. This yields one directed edge per ordered pair of
    # adjacent cells.
    #
    # Joining on plain column names avoids two problems of a self-join written
    # as `a.col == b.col` on aliases of one DataFrame: both sides resolve to
    # the same underlying attribute (ambiguous self-join), and an OR of
    # non-equality predicates cannot be executed as a hash/sort-merge join.
    neighbour_offsets = F.array(*[
        F.struct(F.lit(dx).alias("dx"), F.lit(dy).alias("dy"))
        for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1))
    ])

    edge_sources = (
        df_cells_global
        .select(
            F.col("vertexId").alias("src"),
            "globalCellX",
            "globalCellY",
            F.explode(neighbour_offsets).alias("offset"),
        )
        .select(
            "src",
            (F.col("globalCellX") + F.col("offset.dx")).alias("nbrX"),
            (F.col("globalCellY") + F.col("offset.dy")).alias("nbrY"),
        )
    )

    edge_targets = df_cells_global.select(
        F.col("vertexId").alias("dst"),
        F.col("globalCellX").alias("nbrX"),
        F.col("globalCellY").alias("nbrY"),
    )

    edges = (
        edge_sources
        .join(edge_targets, on=["nbrX", "nbrY"], how="inner")
        .select("src", "dst")
        .distinct()
    )

    # GraphFrames requires the vertex DataFrame to expose its key in a column
    # named "id"; the join below on `components.id` relies on the same name.
    vertices = df_cells_global.select(F.col("vertexId").alias("id")).distinct()

    # ==================================================
    # 4) Connected components (water bodies)
    # ==================================================
    # NOTE(review): GraphFrames' default connectedComponents algorithm needs a
    # Spark checkpoint directory; none is set in this script, so confirm it is
    # configured on the cluster.
    g = GraphFrame(vertices, edges)
    components = g.connectedComponents()

    # Attach each cell's component label and expose it as waterBodyId.
    df_labeled = (
        df_cells_global
        .join(components, df_cells_global.vertexId == components.id)
        .withColumnRenamed("component", "waterBodyId")
    )

    # ==================================================
    # 5) Aggregate to water body features
    # ==================================================
    # One output row per (siteId, waterBodyId).
    n_cells = F.count("*")
    cell_side = F.first("cellSizeM")

    body_metrics = [
        n_cells.alias("cellCount"),
        # Area = number of cells x cell side squared.
        (n_cells * cell_side * cell_side).alias("areaM2"),
        # Elevation range and mean over the body's cells.
        F.min("minZ").alias("minZ"),
        F.max("maxZ").alias("maxZ"),
        F.avg("meanZ").alias("meanZ"),
        # Bounding box, expressed in global cell coordinates.
        F.min("globalCellX").alias("bboxMinX"),
        F.min("globalCellY").alias("bboxMinY"),
        F.max("globalCellX").alias("bboxMaxX"),
        F.max("globalCellY").alias("bboxMaxY"),
    ]

    per_body = df_labeled.groupBy("siteId", "waterBodyId").agg(*body_metrics)
    df_features = per_body.withColumn("computedAt", F.current_timestamp())

    # ==================================================
    # 6) Write latest snapshot by site
    # ==================================================
    # Overwrite mode with a replaceWhere predicate on siteId, written as a
    # Delta table partitioned by siteId at OUTPUT_PATH.
    (
        df_features.write
            .format("delta")
            .mode("overwrite")
            .option("replaceWhere", f"siteId = '{SITE_ID}'")
            .option("path", OUTPUT_PATH)
            .partitionBy("siteId")
            .saveAsTable(OUTPUT_TABLE)
    )

    # The check mark was previously stored as mis-decoded UTF-8 bytes.
    print("✅ features_water_bodies_v2 written successfully")

finally:
    release_site_lock(
        spark=spark,
        site_id=SITE_ID,
        locked_by=JOB_RUN_ID
    )
