In [0]:
%python
# 05_feature_building_candidates_v2.py
from pyspark.sql import functions as F
from utils.site_lock import acquire_site_lock, release_site_lock

# ==================================================
# Unity Catalog context
# Unqualified table names used below resolve against
# catalog `main`, schema `demo`.
# ==================================================
spark.sql("USE CATALOG main")
spark.sql("USE SCHEMA demo")

# ==================================================
# CONFIG
# ==================================================
# Site to process: read from the Spark conf key "pipeline.siteId",
# falling back to "wellington_cbd" when the key is not set.
SITE_ID = spark.conf.get("pipeline.siteId", "wellington_cbd")

# Input table (surface patches), output table name, and the storage
# location passed as the Delta "path" option when the output is written.
PATCHES_TABLE = "surface_patches_v2"
OUTPUT_TABLE  = "features_building_candidates_v2"
OUTPUT_PATH   = "abfss://processed@trimblegeospatialdemo.dfs.core.windows.net/features_building_candidates_v2"

# Thresholds (tunable)
# Units are presumably metres / square metres (per the _M2 suffix) — TODO confirm.
# NOTE(review): MIN_PATCH_AREA_M2 is not referenced anywhere in the visible
# script — confirm whether an area filter is missing or the constant is stale.
MIN_PATCH_AREA_M2       = 20.0
# Lower bound on heightAboveGround (meanZ - groundZ_ref) for a candidate.
MIN_HEIGHT_ABOVE_GROUND = 2.5
# Upper bound on heightRange (maxZ - minZ) for a candidate.
MAX_HEIGHT_RANGE        = 3.0
# Lower bound on the patch's pointsUsed value for a candidate.
MIN_POINTS_USED         = 200

# Identifier recorded as the site-lock owner; "manual" when the
# "spark.databricks.job.runId" conf key is not set.
JOB_RUN_ID = spark.conf.get("spark.databricks.job.runId", "manual")

# ==================================================
# Acquire site lock
# The lock is taken before the try block on purpose: if acquisition
# itself fails, the finally clause must not release a lock this run
# never held.
# ==================================================
acquire_site_lock(
    spark=spark,
    site_id=SITE_ID,
    locked_by=JOB_RUN_ID,
    ttl_minutes=90
)

try:
    # ==================================================
    # 1) Load surface patches (ground only)
    # ==================================================
    # NOTE(review): only patches with surfaceType == "ground" feed the
    # building-candidate rules below — confirm this is the intended input.
    df_patches = (
        spark.table(PATCHES_TABLE)
             .filter(F.col("siteId") == SITE_ID)
             .filter(F.col("surfaceType") == F.lit("ground"))
    )

    # Emptiness check via the DataFrame API: take(1) fetches at most one
    # row and avoids the RDD API (df.rdd), which is not available on
    # Unity Catalog standard/shared access-mode compute.
    if not df_patches.take(1):
        raise RuntimeError("No surface patches found")

    # ==================================================
    # 2) Estimate local ground reference per tile
    #    (low percentile ground height)
    # ==================================================
    # Approximate 10th percentile of patch minZ within each (siteId, tileId).
    df_ground_ref = (
        df_patches
        .groupBy("siteId", "tileId")
        .agg(
            F.expr("percentile_approx(minZ, 0.1)").alias("groundZ_ref")
        )
    )

    # Attach the per-tile reference to every patch and derive the two
    # quantities the candidate rules are expressed in.
    df = (
        df_patches
        .join(df_ground_ref, ["siteId", "tileId"], "left")
        .withColumn("heightAboveGround", F.col("meanZ") - F.col("groundZ_ref"))
        .withColumn("heightRange", F.col("maxZ") - F.col("minZ"))
    )

    # ==================================================
    # 3) Candidate filtering (rule-based)
    #    Keep patches with enough points, sufficiently above the tile's
    #    ground reference, and with a limited min-to-max Z spread.
    # ==================================================
    df_candidates = (
        df
        .filter(F.col("pointsUsed") >= F.lit(MIN_POINTS_USED))
        .filter(F.col("heightAboveGround") >= F.lit(MIN_HEIGHT_ABOVE_GROUND))
        .filter(F.col("heightRange") <= F.lit(MAX_HEIGHT_RANGE))
    )

    # ==================================================
    # 4) Compute final features
    #    patchId is exposed as buildingCandidateId; computedAt stamps
    #    the time of this run.
    # ==================================================
    df_features = (
        df_candidates
        .select(
            "siteId",
            "tileId",
            F.col("patchId").alias("buildingCandidateId"),
            "pointsUsed",
            "cellCount",
            "minZ",
            "meanZ",
            "maxZ",
            "heightAboveGround",
            "heightRange"
        )
        .withColumn("computedAt", F.current_timestamp())
    )

    # ==================================================
    # 5) Write latest snapshot by site
    #    replaceWhere limits the overwrite to rows matching this site's
    #    predicate, so other sites' rows in the table are left in place.
    # ==================================================
    # NOTE(review): the predicate is built by string interpolation; a
    # SITE_ID containing a single quote would produce a malformed or
    # wider predicate — confirm site ids are restricted upstream.
    (
        df_features.write
            .format("delta")
            .mode("overwrite")
            .option("replaceWhere", f"siteId = '{SITE_ID}'")
            .option("path", OUTPUT_PATH)
            .partitionBy("siteId", "tileId")
            .saveAsTable(OUTPUT_TABLE)
    )

    # The leading check mark was stored as mojibake ("âœ…", i.e. UTF-8
    # bytes decoded as cp1252); restored to the intended character.
    print("✅ features_building_candidates_v2 written successfully")

finally:
    # Always release the lock, whether the pipeline succeeded or raised.
    release_site_lock(
        spark=spark,
        site_id=SITE_ID,
        locked_by=JOB_RUN_ID
    )
