In [0]:
# 01_ingest_laz_raw - Job Entrypoint
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, LongType
from datetime import datetime
import laspy
import pandas as pd

# ============================================================
# Step 0: Read Job Parameters
# ============================================================
# Declare the widgets (empty defaults) so the values can be read with
# dbutils.widgets.get() below.
dbutils.widgets.text("landing_path", "", "Landing Path (LAZ file)")
dbutils.widgets.text("siteId", "", "Site ID")
dbutils.widgets.text("ingestRunId", "", "Ingest Run ID")

# Strip surrounding whitespace so a blank-looking value counts as missing and
# a stray space/newline cannot leak into the file path or the partition keys.
landing_path = dbutils.widgets.get("landing_path").strip()
site_id = dbutils.widgets.get("siteId").strip()
ingest_run_id = dbutils.widgets.get("ingestRunId").strip()

# Validate required parameters, naming exactly the ones that are absent
# instead of listing all three regardless of which one is the problem.
missing_params = [
    name
    for name, value in (
        ("landing_path", landing_path),
        ("siteId", site_id),
        ("ingestRunId", ingest_run_id),
    )
    if not value
]
if missing_params:
    raise ValueError(f"Missing required job parameters: {', '.join(missing_params)}")

print(f"üì• Landing Path: {landing_path}")
print(f"üèóÔ∏è  Site ID: {site_id}")
print(f"üîÑ Ingest Run ID: {ingest_run_id}")

# ============================================================
# Step 1: Convert LAZ to Parquet
# ============================================================
# Point dimensions copied out of the LAZ file, in output column order.
DEFAULT_COLUMNS = [
    "x", "y", "z",
    "intensity",
    "classification",
    "return_number", "number_of_returns",
    "gps_time",
]

print("\nüîÑ Converting LAZ to Parquet...")

# Parse the file at landing_path with laspy.
# NOTE(review): presumably this loads every point into driver memory —
# confirm this is acceptable for the largest expected files.
laz_file = laspy.read(landing_path)

# Collect one entry per required dimension, keyed by column name.
point_data = {}
for col in DEFAULT_COLUMNS:
    # Guard clause: stop at the first dimension the file does not expose.
    if not hasattr(laz_file, col):
        raise ValueError(f"LAZ file missing required column: {col}")
    point_data[col] = getattr(laz_file, col)

# Column dict -> pandas DataFrame -> Spark DataFrame.
pdf = pd.DataFrame(data=point_data)
df = spark.createDataFrame(pdf)

# ============================================================
# Step 2: Add Metadata Columns
# ============================================================
# Captured once so every row of this run carries the same ingest time.
ingest_timestamp = datetime.now()

# Tag every point with the site, the ingest run, and the ingest time.
df = (
    df.withColumn("siteId", F.lit(site_id))
      .withColumn("ingestRunId", F.lit(ingest_run_id))
      .withColumn("ingestTime", F.lit(ingest_timestamp).cast("timestamp"))
)

# df was built directly from pdf and has not been filtered yet, so the pandas
# row count is the same number df.count() would return — without launching a
# Spark job just to format a log line.
print(f"‚úÖ Converted {len(pdf):,} points from LAZ to DataFrame")

# ============================================================
# Step 3: Data Quality Checks
# ============================================================
# Required schema = the point dimensions read from the LAZ file plus the
# metadata columns added in Step 2. Derived from DEFAULT_COLUMNS so the two
# lists cannot drift apart.
required_cols = [*DEFAULT_COLUMNS, "siteId", "ingestRunId", "ingestTime"]

# Look the column names up once; keep required_cols order in the error message.
available_cols = set(df.columns)
missing = [c for c in required_cols if c not in available_cols]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Basic sanity filter (keep it light; still considered 'raw safe'):
# drop any point whose x, y or z coordinate is null.
df = df.filter(
    F.col("x").isNotNull()
    & F.col("y").isNotNull()
    & F.col("z").isNotNull()
)

# ============================================================
# Step 4: Write to External Delta Table
# ============================================================
# Storage location of the table data and the table's catalog name.
raw_path = "abfss://raw@trimblegeospatialdemo.dfs.core.windows.net/points"
table_name = "main.demo.points_raw"

print(f"\nüíæ Writing to Delta table: {table_name}")

# Configure the writer step by step: Delta format, append to existing data,
# partitioned by site and ingest run, stored at raw_path.
delta_writer = df.write.format("delta")
delta_writer = delta_writer.mode("append")
delta_writer = delta_writer.partitionBy("siteId", "ingestRunId")
delta_writer = delta_writer.option("path", raw_path)

# Execute the write and register/append to the table.
delta_writer.saveAsTable(table_name)

print("‚úÖ Write complete. Verifying...")

# ============================================================
# Step 5: Verify Write
# ============================================================
# Read the table back so the checks below reflect what was actually stored,
# not the in-memory DataFrame that was handed to the writer.
written = spark.read.table(table_name)

# Rows of the table that carry this run's siteId / ingestRunId.
# NOTE(review): the write uses append mode, so if the same ingestRunId was
# ingested before, those earlier rows are included here as well.
this_run_rows = written.filter(
    (F.col("siteId") == site_id) &
    (F.col("ingestRunId") == ingest_run_id)
)

# Verify this run landed: show the (siteId, ingestRunId) pair as read back
# from the table; an empty result means nothing was found for this run.
this_run_rows.select("siteId", "ingestRunId").distinct().show(truncate=False)

this_run_count = this_run_rows.count()

print(f"‚úÖ This run ingested: {this_run_count:,} rows")
print(f"‚úÖ Raw Delta table now contains {written.count():,} rows total.")