In [0]:
# Databricks notebook: silver_optimize
# Path: /Workspace/Users/you/silver_optimize
#
# Widgets:
#  - ingestion_batch_id (optional)  -> if provided, optimize will try to limit to that partition when possible
#
# Purpose:
#  - Run Delta OPTIMIZE to compact small files into larger files (target ~256MB)
#  - Apply Z-ORDER on sensible non-partition columns to improve read locality for common query patterns
#  - Be idempotent and safe to run repeatedly; support per-batch (ingestion) scoping

from datetime import datetime
import json
from pyspark.sql import functions as F

# Switch this Spark session to the legacy (pre-Spark-3.0) datetime parsing/formatting policy.
# NOTE(review): nothing in this notebook parses or formats date strings, so this
# setting looks unnecessary here — confirm whether it is needed before removing it.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Z-ORDER candidates per Silver table, listed in order of preference.
# choose_zorder_column() picks the first candidate that exists in the table and
# is not a partition column; if no candidate qualifies, the table gets a plain
# OPTIMIZE without ZORDER.
recommended_zorder = {
    "census.silver.dim_person": [
        "canonical_person_id",
        "person_surrogate_id",
        "last_updated",
    ],
    "census.silver.dim_person_history": [
        "canonical_person_id",
        "person_surrogate_id",
        "census_year",
    ],
    "census.silver.dim_household": [
        "household_id",
        "median_household_income",
        "geoid",
    ],
    "census.silver.lineage": [
        "canonical_person_id",
        "ingestion_batch_id",
    ],
}

# Tables to optimize in the Silver layer: every table that has a Z-ORDER
# recommendation above, processed in the same order.
tables_to_optimize = list(recommended_zorder)

def normalize_batch_id(raw):
    """Return the widget value trimmed of surrounding whitespace, or None if blank.

    Args:
        raw: The value read from the ``ingestion_batch_id`` widget (a string),
            or None.

    Returns:
        The trimmed batch id, or None when *raw* is None, empty, or
        whitespace-only.
    """
    if raw is None:
        return None
    # Trimming matters: the value is later compared for equality in a WHERE
    # clause, so a padded id would silently match nothing.
    return raw.strip() or None


# Optional widget: a blank or missing value means "optimize the whole table".
try:
    ingestion_batch_id = normalize_batch_id(dbutils.widgets.get("ingestion_batch_id"))
except Exception:
    # Deliberately broad: the widget may not be defined (or dbutils may not be
    # available at all); either way the run simply proceeds unscoped.
    ingestion_batch_id = None

# Run bookkeeping: start time (naive UTC) and the per-table result records
# that the optimize loop appends to.
start_ts = datetime.utcnow()
results = []

def table_exists(tbl):
    """Return True if *tbl* exists in the Spark catalog, False otherwise.

    The check is best-effort: if the catalog lookup itself fails, the failure
    is printed and the table is treated as absent so the run can continue.

    Args:
        tbl: Table name to look up (as passed to ``spark.catalog.tableExists``).

    Returns:
        The result of ``spark.catalog.tableExists(tbl)``, or False if the
        lookup raised.
    """
    try:
        return spark.catalog.tableExists(tbl)
    except Exception as exc:
        # Callers record a False result as "not_exists"; print the real cause so
        # a permission or connectivity problem is not mistaken for a missing table.
        print(f"WARNING: could not check existence of table {tbl}: {exc}")
        return False

# Column names assumed to be partition columns when the catalog metadata does
# not expose a per-column partition flag.
FALLBACK_PARTITION_COLUMNS = frozenset(
    {"geoid", "ingestion_batch_id", "_ingestion_batch_id", "partition"}
)


def pick_zorder_column(candidates, columns):
    """Return the first candidate that is a non-partition column, or None.

    Args:
        candidates: Column names to try, in order of preference.
        columns: Column metadata objects, each with a ``name`` attribute and,
            where the Spark version provides it, an ``isPartition`` flag.

    Returns:
        The first name in *candidates* that is present in *columns* and is not
        a partition column; None if no candidate qualifies.
    """
    columns = list(columns)
    if all(hasattr(col, "isPartition") for col in columns):
        partition_names = {col.name for col in columns if col.isPartition}
    else:
        # Partition metadata is unavailable: fall back to the conventional
        # partition column names rather than risk Z-ordering on one of them.
        partition_names = FALLBACK_PARTITION_COLUMNS

    available = {col.name for col in columns}
    for cand in candidates:
        if cand in available and cand not in partition_names:
            return cand
    return None


def choose_zorder_column(tbl):
    """Return the Z-ORDER column to use for *tbl*, or None if none is suitable.

    Looks up the table's columns once and returns the first entry of
    ``recommended_zorder[tbl]`` that exists in the table and is not a
    partition column. Tables without a recommendation yield None.

    Args:
        tbl: Table name, as used for ``spark.catalog.listColumns`` and as a key
            of ``recommended_zorder``.

    Returns:
        The chosen column name, or None.
    """
    # Fetch the column metadata once; it does not change between candidates.
    columns = spark.catalog.listColumns(tbl)
    return pick_zorder_column(recommended_zorder.get(tbl, []), columns)

def _quote_identifier(name):
    """Backtick-quote a column name for Spark SQL, doubling embedded backticks."""
    return "`" + name.replace("`", "``") + "`"


def _sql_string_literal(value):
    """Render *value* as a single-quoted Spark SQL string literal.

    Backslashes and single quotes are backslash-escaped so that a widget value
    containing a quote cannot terminate the literal early.
    NOTE(review): assumes the default parser setting
    (spark.sql.parser.escapedStringLiterals=false) — confirm for this workspace.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


def _find_batch_partition_column(columns):
    """Return the ingestion-batch column of a table if it is a partition column.

    OPTIMIZE ... WHERE only accepts predicates on partition columns, so a batch
    column that is not a partition column cannot be used to scope the run.
    Returns the column name with its actual casing, or None.
    """
    for col in columns:
        is_batch_col = col.name.lower() in ("ingestion_batch_id", "_ingestion_batch_id")
        if is_batch_col and getattr(col, "isPartition", False):
            return col.name
    return None


# Optimize each Silver table in turn. One failing table never aborts the run:
# every table ends up with exactly one record in `results` (OK, SKIP or ERROR).
for tbl in tables_to_optimize:
    if not table_exists(tbl):
        results.append({"table": tbl, "status": "SKIP", "reason": "not_exists"})
        continue

    try:
        # Scope to the requested ingestion batch only when the table is
        # partitioned by the batch column; otherwise optimize the whole table.
        where_clause = ""
        if ingestion_batch_id:
            batch_col = _find_batch_partition_column(spark.catalog.listColumns(tbl))
            if batch_col:
                where_clause = (
                    f" WHERE {_quote_identifier(batch_col)}"
                    f" = {_sql_string_literal(ingestion_batch_id)}"
                )

        # Z-ORDER candidate for this table (None -> plain compaction only).
        zcol = choose_zorder_column(tbl)

        sql_opt = f"OPTIMIZE {tbl}{where_clause}"
        if zcol:
            sql_opt += f" ZORDER BY ({_quote_identifier(zcol)})"

        # Log and execute.
        print(f"Running OPTIMIZE for table {tbl}; SQL -> {sql_opt}")
        spark.sql(sql_opt)
        results.append({
            "table": tbl,
            "status": "OK",
            "zorder": zcol,
            # Batch id is reported only when the run was actually scoped to it.
            "where": (ingestion_batch_id if where_clause else None),
        })
    except Exception as e:
        # Boundary handler: if OPTIMIZE fails for any reason (e.g. missing
        # privilege), record the error and move on to the next table.
        results.append({"table": tbl, "status": "ERROR", "error": str(e)})

# Summarise the run: overall status marker, start/end timestamps in ISO format,
# and the per-table result records. The summary is printed (pretty) for the
# notebook output and returned (compact) as the notebook exit value.
end_ts = datetime.utcnow()
out = dict(
    status="DONE",
    start=start_ts.isoformat(),
    end=end_ts.isoformat(),
    results=results,
)
pretty_summary = json.dumps(out, indent=2)
print(pretty_summary)
dbutils.notebook.exit(json.dumps(out))
