In [0]:
# Red background + logo + title (wider and taller)
# Renders the notebook's title banner: a red rounded box that holds the logo
# image (when it can be read) followed by the title text.
RED   = "#FF0000"
WHITE = "#FFFFFF"
title = "DQX Demo"

# Load the logo (no SVG fallback)
import base64, mimetypes, os

logo_path = "utils/cla_logo_white.png"
logo_html = ""  # stays empty when the logo cannot be read -> banner shows the title only
try:
    p = logo_path
    if p.startswith("dbfs:/"):
        # Rewrite dbfs:/x as /dbfs/x so the plain open() below can read it:
        # drop the "dbfs:" scheme and keep the leading slash of the path.
        p = "/dbfs" + p[len("dbfs:"):]
    with open(p, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("ascii")
except OSError as exc:
    # Still best effort (a missing logo must not break the notebook), but say so
    # instead of hiding the reason the image is absent.
    print(f"[banner] logo not loaded from {logo_path!r}: {exc}")
else:
    # Embed the image as a data URI so the HTML needs no external file.
    mime = mimetypes.guess_type(p)[0] or "image/png"
    logo_uri = f"data:{mime};base64,{b64}"
    logo_html = f"<img src='{logo_uri}' alt='Logo' style='height:48px; width:auto; display:block;'/>"

displayHTML(f"""
<div style="
  background:{RED} !important;
  padding: 28px 48px;  /* Increased padding for more height and width */
  border-radius: 16px; /* Slightly larger rounded corners */
  color: {WHITE};
  text-align:center;
  margin: 12px 0 20px 0;
  max-width: 100%; /* Full width */
  display: flex;
  justify-content: center;">
  <span style="display:inline-flex; align-items:center; gap:20px;">
    {logo_html}
    <span style="font-weight:800; font-size:36px; letter-spacing:.4px; font-family: system-ui, -apple-system, Segoe UI, Roboto, Arial, sans-serif;">
      {title}
    </span>
  </span>
</div>
""")


In [0]:
%pip install databricks-labs-dqx==0.8.0
%pip install dbldatagen

In [0]:
# Restart the notebook's Python process. This runs right after the %pip cell,
# presumably so the freshly installed packages are the ones imported below —
# confirm against the Databricks notebook-scoped library docs.
dbutils.library.restartPython()

In [0]:
# =====================================================================================
# CELL 1 — demo_spec.py  (Catalog/schema, table names, schemas, MEGASPEC)
# =====================================================================================
from utils.color import Color as C
from pyspark.sql.types import (
    StructType, StructField,
    StringType, BooleanType, DateType, TimestampType, DecimalType
)

# ──────────────────────────────────────────────────────────────────────────────
# Globals / conventions (Unity Catalog: CATALOG.SCHEMA.TABLE)
# ──────────────────────────────────────────────────────────────────────────────
# Unity Catalog coordinates; every demo table lives under <catalog>.<schema>.
DQX_CATALOG = "dq_dev"
DQX_SCHEMA = "demo"
DQX_DB = ".".join((DQX_CATALOG, DQX_SCHEMA))      # catalog.schema
QUARANTINE_TABLE = DQX_DB + ".demo_quarantine"    # sink for ERROR rows

# DQX adds _warning and _error by default; we'll stick with those.
DQ_RESULT_WARN_COL = "_warning"
DQ_RESULT_ERR_COL = "_error"

# Short keys of the dimension and fact tables, in the order they are listed everywhere.
_DIM_KEYS = ("employee", "customer", "project")
_FACT_KEYS = ("timesheet", "expense")

# Target row count per generated table, keyed by fully-qualified table name.
ROW_TARGETS = {
    f"{DQX_DB}.demo_{entity}": rows
    for entity, rows in (
        ("employee", 2_000),
        ("customer", 1_000),
        ("project", 600),
        ("timesheet", 350_000),
        ("expense", 120_000),
    )
}

# Canonical table names (full paths) + groupings
TABLES = {
    "catalog": DQX_CATALOG,
    "schema": DQX_DB,  # fully-qualified (catalog.schema)
}
TABLES.update({key: f"{DQX_DB}.demo_{key}" for key in _DIM_KEYS + _FACT_KEYS})
TABLES["quarantine"] = QUARANTINE_TABLE
# Groupings (each one is its own list of fully-qualified names)
TABLES["sources"] = [TABLES[key] for key in _DIM_KEYS + _FACT_KEYS]
TABLES["facts"] = [TABLES[key] for key in _FACT_KEYS]
TABLES["dims"] = [TABLES[key] for key in _DIM_KEYS]
TABLES["all"] = TABLES["sources"] + [QUARANTINE_TABLE]

# ──────────────────────────────────────────────────────────────────────────────
# Spark Structured Schemas
# ──────────────────────────────────────────────────────────────────────────────

# Each schema is assembled column by column as (name, type, nullable); the
# trailing comments list the values the generators below draw from.

demo_employee_schema = (
    StructType()
    .add("employee_id", StringType(), False)
    .add("full_name", StringType(), False)
    .add("department", StringType(), False)         # Consulting, Audit, Tax, IT, Ops
    .add("role", StringType(), False)               # Engineer, Analyst, Consultant, Manager, Support
    .add("cost_center", StringType(), True)         # CC-####
    .add("employment_status", StringType(), False)  # Active, Leave, Terminated
    .add("hire_date", DateType(), False)
    .add("termination_date", DateType(), True)
    .add("work_email", StringType(), True)
    .add("country_code", StringType(), True)        # US, CA, MX, GB, IN
)

demo_customer_schema = (
    StructType()
    .add("customer_id", StringType(), False)
    .add("customer_name", StringType(), False)
    .add("industry", StringType(), False)
    .add("country_code", StringType(), True)           # ISO-3166 alpha-2
    .add("status", StringType(), False)                # Active, Prospect, Inactive
    .add("onboarding_date", DateType(), False)
    .add("primary_contact_email", StringType(), True)
    .add("registration_number", StringType(), True)    # national reg / tax-like id
)

demo_project_schema = (
    StructType()
    .add("project_id", StringType(), False)
    .add("customer_id", StringType(), False)          # FK -> customer
    .add("project_name", StringType(), False)
    .add("status", StringType(), False)               # Planned, Active, OnHold, Closed
    .add("start_date", DateType(), False)
    .add("end_date", DateType(), True)
    .add("manager_employee_id", StringType(), True)   # FK -> employee
    .add("budget_amount", DecimalType(18, 2), True)
    .add("billing_model", StringType(), False)        # T&M, Fixed, Retainer
)

demo_timesheet_schema = (
    StructType()
    .add("timesheet_id", StringType(), False)
    .add("employee_id", StringType(), False)          # FK -> employee
    .add("project_id", StringType(), False)           # FK -> project
    .add("work_date", DateType(), False)
    .add("hours_worked", DecimalType(5, 2), False)    # 0–24
    .add("work_type", StringType(), False)            # Billable, NonBillable, Admin
    .add("source_system", StringType(), False)        # Workday, Jira, CSV
    .add("created_ts", TimestampType(), False)
)

demo_expense_schema = (
    StructType()
    .add("expense_id", StringType(), False)
    .add("employee_id", StringType(), False)          # FK -> employee
    .add("project_id", StringType(), True)            # FK -> project (nullable)
    .add("expense_date", DateType(), False)
    .add("category", StringType(), False)             # Meals, Travel, Supplies, Software, Other
    .add("amount", DecimalType(18, 2), False)
    .add("currency_code", StringType(), False)        # ISO-4217
    .add("merchant", StringType(), True)
    .add("receipt_attached", BooleanType(), False)
    .add("submission_ts", TimestampType(), False)
)

# Lookup from fully-qualified table name to its schema.
SCHEMAS_BY_TABLE = {
    TABLES[key]: schema
    for key, schema in (
        ("employee", demo_employee_schema),
        ("customer", demo_customer_schema),
        ("project", demo_project_schema),
        ("timesheet", demo_timesheet_schema),
        ("expense", demo_expense_schema),
    )
}

# ──────────────────────────────────────────────────────────────────────────────
# MEGASPEC (knobs & flags used by our checks)
# ──────────────────────────────────────────────────────────────────────────────

# Rule switches: each flag takes one of 'warn' | 'error' | 'off'.
_MEGASPEC_FLAGS = {
    "allow_weekend_billable": "warn",
}

# Thresholds and allowed-value lists.
_MEGASPEC_KNOBS = {
    "receipt_threshold": 75.00,               # >= threshold requires receipt
    "meal_limit": 150.00,                     # warn if Meals > limit
    "travel_limit": 500.00,                   # warn if Travel > limit
    "max_hours_per_day_error": 24.0,          # error if > this
    "hi_hours_warn": 12.0,                    # warn if > this
    "valid_country_codes": ["US", "CA", "MX", "GB", "IN"],
    "valid_currency_codes": ["USD", "CAD", "MXN", "GBP", "INR"],
    "valid_emp_status": ["Active", "Leave", "Terminated"],
    "valid_proj_status": ["Planned", "Active", "OnHold", "Closed"],
    "valid_billing_models": ["T&M", "Fixed", "Retainer"],
    "timeliness_hours_expense_max": 240,      # submission_ts within 10 days
    "timeliness_hours_timesheet_max": 72,     # created_ts within 3 days
    "unusual_multiplier": 1.5,                # 50% higher than avg → warn
}

MEGASPEC = {
    "schema": DQX_DB,
    "quarantine_table": QUARANTINE_TABLE,
    "flags": _MEGASPEC_FLAGS,
    "knobs": _MEGASPEC_KNOBS,
}

# One-line summary of what was loaded, then the per-table row targets
# (short table name only, thousands-separated counts).
print(f"\n[SPEC LOADED] schema={DQX_DB}  sources={len(TABLES['sources'])}  quarantine={TABLES['quarantine']}")
_row_target_summary = ", ".join(
    f"{full_name.rsplit('.', 1)[-1]}={rows:,}" for full_name, rows in ROW_TARGETS.items()
)
print("Row targets: " + _row_target_summary)

### Create Tables

In [0]:
# Drop all DQX tables with full catalog.schema.table names
# (the five source tables first, the quarantine sink last).
tables_to_drop = [
    f"{DQX_CATALOG}.{DQX_SCHEMA}.{name}"
    for name in (
        "demo_employee",
        "demo_customer",
        "demo_project",
        "demo_timesheet",
        "demo_expense",
        "demo_quarantine",
    )
]

# A failed drop is reported and skipped so the remaining tables are still attempted.
for t in tables_to_drop:
    try:
        spark.sql(f"DROP TABLE IF EXISTS {t}")
    except Exception as e:
        print(f"⚠️ Skipped: {t} -> {e}")
    else:
        print(f"✅ Dropped (if existed): {t}")

In [0]:
# =====================================================================================
# CELL 2 — Setup (UC catalog/schema, quarantine table, banner)
# =====================================================================================

from pyspark.sql import functions as F, Window
from uuid import uuid4
from datetime import date, datetime, timedelta
from decimal import Decimal

def banner(msg, color="neon_blue"):
    """Print ``msg`` as a headline framed by two 92-character ``═`` rules.

    The headline is wrapped in ``C.b`` ... ``C.r`` (presumably bold / reset
    codes from ``utils.color`` — confirm there). ``color`` names an attribute
    of ``C``; a name ``C`` does not define contributes an empty string.
    """
    rule = "═" * 92
    tint = getattr(C, color, "")
    headline = f"{C.b}{tint} {msg}{C.r}"
    # Leading newline separates the banner from whatever was printed before it.
    print("\n" + rule, headline, rule, sep="\n")

banner("SETUP: Use catalog/schema and create quarantine table", "aqua_blue")

# Select the catalog, create the schema if it is missing, then make it current.
for _setup_stmt in (
    f"USE CATALOG {DQX_CATALOG}",
    f"CREATE SCHEMA IF NOT EXISTS {DQX_DB}",
    f"USE {DQX_DB}",
):
    spark.sql(_setup_stmt)

# Fresh random identifier for this execution of the notebook.
run_id = str(uuid4())
print(f"{C.ivory}Using: catalog={DQX_CATALOG}  schema={DQX_SCHEMA}  (db={DQX_DB}){C.r}")
print(f"{C.ivory}Run ID: {C.bubblegum_pink}{run_id}{C.r}")

# Quarantine sink: one row per rejected record, with the original row kept as a JSON string.
_quarantine_ddl = f"""
CREATE TABLE IF NOT EXISTS {QUARANTINE_TABLE} (
  _source_table     STRING,
  _rule_id          STRING,
  _level            STRING,
  _reason           STRING,
  _run_id           STRING,
  _event_ts         TIMESTAMP,
  _row_payload_json STRING
) USING delta
"""
spark.sql(_quarantine_ddl)
print(f"{C.ivory}Quarantine table ready: {C.golden_yellow}{QUARANTINE_TABLE}{C.r}")

In [0]:
# =====================================================================================
# CELLS 3–7 — DataGen for all tables (deterministic demo rows included; avoid random "bad" noise)
# =====================================================================================
from dbldatagen import DataGenerator
from utils.display import show_df  # << use your helper for nice display
from datetime import date, datetime, timedelta
from decimal import Decimal
from uuid import uuid4

# -------------------------------------------------------------------------------------
# CELL 3 — Employees
# -------------------------------------------------------------------------------------
_emp_table = TABLES["employee"]
_emp_knobs = MEGASPEC["knobs"]
banner(f"BUILD (Datagen): {_emp_table}", "sky_blue")
n_emp = ROW_TARGETS[_emp_table]

# SQL expressions for the columns derived from the generator's `id` column.
_emp_id_sql = "concat('E', lpad(cast(id + 1001 as string), 4, '0'))"
_emp_name_sql = "concat('Emp ', cast(id as string))"
_emp_cost_center_sql = "concat('CC-', lpad(cast(cast(rand()*10000 as int) as string), 4, '0'))"
_emp_email_sql = "concat('emp', cast(id as string), '@company.com')"

# NOTE(review): termination_date is generated with percentNulls=1.0 (always null)
# while employment_status gives 'Terminated' a 0.07 weight, so generated
# terminated employees carry no end date — confirm this is intended given the
# "avoid random bad noise" goal stated for this cell.
emp_gen = (
    DataGenerator(spark, name="demo_employee_gen", rows=n_emp, partitions=8)
    .withIdOutput()
    .withColumn("employee_id", "string", expr=_emp_id_sql)
    .withColumn("full_name", "string", expr=_emp_name_sql)
    .withColumn(
        "department",
        values=["Consulting", "Audit", "Tax", "IT", "Ops"],
        weights=[0.35, 0.20, 0.20, 0.15, 0.10],
    )
    .withColumn("role", values=["Engineer", "Analyst", "Consultant", "Manager", "Support"])
    .withColumn("cost_center", "string", expr=_emp_cost_center_sql, percentNulls=0.02)
    .withColumn(
        "employment_status",
        values=_emp_knobs["valid_emp_status"],
        weights=[0.90, 0.03, 0.07],
    )
    .withColumn("hire_date", "date", minValue="2018-01-01", maxValue="2025-08-01")
    .withColumn(
        "termination_date", "date",
        minValue="2018-02-01", maxValue="2030-12-31", percentNulls=1.0,
    )
    .withColumn("work_email", "string", expr=_emp_email_sql)
    .withColumn("country_code", values=_emp_knobs["valid_country_codes"])
)
# `id` was only needed to derive the other columns; it is not part of the table.
emp_df = emp_gen.build().drop("id")

# Deterministic demo employees (specific rule hits); columns follow demo_employee_schema order.
demo_rows_emp = [
    # terminated, termination_date missing
    (
        "E9999", "Terminated_NoEnd", "Consulting", "Analyst", "CC-0001",
        "Terminated", date(2023, 1, 15), None, "emp9999@company.com", "US",
    ),
    # terminated, termination_date earlier than hire_date
    (
        "E9998", "Terminated_BadEnd", "Audit", "Engineer", "CC-0002",
        "Terminated", date(2023, 6, 1), date(2023, 5, 15), "emp9998@company.com", "US",
    ),
    # non-company e-mail domain and a country code outside valid_country_codes
    (
        "E9997", "ExtEmail_InvalidCC", "IT", "Manager", "CC-0003",
        "Active", date(2021, 3, 1), None, "someone@gmail.com", "XX",
    ),
]
emp_demo_df = spark.createDataFrame(demo_rows_emp, schema=SCHEMAS_BY_TABLE[_emp_table])
emp_df = emp_df.unionByName(emp_demo_df)

# Overwrite the Delta table, then report the stored row count and preview 20 rows.
emp_df.write.format("delta").mode("overwrite").saveAsTable(_emp_table)
_emp_written = spark.table(_emp_table)
print(f"{C.ivory}Wrote: {C.golden_yellow}{_emp_table}{C.r}  rows={_emp_written.count():,}")
show_df(_emp_written, n=20, truncate=False)

# -------------------------------------------------------------------------------------
# CELL 4 — Customers
# -------------------------------------------------------------------------------------
_cust_table = TABLES["customer"]
banner(f"BUILD (Datagen): {_cust_table}", "sky_blue")
n_cust = ROW_TARGETS[_cust_table]

# SQL expressions for the columns derived from the generator's `id` column.
_cust_id_sql = "concat('C', lpad(cast(id + 5001 as string), 4, '0'))"
_cust_name_sql = "concat('Customer ', cast(id as string))"
_cust_email_sql = "concat('contact', cast(id as string), '@example.com')"
_cust_reg_sql = "concat('RN-', lpad(cast(cast(rand()*100000000 as int) as string), 8, '0'))"

cust_gen = (
    DataGenerator(spark, name="demo_customer_gen", rows=n_cust, partitions=8)
    .withIdOutput()
    .withColumn("customer_id", "string", expr=_cust_id_sql)
    .withColumn("customer_name", "string", expr=_cust_name_sql)
    .withColumn(
        "industry",
        values=["Technology", "Healthcare", "Finance", "Retail", "Manufacturing"],
        weights=[0.30, 0.20, 0.20, 0.20, 0.10],
    )
    .withColumn("country_code", values=MEGASPEC["knobs"]["valid_country_codes"])
    .withColumn(
        "status",
        values=["Active", "Prospect", "Inactive"],
        weights=[0.75, 0.15, 0.10],
    )
    .withColumn("onboarding_date", "date", minValue="2019-01-01", maxValue="2025-08-01")
    .withColumn("primary_contact_email", "string", expr=_cust_email_sql)
    .withColumn("registration_number", "string", expr=_cust_reg_sql)
)
# `id` was only needed to derive the other columns; it is not part of the table.
cust_df = cust_gen.build().drop("id")

# Deterministic dup registration among Active: two Active customers sharing RN-00001234.
demo_rows_cust = [
    ("C9999", "DupReg A", "Technology", "US", "Active", date(2024, 1, 1), "a@example.com", "RN-00001234"),
    ("C9998", "DupReg B", "Technology", "US", "Active", date(2024, 1, 2), "b@example.com", "RN-00001234"),
]
cust_demo_df = spark.createDataFrame(demo_rows_cust, schema=SCHEMAS_BY_TABLE[_cust_table])
cust_df = cust_df.unionByName(cust_demo_df)

# Overwrite the Delta table, then report the stored row count and preview 20 rows.
cust_df.write.format("delta").mode("overwrite").saveAsTable(_cust_table)
_cust_written = spark.table(_cust_table)
print(f"{C.ivory}Wrote: {C.golden_yellow}{_cust_table}{C.r}  rows={_cust_written.count():,}")
show_df(_cust_written, n=20, truncate=False)

# -------------------------------------------------------------------------------------
# CELL 5 — Projects
# -------------------------------------------------------------------------------------
# Builds the project table: generated rows whose customer_id / manager_employee_id
# are drawn from the employee and customer tables, plus three hand-written demo
# rows, then overwrites the Delta table and previews it.
banner(f"BUILD (Datagen): {TABLES['project']}", "sky_blue")
n_proj = ROW_TARGETS[TABLES["project"]]

# collect() returns rows in no guaranteed order. Sort the keys so that the
# anchors used by the demo rows below (first employee / first customer) and the
# order of the value lists handed to the generator are the same on every run.
emp_ids  = sorted(r[0] for r in spark.table(TABLES["employee"]).select("employee_id").collect())
cust_ids = sorted(r[0] for r in spark.table(TABLES["customer"]).select("customer_id").collect())
mgr_id   = emp_ids[0] if emp_ids else None
# Existing customer the demo rows attach to; falls back to a literal id if the table is empty.
_demo_cust_id = cust_ids[0] if cust_ids else "C9999"

# NOTE(review): start_date and end_date are drawn from independent, overlapping
# ranges, so some generated rows will have end_date earlier than start_date —
# confirm that is acceptable given the "avoid random bad noise" goal.
proj_gen = (DataGenerator(spark, name="demo_project_gen", rows=n_proj, partitions=8)
            .withIdOutput()
            .withColumn("project_id", "string", expr="concat('P', lpad(cast(id + 10001 as string), 5, '0'))")
            .withColumn("customer_id", values=cust_ids)
            .withColumn("project_name", "string", expr="concat('Project ', cast(id as string))")
            .withColumn("status", values=MEGASPEC["knobs"]["valid_proj_status"], weights=[0.15,0.55,0.10,0.20])
            .withColumn("start_date", "date", minValue="2020-01-01", maxValue="2025-03-01")
            .withColumn("end_date", "date", minValue="2020-02-01", maxValue="2028-12-31", percentNulls=0.35)
            .withColumn("manager_employee_id", values=emp_ids, percentNulls=0.03)
            .withColumn("budget_amount", "decimal(18,2)", minValue=10000, maxValue=2000000)
            .withColumn("billing_model", values=MEGASPEC["knobs"]["valid_billing_models"], weights=[0.55,0.35,0.10])
)
# `id` was only needed to derive the other columns; it is not part of the table.
proj_df = proj_gen.build().drop("id")

# Deterministic demo projects (specific rule hits); columns follow demo_project_schema order.
demo_rows_proj = [
    # end_date (2024-04-15) earlier than start_date (2024-05-01)
    ("P99998", _demo_cust_id, "Demo_EndBeforeStart", "Active",
     date(2024,5,1), date(2024,4,15), mgr_id, Decimal("250000.00"), "T&M"),
    # customer_id "C0000" is not an id the customer cell produces
    ("P99999", "C0000", "Demo_MissingCustomer", "Active",
     date(2024,6,1), None, mgr_id, Decimal("150000.00"), "Fixed"),
    # status Closed but no end_date
    ("P99997", _demo_cust_id, "Demo_ClosedNoEnd", "Closed",
     date(2024,2,1), None, mgr_id, Decimal("100000.00"), "Retainer"),
]
proj_demo_df = spark.createDataFrame(demo_rows_proj, schema=SCHEMAS_BY_TABLE[TABLES["project"]])
proj_df = proj_df.unionByName(proj_demo_df)

# Overwrite the Delta table, then report the stored row count and preview 20 rows.
proj_df.write.format("delta").mode("overwrite").saveAsTable(TABLES["project"])
print(f"{C.ivory}Wrote: {C.golden_yellow}{TABLES['project']}{C.r}  rows={spark.table(TABLES['project']).count():,}")
show_df(spark.table(TABLES["project"]), n=20, truncate=False)

# -------------------------------------------------------------------------------------
# CELL 6 — Timesheets  (enforce DECIMAL(5,2) + overwriteSchema)
# -------------------------------------------------------------------------------------
# Builds the timesheet table: generated rows referencing existing employees and
# projects, plus three hand-written demo rows, cast to the canonical schema and
# written with overwriteSchema so the stored column types match it.
banner(f"BUILD (Datagen): {TABLES['timesheet']}", "sky_blue")
n_ts = ROW_TARGETS[TABLES["timesheet"]]

# collect() returns rows in no guaranteed order. Sort by key so that the demo
# anchors (first employee / first project) and the order of the value lists
# handed to the generator are the same on every run.
emp_ids = sorted(r[0] for r in spark.table(TABLES["employee"]).select("employee_id").collect())
proj_keys = sorted(
    spark.table(TABLES["project"]).select("project_id","start_date","end_date").collect(),
    key=lambda r: r["project_id"],
)
project_id_list = [r["project_id"] for r in proj_keys]
# project_id -> (start_date, end_date), used to place a demo row before the project start.
proj_map = {r["project_id"]: (r["start_date"], r["end_date"]) for r in proj_keys}

ts_gen = (DataGenerator(spark, name="demo_timesheet_gen", rows=n_ts, partitions=48)
          .withColumn("timesheet_id", "string", expr="uuid()")
          .withColumn("employee_id", values=emp_ids)
          .withColumn("project_id", values=project_id_list)
          .withColumn("work_date", "date", minValue="2024-01-01", maxValue="2025-08-10")
          .withColumn("hours_worked", "decimal(5,2)", minValue=0.00, maxValue=12.00)
          .withColumn("work_type", values=["Billable","NonBillable","Admin"], weights=[0.70,0.25,0.05])
          .withColumn("source_system", values=["Workday","Jira","CSV"], weights=[0.70,0.20,0.10])
          .withColumn("created_ts", "timestamp")
)
ts_df = ts_gen.build()

# created_ts near work_date (0–72 hours): work_date plus 0–2 days, 0–23 hours, 0–59 minutes.
ts_df = ts_df.withColumn(
    "created_ts",
    F.expr("timestamp(work_date) + make_interval(0,0,0,cast(rand()*3 as int), cast(rand()*24 as int), cast(rand()*60 as int), 0)")
)

# Deterministic demo timesheets
chosen_emp = emp_ids[0] if emp_ids else None
chosen_proj = project_id_list[0] if project_id_list else None
if chosen_emp and chosen_proj:
    p_start, _ = proj_map.get(chosen_proj, (date(2024,1,1), None))
    demo_rows_ts = [
        # work_date three days before the project's start_date
        (str(uuid4()), chosen_emp, chosen_proj, p_start - timedelta(days=3),
         Decimal("8.00"), "Billable", "Workday", datetime.utcnow()),
        # two entries for the same employee / project / work_date (differ only by id),
        # each created ~10 days after the work_date
        (str(uuid4()), chosen_emp, chosen_proj, date.today() - timedelta(days=10),
         Decimal("8.00"), "Billable", "Workday", datetime.utcnow()),
        (str(uuid4()), chosen_emp, chosen_proj, date.today() - timedelta(days=10),
         Decimal("8.00"), "Billable", "Workday", datetime.utcnow()),
    ]
    ts_demo_df = spark.createDataFrame(demo_rows_ts, schema=SCHEMAS_BY_TABLE[TABLES["timesheet"]])
    ts_df = ts_df.unionByName(ts_demo_df)

# Enforce canonical schema & write with overwriteSchema.
# One select casts every column to its declared type and fixes the column order,
# instead of stacking one withColumn projection per field.
ts_schema = SCHEMAS_BY_TABLE[TABLES["timesheet"]]
ts_df = ts_df.select([F.col(f.name).cast(f.dataType).alias(f.name) for f in ts_schema])

ts_df.write.format("delta") \
     .mode("overwrite") \
     .option("overwriteSchema", "true") \
     .saveAsTable(TABLES["timesheet"])

print(f"{C.ivory}Wrote: {C.golden_yellow}{TABLES['timesheet']}{C.r}  rows={spark.table(TABLES['timesheet']).count():,}")
show_df(spark.table(TABLES["timesheet"]), n=20, truncate=False)

# -------------------------------------------------------------------------------------
# CELL 7 — Expenses
# -------------------------------------------------------------------------------------
# Builds the expense table: generated rows referencing existing employees and
# (optionally) projects, plus ten hand-written demo rows for one employee, then
# overwrites the Delta table and previews it.
banner(f"BUILD (Datagen): {TABLES['expense']}", "sky_blue")
n_exp = ROW_TARGETS[TABLES["expense"]]
# collect() returns rows in no guaranteed order. Sort the keys so that the demo
# anchors (first employee / first project) and the order of the value lists
# handed to the generator are the same on every run.
emp_ids = sorted(r[0] for r in spark.table(TABLES["employee"]).select("employee_id").collect())
proj_ids = sorted(r[0] for r in spark.table(TABLES["project"]).select("project_id").collect())

exp_gen = (DataGenerator(spark, name="demo_expense_gen", rows=n_exp, partitions=24)
           .withColumn("expense_id", "string", expr="uuid()")
           .withColumn("employee_id", values=emp_ids)
           .withColumn("project_id", values=proj_ids, percentNulls=0.25)
           .withColumn("expense_date", "date", minValue="2024-01-01", maxValue="2025-08-10")
           .withColumn("category", values=["Meals","Travel","Supplies","Software","Other"], weights=[0.35,0.25,0.20,0.10,0.10])
           .withColumn("amount", "decimal(18,2)", minValue=5.00, maxValue=5000.00)
           .withColumn("currency_code", values=MEGASPEC["knobs"]["valid_currency_codes"])
           .withColumn("merchant", values=["Uber","Lyft","Delta","AA","Staples","BestBuy","Amazon","LocalCafe","HotelCo","SoftwareCo"])
           .withColumn("receipt_attached", "boolean", expr="rand() > 0.08")
           .withColumn("submission_ts", "timestamp")
)
exp_df = exp_gen.build()

# submission_ts near expense_date (0–240 hours): expense_date plus 0–9 days, 0–23 hours, 0–59 minutes.
exp_df = exp_df.withColumn(
    "submission_ts",
    F.expr("timestamp(expense_date) + make_interval(0,0,0,cast(rand()*10 as int), cast(rand()*24 as int), cast(rand()*60 as int), 0)")
)

# Deterministic demo expenses
# home_ccy is the currency matching the demo employee's country (USD when the
# country is missing or unmapped); away_ccy is a different currency used for one row.
if emp_ids:
    demo_emp = emp_ids[0]
    emp_cc = (spark.table(TABLES["employee"]).where(F.col("employee_id")==demo_emp)
              .select("country_code").first())
    home_cc = emp_cc["country_code"] if emp_cc and emp_cc["country_code"] else "US"
    home_ccy = {"US":"USD","CA":"CAD","MX":"MXN","GB":"GBP","IN":"INR"}.get(home_cc, "USD")
    away_ccy = "GBP" if home_ccy != "GBP" else "USD"
else:
    demo_emp = "E9999"
    home_ccy = "USD"
    away_ccy = "GBP"

demo_proj = proj_ids[0] if proj_ids else None
today = date.today()
# NOTE(review): datetime.utcnow() yields a naive timestamp taken at build time,
# so every demo row is "submitted" now regardless of its expense_date — confirm
# that is what the timeliness check is meant to see.
demo_rows_exp = [
    # four ordinary small meals in the home currency, 14–20 days ago
    (str(uuid4()), demo_emp, demo_proj, today - timedelta(days=20), "Meals", Decimal("15.00"), home_ccy, "LocalCafe", False, datetime.utcnow()),
    (str(uuid4()), demo_emp, demo_proj, today - timedelta(days=18), "Meals", Decimal("22.00"), home_ccy, "LocalCafe", True,  datetime.utcnow()),
    (str(uuid4()), demo_emp, demo_proj, today - timedelta(days=16), "Meals", Decimal("18.00"), home_ccy, "LocalCafe", True,  datetime.utcnow()),
    (str(uuid4()), demo_emp, demo_proj, today - timedelta(days=14), "Meals", Decimal("20.50"), home_ccy, "LocalCafe", True,  datetime.utcnow()),
    # one large meal in the away currency
    (str(uuid4()), demo_emp, demo_proj, today - timedelta(days=1),  "Meals", Decimal("600.00"), away_ccy, "LocalCafe", True,  datetime.utcnow()),
    # three same-day, same-merchant meals without receipt (two with identical amounts)
    (str(uuid4()), demo_emp, demo_proj, today - timedelta(days=2),  "Meals", Decimal("24.99"), home_ccy, "LocalCafe", False, datetime.utcnow()),
    (str(uuid4()), demo_emp, demo_proj, today - timedelta(days=2),  "Meals", Decimal("24.99"), home_ccy, "LocalCafe", False, datetime.utcnow()),
    (str(uuid4()), demo_emp, demo_proj, today - timedelta(days=2),  "Meals", Decimal("26.99"), home_ccy, "LocalCafe", False, datetime.utcnow()),
    # two identical supplies purchases on the same day
    (str(uuid4()), demo_emp, demo_proj, today - timedelta(days=5),  "Supplies", Decimal("129.00"), home_ccy, "Amazon", True,  datetime.utcnow()),
    (str(uuid4()), demo_emp, demo_proj, today - timedelta(days=5),  "Supplies", Decimal("129.00"), home_ccy, "Amazon", True,  datetime.utcnow()),
]
exp_demo_df = spark.createDataFrame(demo_rows_exp, schema=SCHEMAS_BY_TABLE[TABLES["expense"]])
exp_df = exp_df.unionByName(exp_demo_df)

# Overwrite the Delta table, then report the stored row count and preview 20 rows.
exp_df.write.format("delta").mode("overwrite").saveAsTable(TABLES["expense"])
print(f"{C.ivory}Wrote: {C.golden_yellow}{TABLES['expense']}{C.r}  rows={spark.table(TABLES['expense']).count():,}")
show_df(spark.table(TABLES["expense"]), n=20, truncate=False)

---