In [0]:
# Declare the two text widgets that parameterise this notebook run.
# RunType: the value "once" selects a single batch run.
dbutils.widgets.text(
    name="RunType",
    defaultValue="once",
    label="Set once to run as a batch",
)
# ProcessingTime: the microbatch interval.
dbutils.widgets.text(
    name="ProcessingTime",
    defaultValue="5 seconds",
    label="Set the microbatch interval",
)

In [0]:
# Read the current widget values into notebook-level variables.
# processing_time keeps the raw interval string entered in the widget.
processing_time = dbutils.widgets.get("ProcessingTime")
# once is True only when RunType is exactly the string "once".
once = ("once" == dbutils.widgets.get("RunType"))

In [0]:
# Announce which mode the pipeline is starting in, as selected by `once`.
# The streaming message is only formatted when `once` is False.
print(
    "Starting TFL pipeline in batch mode..."
    if once
    else f"Starting TFL pipeline in streaming mode with microbatch = {processing_time}..."
)

In [0]:
# Spark session tuning, applied in the order listed:
#   - shuffle partitions set to the SparkContext's default parallelism
#   - Delta optimized writes and auto-compaction switched on
#   - RocksDB-backed state store provider for streaming state
for _conf_key, _conf_value in (
    ("spark.sql.shuffle.partitions", sc.defaultParallelism),
    ("spark.databricks.delta.optimizeWrite.enabled", True),
    ("spark.databricks.delta.autoCompact.enabled", True),
    (
        "spark.sql.streaming.stateStore.providerClass",
        "com.databricks.sql.streaming.state.RocksDBStateStoreProvider",
    ),
):
    spark.conf.set(_conf_key, _conf_value)

# Drop the loop variables so no extra names linger in the notebook namespace.
del _conf_key, _conf_value

In [0]:
%run ./01-config

In [0]:
%run ./03-bronze

In [0]:
%run ./04-silver

In [0]:
%run ./05-gold

In [0]:
# Execute Bronze, Silver, and Gold data pipeline layers in sequence.
# The functions called here are presumably defined by the notebooks pulled in
# through the %run cells above — verify against those notebooks.

# Ingest and process data into the bronze layer.
# Pass the values read from the RunType / ProcessingTime widgets rather than
# hard-coded batch settings, so the run mode actually matches the start-up
# message printed earlier and the widgets have an effect.
# NOTE(review): when `once` is False, confirm that consume_bronze returns (or
# that the following steps are intended to run while the stream is active).
consume_bronze(once=once, processing_time=processing_time)
validate_bronze()

# Transform and upsert data into the silver layer
upsert_silver()
# validate_silver()

# Generate KPIs and aggregate metrics in the gold layer
create_gold_kpis()
# validate_gold()