In [0]:
import dlt
from pyspark.sql.functions import current_timestamp, col

# -- customers feed --
# Landing-zone directory passed to Auto Loader's load() for customers.
customers_volume_path = "/Volumes/adb_catalog/landing_zone/raw_data/customers"
# Directory passed as cloudFiles.schemaLocation for the customers stream.
customers_schema_path = "/Volumes/adb_catalog/landing_zone/raw_data/checkpoints/customers_schema"

# -- orders feed --
# Landing-zone directory passed to Auto Loader's load() for orders.
orders_volume_path = "/Volumes/adb_catalog/landing_zone/raw_data/orders"
# Directory passed as cloudFiles.schemaLocation for the orders stream.
orders_schema_path = "/Volumes/adb_catalog/landing_zone/raw_data/checkpoints/orders_schema"


# -- CUSTOMERS BRONZE --
@dlt.table(
    name="customers_bronze",
    comment="Raw ingestion of customer data from landing zone via Auto Loader.",
)
def customers_raw_bronze():
    """Define the ``customers_bronze`` streaming table.

    Reads CSV files (with a header row) from ``customers_volume_path``
    through Auto Loader (the ``cloudFiles`` stream source), casts
    ``customerId`` to ``int`` and appends an ``ingestion_timestamp``
    column populated by ``current_timestamp()``.

    Returns:
        The streaming DataFrame that DLT materializes as ``customers_bronze``.
    """
    auto_loader_options = {
        "cloudFiles.format": "csv",
        "header": "true",
        # Where Auto Loader keeps the schema it tracks for this stream.
        "cloudFiles.schemaLocation": customers_schema_path,
        # Per the Auto Loader docs, "rescue" keeps the schema fixed and
        # routes unexpected columns into the rescued-data column.
        "cloudFiles.schemaEvolutionMode": "rescue",
    }

    raw_customers = (
        spark.readStream.format("cloudFiles")
        .options(**auto_loader_options)
        .load(customers_volume_path)
    )

    typed_customers = raw_customers.withColumn(
        "customerId", col("customerId").cast("int")
    )
    return typed_customers.withColumn("ingestion_timestamp", current_timestamp())


# -- ORDERS BRONZE --
@dlt.table(
    name="orders_bronze",
    comment="Raw ingestion of order transactions from landing zone via Auto Loader.",
)
def orders_raw_bronze():
    """Define the ``orders_bronze`` streaming table.

    Reads CSV files (with a header row) from ``orders_volume_path``
    through Auto Loader (the ``cloudFiles`` stream source), casts
    ``customerId`` to ``int`` and appends an ``ingestion_timestamp``
    column populated by ``current_timestamp()``.

    Returns:
        The streaming DataFrame that DLT materializes as ``orders_bronze``.
    """
    auto_loader_options = {
        "cloudFiles.format": "csv",
        "header": "true",
        # Where Auto Loader keeps the schema it tracks for this stream.
        "cloudFiles.schemaLocation": orders_schema_path,
        # Per the Auto Loader docs, "rescue" keeps the schema fixed and
        # routes unexpected columns into the rescued-data column.
        "cloudFiles.schemaEvolutionMode": "rescue",
    }

    raw_orders = (
        spark.readStream.format("cloudFiles")
        .options(**auto_loader_options)
        .load(orders_volume_path)
    )

    # NOTE(review): only customerId is cast here, mirroring the customers
    # table; confirm whether an order key column also needs a cast.
    typed_orders = raw_orders.withColumn(
        "customerId", col("customerId").cast("int")
    )
    return typed_orders.withColumn("ingestion_timestamp", current_timestamp())
