# DLT pipeline

This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/customer_segmentation_dlt.yml.

In [0]:
import dlt
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Reuse the active SparkSession if one exists, otherwise create one, so the
# table definitions below can reference `spark` at module level.
spark = SparkSession.builder.getOrCreate()

In [0]:
# Schema applied to the raw CSV files read by bronze_customer_data.
# customer_segment and spend are declared because gold_customer_aggregations
# groups by and sums them; without them those columns cannot be resolved.
# NOTE(review): field order and types are assumed — confirm against the
# actual CSV layout before relying on them.
# TODO: add any remaining source columns
raw_data_schema = StructType([
    StructField("customer_id", IntegerType(), True),
    StructField("customer_segment", StringType(), True),
    StructField("spend", DoubleType(), True),
])

In [0]:

# Bronze Table: Raw data ingestion from DBFS using Auto Loader

@dlt.table
def bronze_customer_data():
    """Ingest raw customer CSV files as a stream via Auto Loader.

    Returns:
        A streaming DataFrame shaped by ``raw_data_schema`` with an extra
        ``ingest_timestamp`` column populated by ``current_timestamp()``.
    """
    return (
        spark.readStream.format("cloudFiles")  # Use Auto Loader
        .option("cloudFiles.format", "csv")
        # CSV reader options are passed without the "cloudFiles." prefix;
        # that prefix is reserved for Auto Loader's own settings.
        .option("header", "true")
        .schema(raw_data_schema)
        # NOTE(review): no source path is passed to load(); the original
        # comment says the path is configured in the pipeline YAML — confirm
        # how it actually reaches the reader.
        .load()
        .withColumn("ingest_timestamp", current_timestamp())  # Add ingest timestamp
    )


# Silver Table: Cleaned and transformed data
@dlt.table
def silver_customer_data():
    """Return the bronze rows that carry a non-null ``customer_id``."""
    has_customer_id = col("customer_id").isNotNull()
    # Rows without a customer ID are dropped at this stage.
    return dlt.read("bronze_customer_data").where(has_customer_id)

# Gold Table: Business-ready or aggregated data
@dlt.table
def gold_customer_aggregations():
    """Aggregate total spend per customer segment from the silver table.

    Returns:
        A DataFrame with one row per ``customer_segment`` and a
        ``total_spend`` column holding the sum of ``spend``.
    """
    silver_df = dlt.read("silver_customer_data")
    return (
        silver_df.groupBy("customer_segment")
        .agg({"spend": "sum"})
        # The dict form names the result "sum(spend)"; parentheses are not
        # valid in Delta column names by default, so give it a plain name.
        .withColumnRenamed("sum(spend)", "total_spend")
    )
