# DLT pipeline for training data

This Delta Live Tables (DLT) definition is executed by the pipeline defined in resources/customer_segmentation_dlt.yml. It contains the DLT steps that create valid training data.

In [0]:
import dlt
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, when
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Get the active SparkSession, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()

In [0]:
# TODO: define the full schema of the raw CSV files.
# NOTE(review): the silver table in this file references columns such as
# id, segmentation, age, family_size, work_experience, ever_married and
# graduated, none of which are declared here yet -- confirm the real layout.
raw_data_schema = StructType(
    fields=[
        StructField(name="customer_id", dataType=IntegerType(), nullable=True),
        # Add more fields as necessary
    ]
)

In [0]:

# Bronze Table: Raw data ingestion from DBFS using Auto Loader

@dlt.table(
    comment="bronze table with raw customer data ingested by Auto Loader"
)
def bronze_training_customer_data():
    """Ingest the raw customer CSV files as a stream using Auto Loader.

    The files are read with the explicit ``raw_data_schema`` and every row
    is stamped with an ``ingest_timestamp`` column.

    Returns:
        A streaming DataFrame with the raw columns plus ``ingest_timestamp``.
    """
    raw_stream = (
        spark.readStream.format("cloudFiles")  # Use Auto Loader
        .option("cloudFiles.format", "csv")
        .option("cloudFiles.header", "true")  # Ensure header is recognized
        .schema(raw_data_schema)
        # NOTE(review): no path is passed to load(); the original comment says
        # the path and other options are configured in the YAML -- confirm
        # that the pipeline really supplies it.
        .load()
    )
    # Record when each row was ingested.
    return raw_stream.withColumn("ingest_timestamp", current_timestamp())




In [0]:
# Silver Table: Cleaned and transformed data
@dlt.table(
    comment = "silver table with valid customer data for training"
)
@dlt.expect_all_or_drop({
    # NOTE(review): raw_data_schema currently declares "customer_id", not
    # "id" -- confirm which column name the raw files actually use.
    "valid_id": "id IS NOT NULL",  # Ensure ID is not null
    "valid_segmentation": "segmentation IS NOT NULL",  # Ensure Segmentation (target) is not null
})
@dlt.expect_all({
    "non_negative_age": "age >= 0",
    "valid_age": "age <= 120",
    "valid_family_size": "family_size >= 0 AND family_size <= 15",
    "valid_work_experience": "work_experience >= 0 AND work_experience <= 50",
})
def silver_training_customer_data():
    """Clean and type the bronze customer data for model training.

    Steps:
      1. Normalize column names to lowercase with underscores.
      2. Encode the Yes/No columns ``ever_married`` and ``graduated`` as 1/0.
      3. Cast ``age`` and ``family_size`` to int and ``work_experience``
         to float.

    Rows with a null ``id`` or ``segmentation`` are dropped by the
    ``expect_all_or_drop`` expectations; the ``expect_all`` expectations
    only report violations.

    Returns:
        The cleaned DataFrame.
    """
    bronze_df = dlt.read("bronze_training_customer_data")

    # Standardize column names (lowercase, underscores instead of spaces).
    # The loop variable is deliberately not called ``col`` so it does not
    # shadow the imported pyspark ``col`` function.
    normalized_names = [name.lower().replace(" ", "_") for name in bronze_df.columns]
    df = bronze_df.toDF(*normalized_names)

    # Convert Yes/No columns to binary; any value other than "Yes"
    # (including a missing value) becomes 0.
    df = (
        df.withColumn("ever_married", when(col("ever_married") == "Yes", 1).otherwise(0))
        .withColumn("graduated", when(col("graduated") == "Yes", 1).otherwise(0))
    )

    # Convert Age and Family Size to integers, Work Experience to float.
    df = (
        df.withColumn("age", col("age").cast("int"))
        .withColumn("family_size", col("family_size").cast("int"))
        .withColumn("work_experience", col("work_experience").cast("float"))
    )
    return df



In [0]:
# Gold Table: Business-ready or aggregated data
@dlt.table
def gold_customer_aggregations():
    """Expose the silver training data as the gold table.

    The silver table is currently passed through unchanged; no aggregation
    is applied.
    """
    return dlt.read("silver_training_customer_data")