# Process orders data
1. Ingest data into the lakehouse
2. Perform data quality checks and transform data as required
3. Explode the JSON `items` array

Orders is a fact table, so no slowly changing dimension (SCD) handling is needed.

In [0]:
import dlt
import pyspark.sql.functions as F
import pyspark.sql.types as T

## 1. Ingest data

In [0]:
# Bronze layer: land the raw order files as delivered.
@dlt.table(
    name='bronze_orders',
    comment='Raw orders data ingested from the source system',
    table_properties={'quality': 'bronze'},
)
def create_bronze_addresses():
    """Stream the raw order JSON files into the ``bronze_orders`` table.

    Every source column is kept and two audit columns are appended:
    ``input_file_path`` (taken from ``_metadata.file_path``) and
    ``ingestion_timestamp`` (``current_timestamp()``).

    NOTE(review): the function name mentions "addresses" although the table
    holds orders; the table name comes from the decorator's ``name`` argument.
    """
    landing_path = '/Volumes/circuitbox/landing/operational_data/orders/'

    # Reading from a streaming source makes this a streaming table.
    raw_orders = (
        spark.readStream
        .format('cloudFiles')
        .option('cloudFiles.format', 'json')
        .option('cloudFiles.inferColumnTypes', 'true')
        .load(landing_path)
    )

    audit_columns = [
        F.col('_metadata.file_path').alias('input_file_path'),
        F.current_timestamp().alias('ingestion_timestamp'),
    ]
    return raw_orders.select('*', *audit_columns)

## 2. Perform data quality checks and transform data as required - silver_orders_clean

In [0]:
@dlt.table(
    name='silver_orders_clean',
    comment='Cleaned orders data',
    table_properties={'quality': 'silver'}
)
# Data quality expectations. A missing customer_id / order_id fails the
# update (expect_or_fail); an unexpected status or payment method is only
# recorded as a violation and the row is kept (expect).
@dlt.expect_or_fail('valid_customer_id', 'customer_id IS NOT NULL')
@dlt.expect_or_fail('valid_order_id', 'order_id IS NOT NULL')
# String literals use standard SQL single quotes: double-quoted text is parsed
# as an identifier when ANSI double-quoted identifiers are enabled.
@dlt.expect('valid_order_status', "order_status IN ('Pending', 'Shipped', 'Cancelled', 'Completed')")
@dlt.expect('valid_payment_method', "payment_method IN ('Credit Card', 'Bank Transfer', 'PayPal')")
def create_silver_addresses_clean():
    """Build ``silver_orders_clean`` from the ``bronze_orders`` stream.

    Keeps only the business columns (dropping the bronze audit columns) and
    casts ``order_timestamp`` to a timestamp type.

    NOTE(review): the function name mentions "addresses" although the table
    holds orders; the table name comes from the decorator's ``name`` argument.
    """
    return (
        spark.readStream.table('LIVE.bronze_orders')
            .select(
                'customer_id',
                'items',
                'order_id',
                'order_status',
                # Alias explicitly so the output column name does not depend
                # on Spark's implicit naming of cast expressions; the
                # silver_orders table selects this column by name.
                F.col('order_timestamp').cast(T.TimestampType()).alias('order_timestamp'),
                'payment_method'
            )
    )

## 3. Explode items array - silver_orders

In [0]:
@dlt.table(
    name='silver_orders',
    comment='Cleaned orders data with items array exploded',
    table_properties={'quality': 'silver'},
)
def create_silver_orders():
    """Flatten ``silver_orders_clean`` to one row per distinct order item.

    The ``items`` array is de-duplicated with ``array_distinct`` and then
    exploded; the fields of each item struct are surfaced as top-level
    ``item_*`` columns next to the order-level columns. ``explode`` emits no
    row for an order whose ``items`` array is null or empty.
    """
    order_columns = [
        'customer_id',
        'order_id',
        'order_status',
        'order_timestamp',
        'payment_method',
    ]
    # (field inside the item struct, output column name)
    item_fields = [
        ('item_id', 'item_id'),
        ('category', 'item_category'),
        ('name', 'item_name'),
        ('price', 'item_price'),
        ('quantity', 'item_quantity'),
    ]

    # Step 1: one row per distinct element of the items array.
    exploded_orders = spark.readStream.table('LIVE.silver_orders_clean').select(
        *order_columns,
        F.explode(F.array_distinct('items')).alias('item'),
    )

    # Step 2: pull the struct fields up into flat columns.
    item_columns = [
        F.col(f'item.{field}').alias(output_name)
        for field, output_name in item_fields
    ]
    return exploded_orders.select(*order_columns, *item_columns)
