# Process addresses data
1. Ingest data into the lakehouse
2. Perform data quality checks and transform data as required
3. Apply changes to the data

In [0]:
import dlt
import pyspark.sql.functions as F
import pyspark.sql.types as T

## 1. Ingest the data into the data lakehouse - bronze_addresses

In [0]:
# Bronze layer: table holding the raw addresses feed.
@dlt.table(
    name='bronze_addresses',
    comment='Raw addresses data ingested from the source system',
    table_properties={'quality': 'bronze'},
)
def create_bronze_addresses():
    """Stream the raw addresses CSV files from the landing volume.

    All source columns are kept as-is and two audit columns are appended:
    ``input_file_path`` (from ``_metadata.file_path``) and
    ``ingest_timestamp`` (from ``current_timestamp()``).
    """
    landing_path = '/Volumes/circuitbox/landing/operational_data/addresses/'

    # Reading from a streaming source makes this a streaming table.
    csv_reader = (
        spark.readStream
        .format('cloudFiles')
        .option('cloudFiles.format', 'csv')
        .option('cloudFiles.inferColumnTypes', 'true')
    )
    raw_addresses = csv_reader.load(landing_path)

    # Audit columns: where each row came from and when it was ingested.
    source_file = F.col('_metadata.file_path').alias('input_file_path')
    ingested_at = F.current_timestamp().alias('ingest_timestamp')

    return raw_addresses.select('*', source_file, ingested_at)

## 2. Perform data quality checks and transform the data as required - silver_addresses_clean

In [0]:
@dlt.table(
    name='silver_addresses_clean',
    table_properties={'quality': 'silver'},
    comment='Cleaned addresses data',
)
# Data quality expectations, one per rule; the decorator name states the
# action taken on violation (fail the update, drop the row, or only record it).
@dlt.expect_or_fail('valid_customer_id', 'customer_id IS NOT NULL')
@dlt.expect_or_drop('valid_address', 'address_line_1 IS NOT NULL')
@dlt.expect('valid_postcode', 'LENGTH(postcode)=5')
def create_silver_addresses_clean():
    """Project the bronze addresses stream down to the cleaned column set.

    Reads ``LIVE.bronze_addresses`` as a stream, keeps only the address
    attributes, and casts ``created_date`` to a date type.
    """
    bronze_stream = spark.readStream.table('LIVE.bronze_addresses')

    # created_date is the only column that is transformed; the rest pass through.
    created_date = F.col('created_date').cast(T.DateType())

    return bronze_stream.select(
        'customer_id',
        'address_line_1',
        'city',
        'state',
        'postcode',
        created_date,
    )

## 3. Apply changes to the data (SCD Type 2)
![image_1769866972001.png](./image_1769866972001.png "image_1769866972001.png")

In [0]:
# Declare the empty streaming table 'silver_addresses'. It has no query of its
# own; it is the target that the dlt.apply_changes(...) call in this notebook
# writes into.
dlt.create_streaming_table(
    name='silver_addresses',
    comment='SCD Type 2 address data',
    table_properties={'quality': 'silver'}
)

In [0]:
# Apply the cleaned address records to 'silver_addresses' as change data.
dlt.apply_changes(
    target='silver_addresses',          # streaming table declared via dlt.create_streaming_table
    source='silver_addresses_clean',    # cleaned, validated addresses stream
    keys=['customer_id'],               # records with the same customer_id are treated as the same entity
    sequence_by='created_date',         # column used to order changes for a given key
    stored_as_scd_type=2                # SCD Type 2: keep history of changes instead of overwriting
)