In [0]:
from pyspark.sql.functions import col, sum, current_timestamp

In [0]:
# Batch (non-streaming) snapshot of the bronze customers table; the cells
# below use it for ad-hoc previewing and null profiling.
bronze_reader = spark.read.format("delta")
bronze_df = bronze_reader.table("bronze.customers")

In [0]:
# Render at most 50 bronze rows for a quick visual inspection.
bronze_preview_df = bronze_df.limit(50)
display(bronze_preview_df)

In [0]:
# Build one aggregate expression per bronze column: the null flag is cast to
# 0/1 and summed, which yields the number of NULLs in that column.
# `sum` here is pyspark.sql.functions.sum (imported at the top of the
# notebook), not the Python builtin.
null_count_exprs = []
for column_name in bronze_df.columns:
    is_null_flag = col(column_name).isNull().cast("int")
    null_count_exprs.append(sum(is_null_flag).alias(column_name))

# Single-row result: each output column carries the null count of the
# bronze column with the same name.
null_counts = bronze_df.select(null_count_exprs)
null_counts.show()

In [0]:
# Checkpoint directory used by the streaming write below to remember which
# bronze data has already been processed across runs.
SILVER_CUSTOMERS_CHECKPOINT = "abfss://metadata@salesmarketingstorage1.dfs.core.windows.net/checkpoints/silver/customers"

# Incrementally read bronze customers, discard rows without a
# customer_unique_id, and de-duplicate on both identifiers.
# NOTE(review): streaming dropDuplicates without a watermark keeps every key
# seen so far in the state store — confirm the key cardinality is acceptable.
silver_df = (
    spark.readStream.table("bronze.customers")
    .filter("customer_unique_id IS NOT NULL")
    .dropDuplicates(["customer_unique_id"])
    .dropDuplicates(["customer_id"])
    .withColumn("ingest_time", current_timestamp())
    # presumably the ingestion "rescued data" column; not wanted in silver — TODO confirm
    .drop("_rescued_data")
)

# Static (batch) lookup used to attach the geolocation surrogate key.
geo_df = spark.read.table("silver.geolocations_cleaned")

# Stream-static left join: customers whose zip prefix has no match are kept
# with a NULL geolocation_sk.
# NOTE(review): if geolocations_cleaned holds more than one row per zip
# prefix, this join multiplies customer rows — verify the lookup is unique.
customer_geo_df = (
    silver_df.join(
        geo_df,
        silver_df.customer_zip_code_prefix == geo_df.geolocation_zip_code_prefix,
        "left",
    )
    .select(*silver_df.columns, geo_df.geolocation_sk)
)

# NOTE(review): nothing in the visible cells reads this view; kept so any
# external consumer of "silver_updates" keeps working — confirm it is needed.
customer_geo_df.createOrReplaceTempView("silver_updates")

# Process everything currently available in bronze, append it to the silver
# table, then stop. `toTable` is the documented PySpark DataStreamWriter API
# for starting a stream that writes to a table.
silver_customers_query = (
    customer_geo_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", SILVER_CUSTOMERS_CHECKPOINT)
    .trigger(availableNow=True)
    .toTable("silver.customers_cleaned")
)

# Block until the availableNow run has finished so that (a) later cells read
# the committed data and (b) a failure in the stream is raised in this cell
# instead of being lost in the background.
silver_customers_query.awaitTermination()



In [0]:
%sql
-- Spot-check: show up to 10 rows of the cleaned silver customers table.
SELECT * FROM silver.customers_cleaned LIMIT 10;