In [0]:
from pyspark.sql.functions import (
    col, lower, trim, to_timestamp, when, count, lit
)

# Storage locations of the Bronze (input) and Silver (output) Delta tables.
bronze_path = "/Volumes/workspace/default/raw_data/bronze_marketing_events"
silver_path = "/Volumes/workspace/default/raw_data/silver_marketing_events"

# 1) Read Bronze Delta
# Load the Bronze table from its path as a DataFrame; the source format is
# passed as a keyword instead of through a separate .format() call.
df_bronze = spark.read.load(bronze_path, format="delta")

# 2) Type casting + standardization
# Each listed column is replaced in place by its typed / normalized version.
# None of the replacements depends on another, so they are applied together
# in a single withColumns() projection.
df_silver = df_bronze.withColumns(
    {
        # Parse event_time using the explicit "yyyy-MM-dd HH:mm:ss" pattern.
        "event_time": to_timestamp(col("event_time"), "yyyy-MM-dd HH:mm:ss"),
        # Strip surrounding whitespace first, then lower-case the channel.
        "channel": lower(trim(col("channel"))),
        # Numeric columns are cast to their target types.
        "spend": col("spend").cast("double"),
        "conversion": col("conversion").cast("int"),
        "revenue": col("revenue").cast("double"),
    }
)

# 3) Basic data quality rules (filter invalid)
# One rule per where() call; chained where() calls are conjunctive, so a row
# is kept only when it satisfies every rule below.
# NOTE(review): under SQL null semantics a NULL spend / revenue / conversion
# also fails its comparison, so such rows are dropped too — confirm that this
# is intended.
df_silver = (
    df_silver
    .where(col("event_time").isNotNull())      # event_time must be present
    .where(col("customer_id").isNotNull())     # customer_id must be present
    .where(col("spend") >= 0)                  # no negative spend
    .where(col("revenue") >= 0)                # no negative revenue
    .where(col("conversion").isin([0, 1]))     # conversion is a 0/1 flag
)

# 4) Deduplication (remove exact duplicates based on business keys)
# Rows that agree on every key listed here collapse to a single row. Columns
# outside the key list take their values from whichever duplicate is kept.
dedup_keys = [
    "event_time",
    "customer_id",
    "channel",
    "spend",
    "conversion",
    "revenue",
]
df_silver = df_silver.drop_duplicates(subset=dedup_keys)

# 5) Write Silver Delta
# Persist the cleaned DataFrame at silver_path as Delta using "overwrite"
# save mode; format and mode are given as keywords to save().
df_silver.write.save(silver_path, format="delta", mode="overwrite")

# Confirmation message, emitted once the write call has returned.
print("✅ Silver saved at:", silver_path)

# 6) Quality checks / counts
# Read the Silver table back from storage so the checks below run against
# what was actually written rather than the in-memory DataFrame.
df_silver_read = spark.read.load(silver_path, format="delta")

# Report how many rows the Silver table contains.
print("Silver row count:", df_silver_read.count())

# Render the table in the notebook for a visual spot-check.
display(df_silver_read)


✅ Silver saved at: /Volumes/workspace/default/raw_data/silver_marketing_events
Silver row count: 8


event_time,customer_id,channel,spend,conversion,revenue,ingest_time,source_file
2025-01-05T10:12:00.000Z,1001,search,25.5,1,120.0,2026-01-15T21:54:19.160Z,dbfs:/Volumes/workspace/default/raw_data/raw_marketing_events.csv
2025-01-06T12:10:00.000Z,1001,email,2.0,1,30.0,2026-01-15T21:54:19.160Z,dbfs:/Volumes/workspace/default/raw_data/raw_marketing_events.csv
2025-01-10T14:00:00.000Z,1007,email,3.0,1,40.0,2026-01-15T21:54:19.160Z,dbfs:/Volumes/workspace/default/raw_data/raw_marketing_events.csv
2025-01-09T20:05:00.000Z,1006,tv,90.0,0,0.0,2026-01-15T21:54:19.160Z,dbfs:/Volumes/workspace/default/raw_data/raw_marketing_events.csv
2025-01-07T15:45:00.000Z,1004,search,18.0,0,0.0,2026-01-15T21:54:19.160Z,dbfs:/Volumes/workspace/default/raw_data/raw_marketing_events.csv
2025-01-08T08:20:00.000Z,1005,social,12.0,1,60.0,2026-01-15T21:54:19.160Z,dbfs:/Volumes/workspace/default/raw_data/raw_marketing_events.csv
2025-01-05T11:00:00.000Z,1002,social,10.0,0,0.0,2026-01-15T21:54:19.160Z,dbfs:/Volumes/workspace/default/raw_data/raw_marketing_events.csv
2025-01-06T09:30:00.000Z,1003,tv,80.0,1,200.0,2026-01-15T21:54:19.160Z,dbfs:/Volumes/workspace/default/raw_data/raw_marketing_events.csv
