In [0]:
%sql
-- List the volumes registered in the workspace.ecommerce schema.
SHOW VOLUMES FROM workspace.ecommerce;



database,volume_name
ecommerce,ecommerce_data


In [0]:
# BRONZE: Raw ingestion
# Reads the two monthly CSV exports and stores them as one Delta table,
# adding only an ingestion timestamp column.
from pyspark.sql import functions as F

source_files = [
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv",
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv",
]

# header=True: the first row of each file supplies the column names.
# inferSchema=True: column types are inferred from the data rather than declared.
raw = spark.read.csv(source_files, header=True, inferSchema=True)

# Stamp every row with the time of this load, then replace whatever is
# currently stored at the bronze path.
bronze_events = raw.withColumn("ingestion_ts", F.current_timestamp())
bronze_events.write.format("delta").mode("overwrite").save(
    "/Volumes/workspace/ecommerce/ecommerce_data/bronze/events"
)

In [0]:
# SILVER: Cleaned data
# Reads the bronze events, keeps rows with a plausible price, removes duplicate
# events, and derives `event_date` and `price_tier` before writing the silver table.
bronze = spark.read.format("delta").load("/Volumes/workspace/ecommerce/ecommerce_data/bronze/events")

# Columns that together identify one event. Deduplicating on
# (user_session, event_time) alone also discards *distinct* events that share a
# session and timestamp (e.g. different products or event types), and collapses
# every null-session row with the same timestamp into one, because
# dropDuplicates treats nulls as equal.
event_key = ["user_session", "event_time", "event_type", "product_id", "user_id"]

silver = bronze.filter(F.col("price") > 0) \
    .filter(F.col("price") < 10000) \
    .dropDuplicates(event_key) \
    .withColumn("event_date", F.to_date("event_time")) \
    .withColumn("price_tier",
        # Tiers: price < 10 -> budget, 10 <= price < 50 -> mid, otherwise premium.
        F.when(F.col("price") < 10, "budget")
         .when(F.col("price") < 50, "mid")
         .otherwise("premium"))

# Replace the current contents of the silver table with this run's result.
silver.write.format("delta").mode("overwrite").save("/Volumes/workspace/ecommerce/ecommerce_data/silver/events")

In [0]:
# Preview the cleaned events: the first 20 rows, with long string values
# truncated (these are show()'s defaults, spelled out).
silver.show(n=20, truncate=True)

+-------------------+----------+----------+-------------------+--------------------+------------+-------+---------+--------------------+--------------------+----------+----------+
|         event_time|event_type|product_id|        category_id|       category_code|       brand|  price|  user_id|        user_session|        ingestion_ts|event_date|price_tier|
+-------------------+----------+----------+-------------------+--------------------+------------+-------+---------+--------------------+--------------------+----------+----------+
|2019-11-17 08:43:01|      view|   1005105|2053013555631882655|electronics.smart...|       apple|1363.95|543296136|b6c1c551-d7cb-406...|2026-01-14 21:18:...|2019-11-17|   premium|
|2019-11-17 08:43:08|      view|   1480279|2053013561092866779|   computers.desktop|          hp| 967.82|546350875|cd00b163-df39-4d2...|2026-01-14 21:18:...|2019-11-17|   premium|
|2019-11-17 08:43:30|      view|  28722200|2053013565228450757|       apparel.shoes|     respect|   

In [0]:
# GOLD: Aggregates
# Per (product_id, brand): distinct viewers, distinct purchasers, purchase
# revenue, and the purchaser/viewer conversion rate in percent.
silver = spark.read.format("delta").load("/Volumes/workspace/ecommerce/ecommerce_data/silver/events")

is_view = F.col("event_type") == "view"
is_purchase = F.col("event_type") == "purchase"

product_perf = silver.groupBy("product_id", "brand") \
    .agg(
        # F.when() treats a plain string value as a literal, so the user id must
        # be passed as F.col("user_id"); a bare "user_id" would make every
        # matching row yield the same constant and the distinct count 0 or 1.
        # Rows that do not match yield null, which countDistinct ignores.
        F.countDistinct(F.when(is_view, F.col("user_id"))).alias("views"),
        F.countDistinct(F.when(is_purchase, F.col("user_id"))).alias("purchases"),
        # Sum of price over purchase events only (non-purchase rows contribute null).
        F.sum(F.when(is_purchase, F.col("price").cast("double"))).alias("revenue")
    ).withColumn(
        "conversion_rate",
        # Left null when a product has no viewers, avoiding a division by zero.
        F.when(F.col("views") != 0, F.col("purchases")/F.col("views")*100)
    )

# Replace the current contents of the gold products table with this run's result.
product_perf.write.format("delta").mode("overwrite").save("/Volumes/workspace/ecommerce/ecommerce_data/gold/products")