In [0]:
from pyspark.sql import functions as F

base_path = "/Volumes/workspace/ecommerce/ecommerce_data"

bronze_path = f"{base_path}/bronze_events"
silver_path = f"{base_path}/silver_events"
gold_path   = f"{base_path}/gold_products"

## Bronze: raw ingestion

In [0]:
oct_events = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv", header=True, inferSchema=True)
bronze = oct_events.withColumn("ingestion_ts", F.current_timestamp())

bronze.write.format("delta").mode("overwrite").save(bronze_path)

# Quick check
bronze_check = spark.read.format("delta").load(bronze_path)
bronze_check.printSchema()
bronze_check.show(5, truncate=False)

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- ingestion_ts: timestamp (nullable = true)

+-------------------+----------+----------+-------------------+-----------------------+-------+------+---------+------------------------------------+--------------------------+
|event_time         |event_type|product_id|category_id        |category_code          |brand  |price |user_id  |user_session                        |ingestion_ts              |
+-------------------+----------+----------+-------------------+-----------------------+-------+------+---------+------------------------------------+--------------------------+
|2019-10-21 07:00:26|view      |1201290   

## Silver: cleaning & validation

In [0]:
bronze_df = spark.read.format("delta").load(bronze_path)

silver = (
    bronze_df
    .filter(F.col("price") > 0)
    .filter(F.col("price") < 10000)
    .dropDuplicates(["user_session", "event_time"])
    .withColumn("event_date", F.to_date("event_time"))
    .withColumn(
        "price_tier",
        F.when(F.col("price") < 10, "budget")
         .when(F.col("price") < 50, "mid")
         .otherwise("premium")
    )
)

silver.write.format("delta").mode("overwrite").save(silver_path)

silver_check = spark.read.format("delta").load(silver_path)
silver_check.printSchema()
silver_check.show(5, truncate=False)

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- ingestion_ts: timestamp (nullable = true)
 |-- event_date: date (nullable = true)
 |-- price_tier: string (nullable = true)

+-------------------+----------+----------+-------------------+----------------------+--------+------+---------+------------------------------------+--------------------------+----------+----------+
|event_time         |event_type|product_id|category_id        |category_code         |brand   |price |user_id  |user_session                        |ingestion_ts              |event_date|price_tier|
+-------------------+----------+----------+-------------------+----------------------+-------

## Gold: business aggregates (product performance)

In [0]:
from pyspark.sql import functions as F
silver_df = spark.read.format("delta").load(silver_path)

product_perf = (
    silver_df
    .groupBy("product_id", "brand")
    .agg(
        F.countDistinct(
            F.when(F.col("event_type") == "view", F.col("user_id"))
        ).alias("views"),
        F.countDistinct(
            F.when(F.col("event_type") == "purchase", F.col("user_id"))
        ).alias("purchases"),
        F.sum(
            F.when(F.col("event_type") == "purchase", F.col("price"))
        ).alias("revenue")
    )
    .withColumn(
        "conversion_rate",
        F.try_divide(F.col("purchases"), F.col("views")) * F.lit(100.0)
    )
)

product_perf.write.format("delta").mode("overwrite").save(gold_path)
gold_check = spark.read.format("delta").load(gold_path)
gold_check.show(10, truncate=False)

+----------+----------+-----+---------+------------------+------------------+
|product_id|brand     |views|purchases|revenue           |conversion_rate   |
+----------+----------+-----+---------+------------------+------------------+
|1004573   |samsung   |3216 |50       |39517.170000000006|1.554726368159204 |
|12704683  |nokian    |733  |29       |3121.4700000000003|3.956343792633015 |
|3300488   |redmond   |1718 |38       |6847.049999999998 |2.211874272409779 |
|42000008  |pt-group  |676  |2        |437.6             |0.2958579881656805|
|8500290   |NULL      |357  |12       |4071.73           |3.361344537815126 |
|1307143   |hp        |175  |6        |4014.0            |3.428571428571429 |
|2800645   |arg       |272  |8        |2316.4200000000005|2.941176470588235 |
|6800852   |apacer    |258  |4        |92.56             |1.550387596899225 |
|12717781  |bfgoodrich|37   |0        |NULL              |0.0               |
|20900533  |xiaomi    |60   |1        |81.86             |1.6666