In [0]:
from pyspark.sql import functions as F
# Folder containing the raw e-commerce CSV exports, and the October 2019 file in it.
base_path = "/Volumes/workspace/ecommerce/ecommerce_data"
oct_path = f"{base_path}/2019-Oct.csv"

# Load the October events: the first row is treated as the header and Spark
# infers the column types from the data.
oct_events = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(oct_path)
)

# Quick sanity check: inferred schema plus the first five rows, untruncated.
oct_events.printSchema()
oct_events.show(n=5, truncate=False)

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)

+-------------------+----------+----------+-------------------+-----------------------------------+--------+-------+---------+------------------------------------+
|event_time         |event_type|product_id|category_id        |category_code                      |brand   |price  |user_id  |user_session                        |
+-------------------+----------+----------+-------------------+-----------------------------------+--------+-------+---------+------------------------------------+
|2019-10-01 00:00:00|view      |44600062  |2103807459595387724|NULL                               |shiseido|35.79  |541312140|72

In [0]:
# Notebook parameters (Databricks widgets). Each widget is declared and then
# read back right away so its current value is available as a Python variable.

# Root folder under which the per-layer Delta tables are stored.
dbutils.widgets.text("base_path", "/Volumes/workspace/ecommerce/ecommerce_data")
base_path = dbutils.widgets.get("base_path")

# Pipeline stage this run should execute.
dbutils.widgets.dropdown("layer", "bronze", ["bronze", "silver", "gold"])
layer = dbutils.widgets.get("layer")

# Delta table locations for each layer, all rooted at the selected base path.
gold_path   = f"{base_path}/gold_products"
silver_path = f"{base_path}/silver_events"
bronze_path = f"{base_path}/bronze_events"

In [0]:
from pyspark.sql import functions as F

def run_bronze():
    """Write the bronze (raw) layer.

    Takes the already-loaded ``oct_events`` DataFrame, adds an
    ``ingestion_ts`` column holding the current timestamp, and overwrites
    the Delta table at ``bronze_path`` with the result.
    """
    stamped_events = oct_events.withColumn("ingestion_ts", F.current_timestamp())

    delta_writer = stamped_events.write.format("delta").mode("overwrite")
    delta_writer.save(bronze_path)

def run_silver(
    dedup_keys=("user_session", "user_id", "event_time", "event_type", "product_id"),
):
    """Build the silver (cleaned) layer from the bronze Delta table.

    Reads the Delta table at ``bronze_path``, keeps rows whose price is
    strictly between 0 and 10000 (rows with a NULL price fail the comparison
    and are dropped too), removes duplicate events, derives ``event_date``
    and ``price_tier``, and overwrites the Delta table at ``silver_path``.

    Args:
        dedup_keys: Column names that together identify a single event.
            Defaults to the full event identity. De-duplicating only on
            ``("user_session", "event_time")`` is too coarse: ``event_time``
            has one-second resolution, so distinct events of one session in
            the same second (another product or event type) would collapse
            into one arbitrarily chosen row, and rows with a NULL
            ``user_session`` from different users would be merged as well.
            Pass ``("user_session", "event_time")`` to get that coarser
            behaviour explicitly.
    """
    bronze_df = spark.read.format("delta").load(bronze_path)

    silver = (
        bronze_df
        # Price sanity bounds: strictly positive and below 10000.
        .filter(F.col("price") > 0)
        .filter(F.col("price") < 10000)
        # Drop only true repeats of the same event, not neighbouring events.
        .dropDuplicates(list(dedup_keys))
        .withColumn("event_date", F.to_date("event_time"))
        .withColumn(
            "price_tier",
            # < 10 -> budget, 10 to < 50 -> mid, everything else -> premium.
            F.when(F.col("price") < 10, "budget")
             .when(F.col("price") < 50, "mid")
             .otherwise("premium")
        )
    )
    silver.write.format("delta").mode("overwrite").save(silver_path)

def run_gold():
    """Build the gold layer: per-product performance metrics.

    Reads the Delta table at ``silver_path``, groups by ``product_id`` and
    ``brand`` and computes:

    * ``views``: distinct ``user_id`` values with a "view" event
    * ``purchases``: distinct ``user_id`` values with a "purchase" event
    * ``revenue``: sum of ``price`` over "purchase" events
    * ``conversion_rate``: ``purchases / views * 100`` via ``try_divide``

    The result overwrites the Delta table at ``gold_path``.
    """
    events = spark.read.format("delta").load(silver_path)

    is_view = F.col("event_type") == "view"
    is_purchase = F.col("event_type") == "purchase"

    # when() without otherwise() yields NULL for non-matching rows, so each
    # aggregate below only sees the rows of its own event type.
    view_users = F.countDistinct(F.when(is_view, F.col("user_id"))).alias("views")
    purchase_users = F.countDistinct(
        F.when(is_purchase, F.col("user_id"))
    ).alias("purchases")
    purchase_revenue = F.sum(F.when(is_purchase, F.col("price"))).alias("revenue")

    per_product = events.groupBy("product_id", "brand").agg(
        view_users,
        purchase_users,
        purchase_revenue,
    )

    # Share of purchasing users among viewing users, expressed as a percentage.
    conversion_pct = F.try_divide(F.col("purchases"), F.col("views")) * F.lit(100.0)
    scored = per_product.withColumn("conversion_rate", conversion_pct)

    scored.write.format("delta").mode("overwrite").save(gold_path)


In [0]:
def run_layer(layer_name: str):
    """Run the pipeline stage named by ``layer_name``.

    Args:
        layer_name: One of "bronze", "silver" or "gold".

    Raises:
        ValueError: If ``layer_name`` is not one of the three known layers.
    """
    # Reject anything that is not a known layer before dispatching.
    if layer_name not in ("bronze", "silver", "gold"):
        raise ValueError(f"Unknown layer: {layer_name}")

    if layer_name == "gold":
        run_gold()
        return
    if layer_name == "silver":
        run_silver()
        return
    # Only "bronze" is left at this point.
    run_bronze()

# Entry point for "Run all": executes the single pipeline stage currently
# selected in the `layer` widget (an unknown value raises ValueError).
run_layer(layer)