In [0]:
from pyspark.sql import functions as F
from delta.tables import *
     

**SETUP: Define Widgets (Parameters)**

In [0]:
# Widgets are how the Job Scheduler hands arguments to this notebook,
# which lets a single notebook serve all 3 layers.

# 1. Layer selector: dropdown, defaults to bronze
dbutils.widgets.dropdown("layer", "bronze", ["bronze", "silver", "gold"])

# 2. Source path: free text, defaults to the Day 3/4 processed data
dbutils.widgets.text("source_path", "/Volumes/workspace/ecommerce/ecommerce_data/processed_data/oct_2019")

# 3. Read back whatever values the widgets currently hold
current_layer = dbutils.widgets.get("layer")
source_path_param = dbutils.widgets.get("source_path")

# Echo the effective configuration so it shows up in the job run output.
print(
    "⚙️ JOB CONFIGURATION:",
    f"   • Running Layer: {current_layer.upper()}",
    f"   • Source Path:   {source_path_param}",
    sep="\n",
)

# Medallion storage layout: one sub-directory per layer under a shared root.
base_medallion = "/Volumes/workspace/ecommerce/ecommerce_data/medallion"
path_bronze, path_silver, path_gold = (
    f"{base_medallion}/{tier}" for tier in ("bronze", "silver", "gold")
)

⚙️ JOB CONFIGURATION:
   • Running Layer: GOLD
   • Source Path:   /Volumes/workspace/ecommerce/ecommerce_data/processed_data/oct_2019


**Define Logic for each layer**

In [0]:
def run_bronze():
    """
    Bronze step: load the raw parquet input unchanged and tag it with lineage columns.

    Reads the parquet data found at ``source_path_param``, appends
    ``ingestion_ts`` (the current timestamp) and ``source_file`` (taken from
    ``_metadata.file_path``), then overwrites the Delta data at ``path_bronze``.
    Takes no arguments and returns nothing; the paths come from notebook-level
    variables.
    """
    print("\n🚀 Starting BRONZE Layer Ingestion...")

    # Input location is supplied through the notebook's source_path widget.
    source_df = spark.read.parquet(source_path_param)

    # Lineage metadata: when the rows were ingested and which file they came from.
    with_ingestion_ts = source_df.withColumn("ingestion_ts", F.current_timestamp())
    tagged_df = with_ingestion_ts.withColumn("source_file", F.col("_metadata.file_path"))

    # Replace whatever is currently stored in the Bronze location.
    bronze_writer = tagged_df.write.format("delta").mode("overwrite")
    bronze_writer.save(path_bronze)
    print(f"Bronze Layer Complete. Data saved to: {path_bronze}")


def run_silver():
    """
    Silver step: clean the Bronze data and add a derived price tier.

    Loads the Delta data at ``path_bronze``, keeps only rows whose ``price`` is
    greater than 0, drops duplicates on (``user_session``, ``event_time``,
    ``product_id``), and adds ``price_tier`` ("budget" when ``price`` < 50,
    otherwise "premium"). The result overwrites the Delta data at
    ``path_silver``. Takes no arguments and returns nothing.
    """
    print("\n Starting SILVER Layer Cleaning...")

    bronze_df = spark.read.format("delta").load(path_bronze)

    # Building blocks of the cleaning pipeline, named for readability.
    price = F.col("price")
    has_positive_price = price > 0
    dedupe_key = ["user_session", "event_time", "product_id"]
    tier_label = F.when(price < 50, "budget").otherwise("premium")

    cleaned_df = (
        bronze_df
        .filter(has_positive_price)
        .dropDuplicates(dedupe_key)
        .withColumn("price_tier", tier_label)
    )

    # Replace whatever is currently stored in the Silver location.
    cleaned_df.write.format("delta").mode("overwrite").save(path_silver)
    print(f"Silver Layer Complete. Data saved to: {path_silver}")


def run_gold():
    """
    Gold step: roll the Silver data up to one row per ``product_id``.

    Output columns:
      * ``total_views``     - distinct count of ``user_session`` for the product
      * ``total_revenue``   - sum of ``price`` over the product's Silver rows
      * ``conversion_rate`` - ``total_revenue / total_views`` rounded to 2
                              decimals, or 0.0 when ``total_views`` is 0

    The result overwrites the Delta data at ``path_gold``. Takes no arguments
    and returns nothing.

    NOTE(review): ``conversion_rate`` as computed here is summed price per
    distinct session, not a purchases-to-views ratio - confirm this is the
    intended KPI definition before relying on the column name.
    """
    print("\n Starting GOLD Layer Aggregation...")

    silver_df = spark.read.format("delta").load(path_silver)

    # Per-product aggregates.
    per_product = silver_df.groupBy("product_id").agg(
        F.countDistinct("user_session").alias("total_views"),
        F.sum("price").alias("total_revenue"),
    )

    # Guard the division so products with zero counted sessions get 0.0.
    views = F.col("total_views")
    revenue = F.col("total_revenue")
    rate_expr = F.when(views == 0, 0.0).otherwise(F.round(revenue / views, 2))
    kpi_df = per_product.withColumn("conversion_rate", rate_expr)

    # Replace whatever is currently stored in the Gold location.
    kpi_df.write.format("delta").mode("overwrite").save(path_gold)
    print(f"Gold Layer Complete. Data saved to: {path_gold}")

**Execution Controller**

In [0]:
# Dispatch table acting as the "switch": the 'layer' parameter selects
# exactly one step function, and only that function is executed.
layer_runners = {
    "bronze": run_bronze,
    "silver": run_silver,
    "gold": run_gold,
}

selected_runner = layer_runners.get(current_layer)
if selected_runner is None:
    # An unrecognised value (e.g. a typo in the job parameter) must fail the job.
    raise ValueError(f"Unknown layer: {current_layer}. Please choose bronze, silver, or gold.")

selected_runner()

print(f"\n SUCCESS: {current_layer.upper()} Job Finished.")


 Starting GOLD Layer Aggregation...
Gold Layer Complete. Data saved to: /Volumes/workspace/ecommerce/ecommerce_data/medallion/gold

 SUCCESS: GOLD Job Finished.
