In [0]:
# # Bronze Layer: Raw Ingestion
# bronze_df = spark.read.format("csv").option("header", "true").load("/Volumes/workspace/ecommerce/raw/ecommerce_2019_nov.csv")
# display(bronze_df)
# # Silver Layer: Cleaning & Validation
# from pyspark.sql.functions import col
# silver_df = bronze_df.dropDuplicates().filter(col("important_column").isNotNull())
# display(silver_df)
# # Gold Layer: Business Aggregates
# gold_df = silver_df.groupBy("business_key").agg({"metric_column": "sum"})
# display(gold_df)

In [0]:
# # CSV file folder (new directory)
# bronze_path = "/Volumes/workspace/ecommerce/bronze/events"
# silver_path = "/Volumes/workspace/ecommerce/silver/events"
# gold_path   = "/Volumes/workspace/ecommerce/gold/events"
# Storage locations of the Delta tables, one per pipeline layer.
# Each table lives in an `events_delta` folder inside the volume of the same
# name (bronze / silver / gold) under /Volumes/workspace/ecommerce.
bronze_delta_path, silver_delta_path, gold_delta_path = (
    "dbfs:/Volumes/workspace/ecommerce/bronze/events_delta",
    "dbfs:/Volumes/workspace/ecommerce/silver/events_delta",
    "dbfs:/Volumes/workspace/ecommerce/gold/events_delta",
)

In [0]:
# List the volumes registered under the workspace.ecommerce schema and render
# the result as a table.
volumes_listing = spark.sql("SHOW VOLUMES IN workspace.ecommerce")
display(volumes_listing)

database,volume_name
ecommerce,bronze
ecommerce,delta
ecommerce,ecommerce_data
ecommerce,gold
ecommerce,silver


In [0]:
# Unity Catalog coordinates of the volumes used by the pipeline layers
catalog = "workspace"
schema = "ecommerce"
volumes = ["bronze", "silver", "gold"]

# Try to create each volume in turn. An "already exists" error is treated as
# benign; any other failure is printed and the loop moves on to the next volume.
for volume in volumes:
    try:
        spark.sql(f"CREATE VOLUME {catalog}.{schema}.{volume}")
        print(f"✅ Volume '{catalog}.{schema}.{volume}' created successfully.")
    except Exception as e:
        error_msg = str(e)
        # The kind of failure is recognised by a marker in the error text
        if "VOLUME_ALREADY_EXISTS" not in error_msg:
            print(f"❌ Failed to create volume '{catalog}.{schema}.{volume}':")
            print(error_msg)
            continue
        print(f"⚠️ Volume '{catalog}.{schema}.{volume}' already exists. Using existing one.")


⚠️ Volume 'workspace.ecommerce.bronze' already exists. Using existing one.
⚠️ Volume 'workspace.ecommerce.silver' already exists. Using existing one.
⚠️ Volume 'workspace.ecommerce.gold' already exists. Using existing one.


In [0]:
# Stage the raw November 2019 CSV inside the bronze volume.
raw_path = "dbfs:/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv"  # source file in the ecommerce_data volume
bronze_path = "dbfs:/Volumes/workspace/ecommerce/bronze/events/2019-Nov.csv"  # destination file in the bronze volume
bronze_events_dir = "dbfs:/Volumes/workspace/ecommerce/bronze/events"  # folder that receives the copy

dbutils.fs.mkdirs(bronze_events_dir)  # make sure the destination folder is there before copying
dbutils.fs.cp(raw_path, bronze_path)  # raw → bronze

# List the folder to confirm the file arrived
bronze_listing = dbutils.fs.ls(bronze_events_dir)
print(bronze_listing)

[FileInfo(path='dbfs:/Volumes/workspace/ecommerce/bronze/events/2019-Nov.csv', name='2019-Nov.csv', size=9006762395, modificationTime=1769108629000)]


In [0]:
# Bronze layer: load the staged CSV file and persist it as a Delta table.
# The source is a plain CSV file, so it has to be read with the CSV reader;
# the Delta reader expects a Delta table directory (one with a `_delta_log`),
# not a single .csv file, and `header` is a CSV reader option.
# No schema is supplied or inferred, so the columns are loaded as strings.
bronze_delta_df = (
    spark.read.format("csv")
    .option("header", "true")  # first line of the file holds the column names
    .load("/Volumes/workspace/ecommerce/bronze/events/2019-Nov.csv")
)
# Replace any previous contents of the Bronze Delta table with this load
bronze_delta_df.write.format("delta").mode("overwrite").save(bronze_delta_path)

In [0]:
from pyspark.sql.functions import col, to_timestamp, regexp_replace  # Import necessary libraries

# Silver layer: start from the Bronze Delta table
bronze_events = spark.read.format("delta").load(bronze_delta_path)

# Type conversion: event_time → timestamp, price → double
typed_events = (
    bronze_events
    .withColumn("event_time", to_timestamp(col("event_time")))
    .withColumn("price", col("price").cast("double"))
)

# Keep only rows that have a price after the cast, then drop exact duplicate rows
silver_delta_df = typed_events.filter(col("price").isNotNull()).dropDuplicates()

# Replace any previous contents of the Silver Delta table
silver_delta_df.write.format("delta").mode("overwrite").save(silver_delta_path)

In [0]:
# NOTE(review): this import rebinds the name `sum` to the Spark aggregate, which
# shadows Python's builtin `sum` in the notebook namespace from here on.
from pyspark.sql.functions import sum  # Import necessary libraries

# Gold layer: start from the Silver Delta table
silver_events = spark.read.format("delta").load(silver_delta_path)

# Total of `price` per brand, exposed as `total_revenue`
revenue_by_brand = silver_events.groupBy("brand").agg(sum("price").alias("total_revenue"))

# Highest revenue first (`col` comes from the import in the Silver cell)
gold_delta_df = revenue_by_brand.orderBy(col("total_revenue").desc())

# Replace any previous contents of the Gold Delta table
gold_delta_df.write.format("delta").mode("overwrite").save(gold_delta_path)