In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, desc, row_number, sum as _sum, avg, count, year, month, current_date, date_sub
from delta.tables import DeltaTable
from pyspark.sql import Window

# Initialize Spark: reuse the active session when one exists, otherwise build
# a new one registered under the capstone application name.
spark = (
    SparkSession.builder
    .appName("Retail360Capstone")
    .getOrCreate()
)

# Base path for the generated sample input files. The directory is removed
# recursively and then recreated so every run starts from an empty folder.
base_path = "/tmp/retail360/"
dbutils.fs.rm(base_path, True)
dbutils.fs.mkdirs(base_path)

# Create sample files
#
# The four payloads are declared first and then written under base_path in a
# single pass, in the same order as they are declared.

# customers.csv — one row per customer.
customers_csv = (
    "customer_id,name,region,email\n"
    "1,Arjun Rao,North,arjun@example.com\n"
    "2,Sneha Patel,South,sneha@example.com\n"
    "3,Rahul Sharma,East,rahul@example.com\n"
    "4,Neha Iyer,West,neha@example.com\n"
)

# orders_day1.csv — initial batch of orders (order 1003 is still Pending).
orders_day1 = (
    "order_id,customer_id,product,quantity,price,status,order_date\n"
    "1001,1,Laptop,2,55000,Completed,2024-01-15\n"
    "1002,2,Mobile,3,25000,Completed,2024-01-16\n"
    "1003,3,Book,10,700,Pending,2024-01-16\n"
    "1004,1,Headphones,5,3000,Completed,2024-01-17\n"
)

# orders_day2.csv — two new orders plus order 1003, now marked Completed.
orders_day2 = (
    "order_id,customer_id,product,quantity,price,status,order_date\n"
    "1005,4,Mobile,1,25000,Completed,2024-01-18\n"
    "1006,2,Book,2,700,Completed,2024-01-19\n"
    "1003,3,Book,10,700,Completed,2024-01-16\n"
)

# products.json — newline-delimited JSON, one product per line.
products_json = (
    '{"product_id":"P001","product_name":"Laptop","category":"Electronics","product":"Laptop"}\n'
    '{"product_id":"P002","product_name":"Mobile","category":"Electronics","product":"Mobile"}\n'
    '{"product_id":"P003","product_name":"Book","category":"Stationery","product":"Book"}\n'
    '{"product_id":"P004","product_name":"Headphones","category":"Accessories","product":"Headphones"}\n'
)

# Write every payload, replacing any file that already exists at that path.
for file_name, payload in (
    ("customers.csv", customers_csv),
    ("orders_day1.csv", orders_day1),
    ("orders_day2.csv", orders_day2),
    ("products.json", products_json),
):
    dbutils.fs.put(base_path + file_name, payload, overwrite=True)

# Bronze Layer
#
# Load the raw sample files without any transformation and persist each one
# as a Delta table, overwriting whatever a previous run left behind.

csv_reader = spark.read.options(header=True, inferSchema=True)
customers_raw = csv_reader.csv(base_path + "customers.csv")
orders_raw = csv_reader.csv(base_path + "orders_day1.csv")
products_raw = spark.read.json(base_path + "products.json")

for raw_df, bronze_path in (
    (customers_raw, "/tmp/bronze_customers"),
    (orders_raw, "/tmp/bronze_orders"),
    (products_raw, "/tmp/bronze_products"),
):
    raw_df.write.format("delta").mode("overwrite").save(bronze_path)


# Silver Layer — Cleanse & Transform

# Rename "region" so the column is self-describing once joined onto orders.
customers_df = customers_raw.withColumnRenamed("region", "customer_region")

# Keep completed orders only, pin the column types, and derive the order total.
orders_df = (
    orders_raw
    .filter(col("status") == "Completed")
    .withColumn("quantity", col("quantity").cast("int"))
    .withColumn("price", col("price").cast("double"))
    .withColumn("order_date", col("order_date").cast("date"))
    .withColumn("total_amount", expr("quantity * price"))
)

# Only the product name (join key) and its category are needed from products.
product_lookup = products_raw.select("product_name", "category")

# Inner join to customers on customer_id, then a left join to products so an
# order whose product has no catalogue entry is kept with a null category.
orders_with_customers = orders_df.alias("o").join(customers_df.alias("c"), "customer_id")
orders_enriched = orders_with_customers.join(
    product_lookup.alias("p"),
    col("o.product") == col("p.product_name"),
    "left",
)

silver_orders = orders_enriched.select(
    col("order_id"),
    col("customer_id"),
    col("c.name").alias("customer_name"),
    col("customer_region"),
    col("o.product"),
    col("p.category"),
    col("quantity"),
    col("price"),
    col("total_amount"),
    col("status"),
    col("order_date"),
    col("c.email"),
)

silver_orders.write.format("delta").mode("overwrite").save("/tmp/silver_orders")


# Gold Layer — Aggregations

# Total revenue per customer region.
revenue_by_region = (
    silver_orders
    .groupBy("customer_region")
    .agg(_sum("total_amount").alias("total_revenue"))
)
revenue_by_region.write.format("delta").mode("overwrite").save("/tmp/gold_revenue_by_region")

# Revenue and units sold per product, then ranked within each category by
# descending revenue (rank 1 = highest revenue in the category).
product_sales = (
    silver_orders
    .groupBy("product", "category")
    .agg(
        _sum("total_amount").alias("revenue"),
        _sum("quantity").alias("units_sold"),
    )
)
w = Window.partitionBy("category").orderBy(col("revenue").desc())
product_sales_ranked = product_sales.withColumn("rank_by_revenue", row_number().over(w))
product_sales_ranked.write.format("delta").mode("overwrite").save("/tmp/gold_product_sales")


# Incremental Load — MERGE day2
#
# Reads the day-2 orders file, applies the same cleansing and enrichment as
# the initial silver load, and upserts the result into the silver Delta table
# keyed on order_id.
#
# NOTE: the gold tables are written before this merge and are not refreshed
# here, so they do not reflect the day-2 rows.

new_orders_df = (
    spark.read.csv(base_path + "orders_day2.csv", header=True, inferSchema=True)
    # Same rule as the initial silver load: only completed orders belong in
    # silver. Without this filter a Pending/other row in a later file would be
    # inserted into the table.
    .filter(col("status") == "Completed")
    .withColumn("quantity", col("quantity").cast("int"))
    .withColumn("price", col("price").cast("double"))
    .withColumn("order_date", col("order_date").cast("date"))
    .withColumn("total_amount", expr("quantity * price"))
)

silver_dt = DeltaTable.forPath(spark, "/tmp/silver_orders")

# Build the staging set with exactly the silver column layout. The raw orders
# are aliased "n" so the name does not collide with the "s" alias given to the
# finished staging frame in the merge below.
staging = (
    new_orders_df.alias("n")
    .join(customers_df.alias("c"), "customer_id")
    .join(
        products_raw.select("product_name", "category").alias("p"),
        col("n.product") == col("p.product_name"),
        "left",
    )
    .select(
        col("order_id"),
        col("customer_id"),
        col("c.name").alias("customer_name"),
        col("customer_region"),
        col("n.product"),
        col("p.category"),
        col("quantity"),
        col("price"),
        col("total_amount"),
        col("status"),
        col("order_date"),
        col("c.email"),
    )
)

# Upsert: rows whose order_id already exists in silver are overwritten, the
# rest are inserted. Delta rejects a merge in which several source rows match
# the same target row, so the staged file is expected to hold at most one row
# per order_id.
(silver_dt.alias("t")
 .merge(staging.alias("s"), "t.order_id = s.order_id")
 .whenMatchedUpdateAll()
 .whenNotMatchedInsertAll()
 .execute())


# Delta Lake Features: Time Travel, Optimize, VACUUM

silver_table_path = "/tmp/silver_orders"

# Time travel: read the silver table as it was at version 0.
versioned_reader = spark.read.format("delta").option("versionAsOf", 0)
silver_v0 = versioned_reader.load(silver_table_path)
print("Silver Orders — Version 0")
silver_v0.show()

# VACUUM with a retention value of 168 hours (7 days).
retention_hours = 168
silver_dt.vacuum(retention_hours)

# Compact the table and Z-order it by customer_id. The value returned by the
# call is neither stored nor displayed.
optimizer = silver_dt.optimize()
optimizer.executeZOrderBy("customer_id")

# Re-read the current table state and display it.
silver_orders_post_opt = spark.read.format("delta").load(silver_table_path)
print("Silver Orders — After Optimization & Z-Ordering")
silver_orders_post_opt.show()


Wrote 176 bytes.
Wrote 232 bytes.
Wrote 184 bytes.
Wrote 363 bytes.
Silver Orders — Version 0
+--------+-----------+-------------+---------------+----------+-----------+--------+-------+------------+---------+----------+-----------------+
|order_id|customer_id|customer_name|customer_region|   product|   category|quantity|  price|total_amount|   status|order_date|            email|
+--------+-----------+-------------+---------------+----------+-----------+--------+-------+------------+---------+----------+-----------------+
|    1001|          1|    Arjun Rao|          North|    Laptop|Electronics|       2|55000.0|    110000.0|Completed|2024-01-15|arjun@example.com|
|    1002|          2|  Sneha Patel|          South|    Mobile|Electronics|       3|25000.0|     75000.0|Completed|2024-01-16|sneha@example.com|
|    1004|          1|    Arjun Rao|          North|Headphones|Accessories|       5| 3000.0|     15000.0|Completed|2024-01-17|arjun@example.com|
+--------+-----------+-------------+