In [0]:
from pyspark.sql import SparkSession
# Build (or reuse) the session, registering the Delta SQL extension and the Delta catalog.
spark = (
    SparkSession.builder
    .appName("EcommerceAnalytics")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

PySpark + Delta

1. Ingest all 3 CSVs as Delta Tables.

In [0]:
# Load the three source CSVs, treating the first row of each file as the header.
# No schema or inferSchema option is set, so the column types are whatever the
# CSV reader assigns by default.
orders_df = (
    spark.read
    .option("header", True)
    .csv("file:/Workspace/Shared/orders.csv")
)
customers_df = (
    spark.read
    .option("header", True)
    .csv("file:/Workspace/Shared/customers.csv")
)
products_df = (
    spark.read
    .option("header", True)
    .csv("file:/Workspace/Shared/products.csv")
)

# Persist each DataFrame as a Delta table at its own path, replacing any existing data.
for source_df, delta_path in (
    (orders_df, "/delta/orders"),
    (customers_df, "/delta/customers"),
    (products_df, "/delta/products"),
):
    source_df.write.format("delta").mode("overwrite").save(delta_path)

2. Write SQL to get the total revenue per Product.

In [0]:
%sql
-- Register the Delta data at /delta/orders as table "orders" unless it already exists.
CREATE TABLE IF NOT EXISTS orders
USING DELTA
LOCATION '/delta/orders';

-- Revenue per product: Quantity * Price summed over orders whose Status is not 'Returned'.
SELECT
  ProductID,
  SUM(Quantity * Price) AS TotalRevenue
FROM orders
WHERE Status <> 'Returned'
GROUP BY ProductID;


ProductID,TotalRevenue
P1001,75000.0
P1002,50000.0
P1004,30000.0
P1003,30000.0


3. Join Orders + Customers to find revenue by Region.

In [0]:
%sql
-- Register the Delta data at /delta/customers as table "customers" unless it already exists.
CREATE TABLE IF NOT EXISTS customers
USING DELTA
LOCATION '/delta/customers';

-- Revenue per customer region: orders are matched to customers on CustomerID,
-- and orders whose Status is 'Returned' are left out.
SELECT
  cust.Region,
  SUM(ord.Quantity * ord.Price) AS Revenue
FROM orders AS ord
INNER JOIN customers AS cust
  ON ord.CustomerID = cust.CustomerID
WHERE ord.Status <> 'Returned'
GROUP BY cust.Region;


Region,Revenue
East,30000.0
West,30000.0
North,125000.0


4. Update the Status of Pending orders to 'Cancelled'.

In [0]:
from delta.tables import DeltaTable

# Handle on the Delta table stored at /delta/orders.
orders_delta = DeltaTable.forPath(spark, "/delta/orders")

# Both values are SQL expressions, hence the inner single quotes around the literals.
pending_predicate = "Status = 'Pending'"
status_assignment = {"Status": "'Cancelled'"}

# Rewrite Status to 'Cancelled' on every row that is currently 'Pending'.
orders_delta.update(condition=pending_predicate, set=status_assignment)

5. Merge a new return record into Orders.

In [0]:
# One-row DataFrame describing the returned order to merge in.
return_columns = ["OrderID", "CustomerID", "ProductID", "Quantity", "Price", "OrderDate", "Status"]
return_rows = [
    ("3006", "C003", "P1001", 1, 75000, "2024-06-10", "Returned"),
]
new_return = spark.createDataFrame(return_rows, return_columns)

# Match on OrderID; only a not-matched clause is defined, so the row is inserted
# when no order with that OrderID exists yet.
merge_builder = orders_delta.alias("target").merge(
    new_return.alias("source"),
    "target.OrderID = source.OrderID",
)
merge_builder.whenNotMatchedInsertAll().execute()

DLT Pipeline

6. Create raw → cleaned → aggregated tables

In [0]:
import dlt
from pyspark.sql.functions import col

@dlt.table(name="raw_orders")
def raw_orders():
    """Raw layer: the orders Delta table at /delta/orders, loaded as-is."""
    delta_reader = spark.read.format("delta")
    return delta_reader.load("/delta/orders")

@dlt.table(name="cleaned_orders")
def cleaned_orders():
    """Cleaned layer: raw_orders with rows containing any null value dropped."""
    raw = dlt.read("raw_orders")
    return raw.dropna()

@dlt.table(name="aggregated_revenue")
def aggregated_revenue():
    """Aggregated layer: total revenue per product from cleaned_orders.

    Returns:
        DataFrame with one row per ProductID and a TotalRevenue column holding
        the sum of Quantity * Price over that product's orders.
    """
    # Imported locally so this function does not depend on imports made in other cells.
    from pyspark.sql import functions as F

    orders = dlt.read("cleaned_orders")

    # Revenue of an order is Quantity * Price. Summing Price alone under-counts
    # every order with Quantity > 1 and disagrees with the SUM(Quantity * Price)
    # used by the SQL revenue queries in this notebook.
    order_revenue = F.col("Quantity") * F.col("Price")

    # NOTE(review): the SQL revenue queries also exclude Status = 'Returned';
    # confirm whether this table should apply the same filter.
    return orders.groupBy("ProductID").agg(F.sum(order_revenue).alias("TotalRevenue"))


Name,Type
ProductID,string
TotalRevenue,double


Time Travel

7. View data before the Status update.

In [0]:
# Time travel: read the orders table as it was at version 0.
df_old = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load("/delta/orders")
)
df_old.show()

+-------+----------+---------+--------+-----+----------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|
+-------+----------+---------+--------+-----+----------+---------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|
|   3005|      C004|    P1004|       3|10000|2024-05-05|  Pending|
+-------+----------+---------+--------+-----+----------+---------+



8. Restore to an older version of the orders table.

In [0]:
# Roll the table back with Delta's native restore instead of overwriting it from
# a DataFrame read out of the same path. The restore is recorded as its own
# commit in the table history, and it does not rely on a DataFrame defined in
# another cell or on overwriteSchema.
DeltaTable.forPath(spark, "/delta/orders").restoreToVersion(0)

Vacuum + Retention

9. Run VACUUM after changing default retention.

In [0]:
RETENTION_CHECK_KEY = "spark.databricks.delta.retentionDurationCheck.enabled"

# Remember the current value so the safety check is only relaxed for this one
# VACUUM and not for the rest of the session.
previous_retention_check = spark.conf.get(RETENTION_CHECK_KEY, "true")
spark.conf.set(RETENTION_CHECK_KEY, "false")
try:
    # A retention of 0 hours is below the limit the check normally enforces,
    # which is why the check has to be disabled first.
    # NOTE(review): per the Delta VACUUM docs this removes files needed for time
    # travel to earlier versions - confirm that is intended for this table.
    orders_delta.vacuum(retentionHours=0)
finally:
    # Put the check back even if VACUUM raised.
    spark.conf.set(RETENTION_CHECK_KEY, previous_retention_check)

DataFrame[]

Expectations

10. Add expectations: Quantity > 0, Price > 0, and OrderDate is not null.

In [0]:
@dlt.expect_all({
    "valid_quantity": "Quantity > 0",
    "valid_price": "Price > 0",
    "valid_orderdate": "OrderDate IS NOT NULL",
})
@dlt.table(name="validated_orders")
def validated_orders():
    """raw_orders with three data-quality expectations attached.

    The expectations check Quantity > 0, Price > 0 and a non-null OrderDate.
    """
    source_orders = dlt.read("raw_orders")
    return source_orders

Name,Type
OrderID,string
CustomerID,string
ProductID,string
Quantity,string
Price,string
OrderDate,string
Status,string


Bonus

11. Use when-otherwise to create a new column: OrderType = "Return" if Status == 'Returned', otherwise "Normal".

In [0]:
# `col` is imported here together with `when`; otherwise this cell only works
# if another cell that imports `col` has already been executed.
from pyspark.sql.functions import col, when

# Label each order: "Return" when Status is 'Returned', "Normal" for every other row.
# orders_df is the DataFrame read from orders.csv, not the Delta table.
orders_df = orders_df.withColumn(
    "OrderType",
    when(col("Status") == "Returned", "Return").otherwise("Normal"),
)

orders_df.show()

+-------+----------+---------+--------+-----+----------+---------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|OrderType|
+-------+----------+---------+--------+-----+----------+---------+---------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|   Normal|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|   Return|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|   Normal|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|   Normal|
|   3005|      C004|    P1004|       3|10000|2024-05-05|  Pending|   Normal|
+-------+----------+---------+--------+-----+----------+---------+---------+

