In [0]:
# Mount the `image` blob container of the storage account at /mnt/image.
#
# The storage-account key is fetched from a Databricks secret scope at run
# time rather than being pasted into the notebook, where it leaks through
# exports, revision history and version control.
# NOTE(review): the key that used to be hardcoded in this cell must be treated
# as compromised and rotated in Azure.
# NOTE(review): the scope/key names below are placeholders; point them at the
# secret scope that actually stores this account's key.
STORAGE_ACCOUNT = "task1june17"
CONTAINER = "image"
MOUNT_POINT = "/mnt/image"
SECRET_SCOPE = "storage-secrets"
SECRET_KEY = "task1june17-account-key"

# dbutils.fs.mount raises when the mount point is already in use, so only
# mount when it is absent; this keeps the cell safe to re-run.
is_mounted = any(m.mountPoint == MOUNT_POINT for m in dbutils.fs.mounts())
if not is_mounted:
    dbutils.fs.mount(
        source=f"wasbs://{CONTAINER}@{STORAGE_ACCOUNT}.blob.core.windows.net",
        mount_point=MOUNT_POINT,
        extra_configs={
            f"fs.azure.account.key.{STORAGE_ACCOUNT}.blob.core.windows.net":
                dbutils.secrets.get(scope=SECRET_SCOPE, key=SECRET_KEY)
        },
    )


True

In [0]:
from pyspark.sql.functions import col

# Load the three source CSVs from the blob mount; the header row supplies the
# column names, so every column arrives as a string.
csv_reader = spark.read.option("header", True)
orders_df = csv_reader.csv("/mnt/image/orders.csv")
customers_df = csv_reader.csv("/mnt/image/customers.csv")
products_df = csv_reader.csv("/mnt/image/products.csv")

# Give the numeric and date columns of `orders` their real types.
for column_name, target_type in (
    ("Quantity", "int"),
    ("Price", "double"),
    ("OrderDate", "date"),
):
    orders_df = orders_df.withColumn(column_name, col(column_name).cast(target_type))

# Persist each frame as a Delta table, replacing whatever is already there.
for frame, delta_path in (
    (orders_df, "/mnt/data/delta/orders"),
    (customers_df, "/mnt/data/delta/customers"),
    (products_df, "/mnt/data/delta/products"),
):
    frame.write.format("delta").mode("overwrite").save(delta_path)


In [0]:
# Expose the orders Delta table to SQL as a session-scoped temp view.
spark.sql("CREATE OR REPLACE TEMP VIEW orders AS SELECT * FROM delta.`/mnt/data/delta/orders`")

# Revenue per product, counting Delivered orders only.
delivered_revenue_by_product = spark.sql("""
SELECT ProductID, SUM(Quantity * Price) AS TotalRevenue
FROM orders
WHERE Status = 'Delivered'
GROUP BY ProductID
""")
delivered_revenue_by_product.show()


+---------+------------+
|ProductID|TotalRevenue|
+---------+------------+
|    P1001|     75000.0|
|    P1002|     50000.0|
|    P1003|     30000.0|
+---------+------------+



In [0]:
# Expose the customers Delta table to SQL as a session-scoped temp view.
spark.sql("CREATE OR REPLACE TEMP VIEW customers AS SELECT * FROM delta.`/mnt/data/delta/customers`")

# Revenue per customer region, counting Delivered orders only. The inner join
# drops any order whose CustomerID has no row in `customers`.
delivered_revenue_by_region = spark.sql("""
SELECT c.Region, SUM(o.Quantity * o.Price) AS RegionRevenue
FROM orders o
JOIN customers c ON o.CustomerID = c.CustomerID
WHERE o.Status = 'Delivered'
GROUP BY c.Region
""")
delivered_revenue_by_region.show()


+------+-------------+
|Region|RegionRevenue|
+------+-------------+
|  West|      30000.0|
| North|     125000.0|
+------+-------------+



In [0]:
from delta.tables import DeltaTable

# Handle on the orders Delta table; the merge cell further down reuses it.
orders_delta = DeltaTable.forPath(spark, "/mnt/data/delta/orders")

# Flip every Pending order to Cancelled in place. Values in the `set` mapping
# are SQL expressions, which is why the new status carries its own quotes.
pending_filter = "Status = 'Pending'"
status_change = {"Status": "'Cancelled'"}
orders_delta.update(condition=pending_filter, set=status_change)


In [0]:
# One new Returned order, built in memory and typed like the `orders` casts
# applied when the table was first written.
return_columns = ["OrderID", "CustomerID", "ProductID", "Quantity", "Price", "OrderDate", "Status"]
return_rows = [("3006", "C003", "P1003", 1, 30000.0, "2024-05-06", "Returned")]

new_return = spark.createDataFrame(return_rows, return_columns)
for column_name, target_type in (
    ("Quantity", "int"),
    ("Price", "double"),
    ("OrderDate", "date"),
):
    new_return = new_return.withColumn(column_name, col(column_name).cast(target_type))

# Insert-only merge keyed on OrderID: rows whose OrderID already exists in the
# table are left untouched, unseen OrderIDs are appended.
merge_builder = orders_delta.alias("target").merge(
    new_return.alias("source"),
    "target.OrderID = source.OrderID",
)
merge_builder.whenNotMatchedInsertAll().execute()


In [0]:
import dlt
from pyspark.sql.functions import col, sum as spark_sum
@dlt.table
def raw_orders():
    # Bronze step: orders.csv exactly as stored on the blob mount, with column
    # names taken from the header row.
    csv_reader = spark.read.option("header", True)
    return csv_reader.csv("/mnt/image/orders.csv")

@dlt.table
def cleaned_orders():
    # Keep only the rows of raw_orders that have no null in any column.
    raw = dlt.read("raw_orders")
    return raw.dropna()

@dlt.table
def aggregated_revenue_by_category():
    # Total revenue (Quantity * Price) per product Category.

    # cleaned_orders comes from a header-only CSV read, so the numeric columns
    # are cast before any arithmetic.
    orders_typed = (
        dlt.read("cleaned_orders")
        .withColumn("Quantity", col("Quantity").cast("int"))
        .withColumn("Price", col("Price").cast("double"))
    )
    orders_with_revenue = orders_typed.withColumn("Revenue", col("Quantity") * col("Price"))

    # Category is looked up from the product catalogue via ProductID.
    product_catalog = spark.read.option("header", True).csv("/mnt/image/products.csv")
    per_category = orders_with_revenue.join(product_catalog, "ProductID").groupBy("Category")

    return per_category.agg(spark_sum("Revenue").alias("TotalRevenue"))



Name,Type
Category,string
TotalRevenue,double


In [0]:
# Time travel: load the orders table as it stood at version 0, i.e. before the
# update and merge performed in the cells above.
version_zero_reader = spark.read.format("delta").option("versionAsOf", 0)
df_v0 = version_zero_reader.load("/mnt/data/delta/orders")
df_v0.display()


OrderID,CustomerID,ProductID,Quantity,Price,OrderDate,Status
3001,C001,P1001,1,75000.0,2024-05-01,Delivered
3002,C002,P1002,2,50000.0,2024-05-02,Returned
3003,C003,P1003,1,30000.0,2024-05-03,Delivered
3004,C001,P1002,1,50000.0,2024-05-04,Delivered
3005,C004,P1004,3,10000.0,2024-05-05,Pending


In [0]:
# Roll the orders table back to version 0.
#
# This uses Delta's built-in restore instead of reading version 0 and
# overwriting the same path: restore is recorded as a RESTORE commit in the
# table history, does not re-read and rewrite every row, and also brings back
# the version-0 schema (a plain overwrite fails if the schema has changed,
# because overwriteSchema is not set).
from delta.tables import DeltaTable  # local import so this cell runs on its own

# The metrics frame is assigned so the cell stays output-free, as before.
restore_metrics = DeltaTable.forPath(spark, "/mnt/data/delta/orders").restoreToVersion(0)


In [0]:
# Physically delete every data file the current table version no longer
# references.
# WARNING: RETAIN 0 HOURS removes the files behind all earlier versions, so
# time travel to them stops working, and it is unsafe while other jobs are
# reading or writing the table.
RETENTION_CHECK_KEY = "spark.databricks.delta.retentionDurationCheck.enabled"

# Delta rejects retention windows shorter than the 7-day default unless this
# check is off. Disable it only for this one VACUUM and put the previous value
# back afterwards, so later cells in the session keep the safety net.
previous_check_setting = spark.conf.get(RETENTION_CHECK_KEY, "true")
spark.conf.set(RETENTION_CHECK_KEY, "false")
try:
    vacuum_result = spark.sql("VACUUM delta.`/mnt/data/delta/orders` RETAIN 0 HOURS")
finally:
    spark.conf.set(RETENTION_CHECK_KEY, previous_check_setting)

# Last expression, so the cell still shows the VACUUM result frame.
vacuum_result


DataFrame[path: string]

In [0]:
from pyspark.sql.functions import col

# Current state of the orders table; the next cell builds on `orders`.
orders = spark.read.format("delta").load("/mnt/data/delta/orders")

# Validity rules: a positive quantity, a positive price and a non-null order date.
has_positive_quantity = col("Quantity") > 0
has_positive_price = col("Price") > 0
has_order_date = col("OrderDate").isNotNull()

# Only the displayed frame is filtered; `orders` itself stays unfiltered.
orders.filter(has_positive_quantity & has_positive_price & has_order_date).display()


OrderID,CustomerID,ProductID,Quantity,Price,OrderDate,Status
3001,C001,P1001,1,75000.0,2024-05-01,Delivered
3002,C002,P1002,2,50000.0,2024-05-02,Returned
3003,C003,P1003,1,30000.0,2024-05-03,Delivered
3004,C001,P1002,1,50000.0,2024-05-04,Delivered
3005,C004,P1004,3,10000.0,2024-05-05,Pending


In [0]:
from pyspark.sql.functions import when

# Label each order: "Return" when Status is Returned, "Normal" for everything else.
order_type = when(col("Status") == "Returned", "Return").otherwise("Normal")
orders = orders.withColumn("OrderType", order_type)

# Show the derived label next to the status it was computed from.
orders.select("OrderID", "Status", "OrderType").display()


OrderID,Status,OrderType
3001,Delivered,Normal
3002,Returned,Return
3003,Delivered,Normal
3004,Delivered,Normal
3005,Pending,Normal
