In [0]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook; getOrCreate() returns the
# already-active session when there is one instead of building a second.
spark = (
    SparkSession.builder
    .appName('DeltaExample')
    .getOrCreate()
)

# Seed rows for the employee table. Tuple positions line up with `cols`.
cols = ["emp_id", "name", "salary"]
data = [
    (1, "Rahul", 50000),
    (2, "Priya", 60000),
    (3, "Aman", 55000),
]

# Only column names are supplied, so Spark infers the column types from
# the Python values.
df = spark.createDataFrame(data, cols)

# Persist as a Delta table at a fixed path. "overwrite" replaces the current
# contents, so re-running this cell resets the table to these three rows.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("/FileStore/delta/employee")
)

In [0]:
# Read the employee table back from its Delta path and render it, as a
# check that the write above produced the expected rows.
df1 = (
    spark.read
    .format("delta")
    .load("/FileStore/delta/employee")
)
display(df1)

emp_id,name,salary
1,Rahul,50000
2,Priya,60000
3,Aman,55000


In [0]:
from delta.tables import DeltaTable

# Handle to the Delta table stored at the employee path.
delta_emp = DeltaTable.forPath(spark, "/FileStore/delta/employee")

# Incoming changes, using the same column list (`cols`) as the initial load:
# emp_id 1 carries a new salary, emp_id 4 is a new employee.
updates = [
    (1, "Rahul", 52000),
    (4, "Sneha", 58000),
]
update_df = spark.createDataFrame(updates, cols)

# Upsert keyed on emp_id ("t" = target table, "u" = incoming updates):
#   - matched rows have only their salary overwritten (name is left as-is);
#   - unmatched incoming rows are inserted with all three columns.
(
    delta_emp.alias("t")
    .merge(update_df.alias("u"), "t.emp_id = u.emp_id")
    .whenMatchedUpdate(set={"salary": "u.salary"})
    .whenNotMatchedInsert(
        values={"emp_id": "u.emp_id", "name": "u.name", "salary": "u.salary"}
    )
    .execute()
)

# Print the table's commit history (one row per version).
delta_emp.history().show()

# Time travel: read the table as it was at version 0, its first commit.
spark.read.format("delta").option("versionAsOf", 0).load("/FileStore/delta/employee").show()

+-------+-------------------+---------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|          timestamp|         userId|            userName|operation| operationParameters| job|          notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+-------------------+---------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      1|2025-10-08 07:19:12|141077767851255|azuser4798_mml.lo...|    MERGE|{predicate -> ["(...|NULL|{1650090761100692}|1008-065153-zier6...|          0|WriteSerializable|        false|{numTargetRowsCop...|        NULL|Databricks-Runtim...|
|      0|2025-10-08 06:52:05|141

In [0]:
# Table maintenance for the employee Delta table, addressed by path.
# OPTIMIZE rewrites the table's data files into fewer, larger files.
spark.sql("OPTIMIZE delta.`/FileStore/delta/employee`")
# VACUUM deletes files the table no longer references once they are older
# than the retention window; 168 hours = 7 days.
# NOTE(review): versions whose files have been vacuumed can no longer be read
# with versionAsOf — confirm the retention window fits your time-travel needs.
# Kept as the cell's last bare expression so its result is echoed as output.
spark.sql("VACUUM delta.`/FileStore/delta/employee` RETAIN 168 HOURS")

DataFrame[path: string]

In [0]:
from pyspark.sql import Row
from pyspark.sql import functions as F
# `sum` is deliberately NOT imported by name from pyspark.sql.functions:
# doing so rebinds Python's builtin `sum` for every later cell in the notebook.
# The aggregate is called as F.sum instead.
from pyspark.sql.functions import col

# 🟤 Bronze - Raw Data
# Orders exactly as received: every status, no derived columns.
data = [
    Row(order_id=1, product="Laptop", region="North", quantity=2, price=55000, status="Completed"),
    Row(order_id=2, product="Mobile", region="South", quantity=3, price=25000, status="Completed"),
    Row(order_id=3, product="Book", region="North", quantity=10, price=700, status="Pending"),
    Row(order_id=4, product="Headphones", region="East", quantity=5, price=3000, status="Completed")
]
bronze_df = spark.createDataFrame(data)
bronze_df.show()

# 🟢 Silver - Filter + Derived Column
# Keep only completed orders and add total_amount = quantity * price.
silver_df = bronze_df.filter(col("status") == "Completed") \
                     .withColumn("total_amount", col("quantity") * col("price"))
silver_df.show()

# 🟡 Gold - Aggregate
# Total completed-order sales per region.
gold_df = silver_df.groupBy("region") \
                   .agg(F.sum("total_amount").alias("total_sales"))
gold_df.show()


+--------+----------+------+--------+-----+---------+
|order_id|   product|region|quantity|price|   status|
+--------+----------+------+--------+-----+---------+
|       1|    Laptop| North|       2|55000|Completed|
|       2|    Mobile| South|       3|25000|Completed|
|       3|      Book| North|      10|  700|  Pending|
|       4|Headphones|  East|       5| 3000|Completed|
+--------+----------+------+--------+-----+---------+

+--------+----------+------+--------+-----+---------+------------+
|order_id|   product|region|quantity|price|   status|total_amount|
+--------+----------+------+--------+-----+---------+------------+
|       1|    Laptop| North|       2|55000|Completed|      110000|
|       2|    Mobile| South|       3|25000|Completed|       75000|
|       4|Headphones|  East|       5| 3000|Completed|       15000|
+--------+----------+------+--------+-----+---------+------------+

+------+-----------+
|region|total_sales|
+------+-----------+
| North|     110000|
| South|     