### 🛠️ Day 5 Tasks:

1. Implement incremental MERGE
2. Query historical versions
3. Optimize tables
4. Clean old files

## Task 1: Implement Incremental MERGE

In [0]:
# Load the November 2019 event export.
# The first CSV row supplies the column names; column types are inferred from the data.
Nov_events = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)


In [0]:
# Register the November DataFrame as the temp view "incremental_events"
# (created or replaced) so it can be referenced by name from SQL.
Nov_events.createOrReplaceTempView("incremental_events")


In [0]:
%sql
-- Incrementally load the November events into the existing Delta table.
-- A source row counts as already loaded when event_time, user_id and product_id
-- all match a target row.
--
-- Insert-only on purpose: event rows are treated as immutable here, and this key
-- is not guaranteed to be unique. An unconditional WHEN MATCHED ... UPDATE would
-- abort a re-run as soon as two source rows match the same target row, and would
-- otherwise rewrite rows that have not changed. With only the NOT MATCHED clause,
-- re-running this cell skips rows that are already present.
MERGE INTO workspace.ecommerce.oct_events_delta_sql AS target
USING incremental_events AS source
ON target.event_time = source.event_time
   AND target.user_id = source.user_id
   AND target.product_id = source.product_id

WHEN NOT MATCHED THEN
  INSERT *


num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
67501979,0,0,67501979


Validate Incremental Load

In [0]:
%sql
-- Total row count of the Delta table, used to validate the incremental load.
SELECT COUNT(*) AS total_rows
FROM workspace.ecommerce.oct_events_delta_sql;


total_rows
109950743


## Task 2: Query Historical Versions (Delta Time Travel)

View Delta Table History

In [0]:
%sql
-- Show the table's commit history: one row per version with its timestamp,
-- operation, parameters and operation metrics.
DESCRIBE HISTORY workspace.ecommerce.oct_events_delta_sql;


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
2,2026-01-15T10:34:06.000Z,78119440703336,karthika2738@gmail.com,MERGE,"Map(predicate -> [""(((event_time#13201 = event_time#13181) AND (user_id#13208 = user_id#13188)) AND (product_id#13203 = product_id#13183))""], clusterBy -> [], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> true, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""actionType"":""insert""}])",,List(2538393327508024),0115-102806-588uh7nr-v2n,1.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 68, numTargetBytesAdded -> 1171886398, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 0, numTargetRowsMatchedUpdated -> 0, executionTimeMs -> 84187, materializeSourceTimeMs -> 45304, numTargetRowsInserted -> 67501979, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 18575, numTargetRowsUpdated -> 0, numOutputRows -> 67501979, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 67501979, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 20165)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
1,2026-01-15T03:45:13.000Z,78119440703336,karthika2738@gmail.com,MERGE,"Map(predicate -> [""(((event_time#13257 = event_time#13236) AND (user_id#13264 = user_id#13243)) AND (product_id#13259 = product_id#13238))""], clusterBy -> [], matchedPredicates -> [], statsOnLoad -> false, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""actionType"":""insert""}])",,List(3604489690824905),0115-033354-z5wk3ccl-v2n,0.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 0, numTargetBytesAdded -> 0, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 0, numTargetRowsMatchedUpdated -> 0, executionTimeMs -> 29904, materializeSourceTimeMs -> 12, numTargetRowsInserted -> 0, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 0, numTargetRowsUpdated -> 0, numOutputRows -> 0, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 42448764, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 29796)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
0,2026-01-14T16:25:39.000Z,78119440703336,karthika2738@gmail.com,CREATE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(3604489690824905),0114-162320-q4l93jah-v2n,,WriteSerializable,True,"Map(numFiles -> 12, numOutputRows -> 42448764, numOutputBytes -> 731915974)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13


Query a Specific Version (VERSION AS OF)

In [0]:
%sql
-- Row count as of version 0, the CREATE TABLE AS SELECT commit listed in the history.
SELECT COUNT(*)
FROM workspace.ecommerce.oct_events_delta_sql
VERSION AS OF 0;


COUNT(*)
42448764


Compare with Current Version

In [0]:
%sql
-- Same count with no version clause, i.e. against the current state of the table.
SELECT COUNT(*)
FROM workspace.ecommerce.oct_events_delta_sql;


COUNT(*)
109950743


Query by Timestamp (TIMESTAMP AS OF)

In [0]:
%sql
-- Read a sample of the table as it was at the given point in time.
-- NOTE(review): the timestamp literal carries no time zone, so it is presumably
-- interpreted in the session time zone; confirm it resolves to the intended version.
-- NOTE: LIMIT without ORDER BY does not guarantee which 10 rows are returned.
SELECT *
FROM workspace.ecommerce.oct_events_delta_sql
TIMESTAMP AS OF '2026-01-14 21:00:00'
LIMIT 10;


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-10-26T07:55:54.000Z,view,45300083,2106075662325383725,kids.swing,joie,197.43,538181220,70d25815-9efd-4535-a1e3-5d209006ca27
2019-10-26T07:55:54.000Z,view,3701134,2053013565983425517,appliances.environment.vacuum,bosch,90.07,525753452,d8ee9530-fc77-421c-be80-54bb03e0d1a2
2019-10-26T07:55:54.000Z,view,1307238,2053013558920217191,computers.notebook,lenovo,303.46,549408570,bc4a7104-1277-4c8b-8191-9e2189bfeeee
2019-10-26T07:55:54.000Z,view,3601537,2053013563810775923,appliances.kitchen.washer,samsung,386.08,512854725,923be535-e2ae-4b47-af2d-5f5dad09f0ae
2019-10-26T07:55:54.000Z,cart,16700267,2053013559901684381,furniture.kitchen.chair,,13.1,515802764,989e824d-3f3f-45a6-9025-4f0e32870e67
2019-10-26T07:55:55.000Z,view,17600213,2053013558895051365,,missha,17.37,523607154,82514b31-043a-4386-874a-235b7f40b50b
2019-10-26T07:55:55.000Z,view,15300008,2053013552662315243,,samsung,43.21,516470345,97532507-cf75-429e-8bbd-84c2fc32dab7
2019-10-26T07:55:55.000Z,view,1004858,2053013555631882655,electronics.smartphone,samsung,131.53,522850155,40738b0d-76ac-4e20-8020-045709479161
2019-10-26T07:55:55.000Z,view,1004434,2053013555631882655,electronics.smartphone,samsung,257.15,516035121,238a9215-7c7e-4101-a3f5-abafd893e585
2019-10-26T07:55:55.000Z,view,22700065,2053013556168753601,,aeroforce,145.43,514919831,71a13977-8ac6-43d5-b261-61293ac7192d


Validate Time Travel Works


In [0]:
%sql
-- Row count as of version 1, the first MERGE in the history (it inserted no rows).
SELECT COUNT(*)
FROM workspace.ecommerce.oct_events_delta_sql VERSION AS OF 1;


COUNT(*)
42448764


In [0]:
%sql
-- After the incremental load: row count as of version 2 (the November MERGE).
SELECT COUNT(*)
FROM workspace.ecommerce.oct_events_delta_sql VERSION AS OF 2;


COUNT(*)
109950743


## Task 3: Optimize Delta Tables

## Understand WHY OPTIMIZE is needed

Delta tables often end up with many small files due to:

Frequent appends

MERGE operations

Incremental pipelines

👉 Small files = slow queries

👉 OPTIMIZE = compacts files into larger ones

Run OPTIMIZE (Basic)

In [0]:
%sql
-- Compact the table's data files into fewer, larger files.
OPTIMIZE workspace.ecommerce.oct_events_delta_sql;


path,metrics
,"List(22, 68, List(51069941, 68137411, 5.439848204545455E7, 22, 1196766605), List(1764250, 19282631, 1.72336235E7, 68, 1171886398), 0, null, null, 0, 1, 80, 12, true, 0, 0, 1768474000980, 1768474013805, 8, 22, null, List(0, 0), null, 9, 9, 60667, 0, null)"


Optimize with Z-ORDER (Recommended ⭐)

Use Z-ORDER on columns that are:

Frequently filtered

Used in joins

In [0]:
%sql
-- Compact the table again, this time Z-ordering the data by user_id and event_time.
OPTIMIZE workspace.ecommerce.oct_events_delta_sql
ZORDER BY (user_id, event_time);


path,metrics
,"List(24, 34, List(64703736, 92493347, 7.644601516666667E7, 24, 1834704364), List(51069941, 69911558, 5.6725958205882356E7, 34, 1928682579), 0, List(minCubeSize(107374182400), List(0, 0), List(34, 1928682579), 0, List(34, 1928682579), 1, null), null, 0, 1, 34, 0, false, 0, 0, 1768474077182, 1768474113965, 8, 1, null, List(0, 0), null, 9, 9, 140286, 0, null)"


Best Practices (Remember This)

Run OPTIMIZE:

- After large MERGE operations

- On frequently queried tables

- Don’t run it after every small write

- Use Z-ORDER only on selective columns

## Task 4: Clean Old Files (VACUUM)

Understand What VACUUM Does

After:

MERGE,
OPTIMIZE,
DELETE

Delta keeps old files for:

Time travel,
Rollback,
Auditing

👉 VACUUM removes those unused files

Run VACUUM

In [0]:
%sql
-- Delete data files that are no longer referenced by the table and are older than
-- the retention threshold. No RETAIN clause is given, so the default applies.
VACUUM workspace.ecommerce.oct_events_delta_sql;


path


What this does

Keeps every file needed to time travel within the retention window (7 days by default)

Removes older unused files

Safe for most use cases

✅ Recommended for learning & production

Verify VACUUM Ran

In [0]:
%sql
-- Re-read the commit history; VACUUM START / VACUUM END entries confirm the run
-- and report how many files were deleted.
DESCRIBE HISTORY workspace.ecommerce.oct_events_delta_sql;


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
6,2026-01-15T10:53:49.000Z,78119440703336,karthika2738@gmail.com,VACUUM END,Map(status -> COMPLETED),,List(2538393327508024),0115-102806-588uh7nr-v2n,5.0,SnapshotIsolation,True,"Map(numDeletedFiles -> 0, numVacuumedDirectories -> 1)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
5,2026-01-15T10:53:48.000Z,78119440703336,karthika2738@gmail.com,VACUUM START,"Map(retentionCheckEnabled -> true, defaultRetentionMillis -> 604800000)",,List(2538393327508024),0115-102806-588uh7nr-v2n,4.0,SnapshotIsolation,True,"Map(numFilesToDelete -> 0, sizeOfDataToDelete -> 0)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
4,2026-01-15T10:48:34.000Z,78119440703336,karthika2738@gmail.com,OPTIMIZE,"Map(predicate -> [], auto -> false, clusterBy -> [], zOrderBy -> [""user_id"",""event_time""], batchId -> 0)",,List(2538393327508024),0115-102806-588uh7nr-v2n,3.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 34, numRemovedBytes -> 1928682579, p25FileSize -> 71158718, numDeletionVectorsRemoved -> 0, minFileSize -> 64703736, numAddedFiles -> 24, maxFileSize -> 92493347, p75FileSize -> 83156540, p50FileSize -> 76117023, numAddedBytes -> 1834704364)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
3,2026-01-15T10:46:54.000Z,78119440703336,karthika2738@gmail.com,OPTIMIZE,"Map(predicate -> [], auto -> false, clusterBy -> [], zOrderBy -> [], batchId -> 0)",,List(2538393327508024),0115-102806-588uh7nr-v2n,2.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 68, numRemovedBytes -> 1171886398, p25FileSize -> 52907322, numDeletionVectorsRemoved -> 0, minFileSize -> 51069941, numAddedFiles -> 22, maxFileSize -> 68137411, p75FileSize -> 55112855, p50FileSize -> 53386543, numAddedBytes -> 1196766605)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
2,2026-01-15T10:34:06.000Z,78119440703336,karthika2738@gmail.com,MERGE,"Map(predicate -> [""(((event_time#13201 = event_time#13181) AND (user_id#13208 = user_id#13188)) AND (product_id#13203 = product_id#13183))""], clusterBy -> [], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> true, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""actionType"":""insert""}])",,List(2538393327508024),0115-102806-588uh7nr-v2n,1.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 68, numTargetBytesAdded -> 1171886398, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 0, numTargetRowsMatchedUpdated -> 0, executionTimeMs -> 84187, materializeSourceTimeMs -> 45304, numTargetRowsInserted -> 67501979, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 18575, numTargetRowsUpdated -> 0, numOutputRows -> 67501979, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 67501979, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 20165)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
1,2026-01-15T03:45:13.000Z,78119440703336,karthika2738@gmail.com,MERGE,"Map(predicate -> [""(((event_time#13257 = event_time#13236) AND (user_id#13264 = user_id#13243)) AND (product_id#13259 = product_id#13238))""], clusterBy -> [], matchedPredicates -> [], statsOnLoad -> false, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""actionType"":""insert""}])",,List(3604489690824905),0115-033354-z5wk3ccl-v2n,0.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 0, numTargetBytesAdded -> 0, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 0, numTargetRowsMatchedUpdated -> 0, executionTimeMs -> 29904, materializeSourceTimeMs -> 12, numTargetRowsInserted -> 0, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 0, numTargetRowsUpdated -> 0, numOutputRows -> 0, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 42448764, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 29796)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
0,2026-01-14T16:25:39.000Z,78119440703336,karthika2738@gmail.com,CREATE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(3604489690824905),0114-162320-q4l93jah-v2n,,WriteSerializable,True,"Map(numFiles -> 12, numOutputRows -> 42448764, numOutputBytes -> 731915974)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13


Understand Retention Policy 

Default retention:

7 days (168 hours)


Meaning:

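After VACUUM, time travel is only guaranteed for versions within the last 7 days

Data files needed only by older versions are deleted, so those versions can no longer be queried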

Best Practices

✔ Use default VACUUM in production

✔ Run VACUUM after OPTIMIZE

✔ Never run RETAIN 0 HOURS accidentally

✔ Schedule VACUUM off-peak