In [0]:
# Load your data
oct_events = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv", header=True, inferSchema=True)

In [0]:
oct_events.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [0]:
oct_events.show(5, truncate=False)

+-------------------+----------+----------+-------------------+-----------------------------------+--------+-------+---------+------------------------------------+
|event_time         |event_type|product_id|category_id        |category_code                      |brand   |price  |user_id  |user_session                        |
+-------------------+----------+----------+-------------------+-----------------------------------+--------+-------+---------+------------------------------------+
|2019-10-01 00:00:00|view      |44600062  |2103807459595387724|NULL                               |shiseido|35.79  |541312140|72d76fde-8bb3-4e00-8c23-a032dfed738c|
|2019-10-01 00:00:00|view      |3900821   |2053013552326770905|appliances.environment.water_heater|aqua    |33.2   |554748717|9333dfbd-b87a-4708-9857-6336556b0fcc|
|2019-10-01 00:00:01|view      |17200506  |2053013559792632471|furniture.living_room.sofa         |NULL    |543.1  |519107250|566511c2-e2e3-422b-b695-cf8e6e792ca8|
|2019-10-01 00:0

In [0]:
oct_events.count()

42448764

In [0]:
oct_events.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("workspace.ecommerce.events_delta")


In [0]:
%sql
SELECT event_type, COUNT(*) AS cnt
FROM workspace.ecommerce.events_delta
GROUP BY event_type
ORDER BY cnt DESC;



event_type,cnt
view,40779399
cart,926516
purchase,742849


In [0]:
from pyspark.sql import Row

wrong_schema = spark.createDataFrame(
    [("a", "b", "c")],
    ["x", "y", "z"]
)

try:
    wrong_schema.write.format("delta").mode("append").save("/delta/events")
except Exception as e:
    print("Schema enforcement error:")
    print(e)


Schema enforcement error:
Public DBFS root is disabled. Access is denied on path: /delta/events/_delta_log

JVM stacktrace:
java.lang.UnsupportedOperationException
	at com.databricks.backend.daemon.data.client.DisabledDatabricksFileSystem.rejectOperation(DisabledDatabricksFileSystem.scala:31)
	at com.databricks.backend.daemon.data.client.DisabledDatabricksFileSystem.getFileStatus(DisabledDatabricksFileSystem.scala:108)
	at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.$anonfun$getFileStatus$2(DatabricksFileSystemV2.scala:1227)
	at com.databricks.s3a.S3AExceptionUtils$.convertAWSExceptionToJavaIOException(DatabricksStreamUtils.scala:64)
	at com.databricks.backend.daemon.data.client.DatabricksFileSystemV2.$anonfun$getFileStatus$1(DatabricksFileSystemV2.scala:1224)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:510)
	at com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:616)
	at com.databricks

In [0]:
# Choose a path inside the volume
base_path = "/Volumes/workspace/ecommerce/ecommerce_data"

# 1) Write a clean copy first
oct_events.write.format("delta").mode("overwrite").save(f"{base_path}/events_raw")

# 2) Append the same data again (simulate duplicate load)
oct_events.write.format("delta").mode("append").save(f"{base_path}/events_raw")

# 3) Read back and check counts
raw = spark.read.format("delta").load(f"{base_path}/events_raw")
print("Raw row count (with duplicates):", raw.count())



Raw row count (with duplicates): 84897528


In [0]:

%fs ls /Volumes/workspace/ecommerce/ecommerce_data/events_raw/_delta_log


path,name,size,modificationTime
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/events_raw/_delta_log/00000000000000000000.crc,00000000000000000000.crc,52239,1768273513000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/events_raw/_delta_log/00000000000000000000.json,00000000000000000000.json,51533,1768273512000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/events_raw/_delta_log/00000000000000000001.crc,00000000000000000001.crc,2832,1768273541000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/events_raw/_delta_log/00000000000000000001.json,00000000000000000001.json,50335,1768273540000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/events_raw/_delta_log/_staged_commits/,_staged_commits/,0,1768273567966


In [0]:
cleaned = raw.dropDuplicates(["event_time", "user_id", "product_id", "event_type"])
print("Deduped row count:", cleaned.count())

cleaned.write.format("delta").mode("overwrite").save(f"{base_path}/events_cleaned")

clean = spark.read.format("delta").load(f"{base_path}/events_cleaned")
clean.show(5, truncate=False)


Deduped row count: 42413557
+-------------------+----------+----------+-------------------+----------------------+--------+------+---------+------------------------------------+
|event_time         |event_type|product_id|category_id        |category_code         |brand   |price |user_id  |user_session                        |
+-------------------+----------+----------+-------------------+----------------------+--------+------+---------+------------------------------------+
|2019-10-13 06:29:47|view      |22700193  |2053013556168753601|NULL                  |stels   |77.22 |537333502|1a1e1d4a-4a39-4c11-ac98-92fce3eb3de0|
|2019-10-13 06:37:14|view      |15100148  |2053013557024391671|NULL                  |lider   |378.36|521327059|0a8ee6fd-8e12-4181-adb6-069288a717e2|
|2019-10-13 06:37:25|view      |1003310   |2053013555631882655|electronics.smartphone|apple   |696.13|544301403|3c9ef31b-2f74-4d7d-9e6e-5c40e3cd5157|
|2019-10-13 06:37:47|view      |1004838   |2053013555631882655|electroni