In [0]:
# Load the November 2019 e-commerce events CSV from the volume, treating the first row as column names.
# No schema is supplied and inferSchema is not enabled, so the columns are read without type inference.
ecommerce_data_2019_nov_df = (
    spark.read
    .option("header", True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)
# Preview only the first three rows rather than rendering the whole dataset.
display(ecommerce_data_2019_nov_df.limit(3))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-01 00:00:00 UTC,view,1003461,2053013555631882655,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33
2019-11-01 00:00:00 UTC,view,5000088,2053013566100866035,appliances.sewing_machine,janome,293.65,530496790,8e5f4f83-366c-4f70-860e-ca7417414283
2019-11-01 00:00:01 UTC,view,17302664,2053013553853497655,,creed,28.31,561587266,755422e7-9040-477b-9bd2-6a6e8fd97387


In [0]:
%sql
-- List the volumes that currently exist under workspace.ecommerce.
SHOW VOLUMES IN workspace.ecommerce;

database,volume_name
ecommerce,delta
ecommerce,ecommerce_data


In [0]:
%sql
-- Create the `delta` volume under workspace.ecommerce; IF NOT EXISTS makes this cell safe to re-run.
CREATE VOLUME IF NOT EXISTS workspace.ecommerce.delta;

In [0]:
# Persist the DataFrame in Delta format at a path inside the `delta` volume.
# The target is a different location from the source 2019-Nov.csv, so the original file is not overwritten;
# "overwrite" mode only replaces whatever was previously stored at this Delta path.
(
    ecommerce_data_2019_nov_df.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/workspace/ecommerce/delta/ecommerce_data_2019_nov")
)

In [0]:
# Also register the same DataFrame as a Delta table named workspace.ecommerce.ecommerce_data_2019_nov,
# replacing any existing table contents ("overwrite" mode).
(
    ecommerce_data_2019_nov_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.ecommerce.ecommerce_data_2019_nov")
)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
# Schema enforcement demo.
# Build a DataFrame whose schema deliberately does NOT match the Delta table written earlier:
# it introduces `order_id` and `quantity`, and the only column it defines that the table also has
# is `product_id`.
test_schema = StructType([
    StructField("order_id", StringType(), True),    # not a column of the Delta table
    StructField("product_id", StringType(), True),  # the one column shared with the Delta table
    StructField("quantity", IntegerType(), True),   # not a column of the Delta table
])
# A single throwaway row is enough to trigger the schema check.
test_data = [("1001", "A123", 2)]
test_df = spark.createDataFrame(test_data, schema=test_schema)

# Append WITHOUT the mergeSchema option, so the write is attempted with the mismatched schema as-is.
try:
    test_df.write.format("delta").mode("append").save("/Volumes/workspace/ecommerce/delta/ecommerce_data_2019_nov")
except Exception as e:
    # The exception is the point of this cell, so it is caught and displayed rather than re-raised.
    print("Schema enforcement test: Exception occurred.")
    print(e)
else:
    # No exception means the append went through: report it explicitly so a silent success
    # (and the test row now sitting in the table) does not go unnoticed.
    print("Schema enforcement test: no exception raised; the mismatched test row was appended to the Delta table.")

Schema enforcement test: Exception occurred.
[_LEGACY_ERROR_TEMP_DELTA_0007] A schema mismatch detected when writing to the Delta table (Table ID: 8302f488-54d6-4d73-8bd8-b89890019fc6).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- event_time: string (nullable = true)
-- event_type: string (nullable = true)
-- product_id: string (nullable = true)
-- category_id: string (nullable = true)
-- category_code: string (nullable = true)
-- brand: string (nullable = true)
-- price: string (nullable = true)
-- user_id: string (nullable = true)
-- user_session: string (nullable = true)


Data schema:
root
-- order_id: string (nullable = true)
-- product_id: string (nullable = true)
-- quantity: integer (nullable = true)

         
Table

In [0]:
from pyspark.sql import functions as F
# Drop rows that are exact duplicates across every column (dropDuplicates() with no subset),
# then overwrite the Delta copy at the volume path with the de-duplicated result.
ecommerce_data_2019_nov_nodupe_df = ecommerce_data_2019_nov_df.dropDuplicates()
(
    ecommerce_data_2019_nov_nodupe_df.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/workspace/ecommerce/delta/ecommerce_data_2019_nov")
)
# Print "<row count>, <column count>" for the original frame first, then for the de-duplicated frame.
for frame in (ecommerce_data_2019_nov_df, ecommerce_data_2019_nov_nodupe_df):
    print(f"{frame.count()}, {len(frame.columns)}")

67501979, 9
67401460, 9
