In [0]:
%sql
-- Ensure the schema used by the rest of the notebook exists; IF NOT EXISTS makes
-- this statement safe to re-run.
CREATE SCHEMA IF NOT EXISTS hexa_files.default;


In [0]:
%sql
-- List the schemas in the hexa_files catalog to confirm that `default` is present.
SHOW SCHEMAS IN hexa_files;


databaseName
default
information_schema


In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType

# Column layout applied to export.csv, as (column name, Spark type) pairs.
# Every column is declared nullable when the StructType is built below.
people_columns = [
  ("id", IntegerType()),
  ("firstName", StringType()),
  ("middleName", StringType()),
  ("lastName", StringType()),
  ("gender", StringType()),
  ("birthDate", TimestampType()),
  ("ssn", StringType()),
  ("salary", IntegerType()),
]
schema = StructType([StructField(name, dtype, True) for name, dtype in people_columns])

# Read the CSV with its header row, using the explicit schema above.
csv_reader = spark.read.format("csv").option("header", True).schema(schema)
df = csv_reader.load("/Volumes/hexa_files/default/hexadata/export.csv")

# createOrReplace() creates the table when it is missing and replaces it otherwise.
# When the table is known not to exist yet, df.write.saveAsTable(<table name>) is
# an alternative.
df.writeTo("hexa_files.default.people_10m").createOrReplace()

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from datetime import date

# Schema of the staged update rows. Column names match the people_10m table;
# birthDate is declared as DateType here because the rows below carry
# datetime.date values.
schema = StructType([
  StructField("id", IntegerType(), True),
  StructField("firstName", StringType(), True),
  StructField("middleName", StringType(), True),
  StructField("lastName", StringType(), True),
  StructField("gender", StringType(), True),
  StructField("birthDate", DateType(), True),
  StructField("ssn", StringType(), True),
  StructField("salary", IntegerType(), True)
])

# Six staged rows to upsert into the table, in the column order of `schema`.
data = [
  (9999998, 'Billy', 'Tommie', 'Luppitt', 'M', date.fromisoformat('1992-09-17'), '953-38-9452', 55250),
  (9999999, 'Elias', 'Cyril', 'Leadbetter', 'M', date.fromisoformat('1984-05-22'), '906-51-2137', 48500),
  (10000000, 'Joshua', 'Chas', 'Broggio', 'M', date.fromisoformat('1968-07-22'), '988-61-6247', 90000),
  (20000001, 'John', '', 'Doe', 'M', date.fromisoformat('1978-01-14'), '345-67-8901', 55500),
  (20000002, 'Mary', '', 'Smith', 'F', date.fromisoformat('1982-10-29'), '456-78-9012', 98250),
  (20000003, 'Jane', '', 'Doe', 'F', date.fromisoformat('1981-06-25'), '567-89-0123', 89900)
]

people_10m_updates = spark.createDataFrame(data, schema)

# Register the staged rows as a temporary view. createOrReplaceTempView is used
# instead of createTempView so that re-running this cell in the same session does
# not fail because the view already exists.
people_10m_updates.createOrReplaceTempView("people_10m_updates")

from delta.tables import DeltaTable

deltaTable = DeltaTable.forName(spark, 'hexa_files.default.people_10m')

# Upsert keyed on id: rows whose id already exists in the table are overwritten
# with the staged values, all other staged rows are inserted.
(deltaTable.alias("people_10m")
  .merge(
    people_10m_updates.alias("people_10m_updates"),
    "people_10m.id = people_10m_updates.id")
  .whenMatchedUpdateAll()
  .whenNotMatchedInsertAll()
  .execute()
)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Load the Delta table and keep only the rows whose id is 9999998 or higher.
df = spark.read.table("hexa_files.default.people_10m")
id_at_or_above_cutoff = df.id >= 9999998
df_filtered = df.where(id_at_or_above_cutoff)
display(df_filtered)

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
9999999,Elias,Cyril,Leadbetter,M,1984-05-22T00:00:00.000Z,906-51-2137,48500
9999998,Billy,Tommie,Luppitt,M,1992-09-17T00:00:00.000Z,953-38-9452,55250
10000000,Joshua,Chas,Broggio,M,1968-07-22T00:00:00.000Z,988-61-6247,90000
20000002,Mary,,Smith,F,1982-10-29T00:00:00.000Z,456-78-9012,98250
20000001,John,,Doe,M,1978-01-14T00:00:00.000Z,345-67-8901,55500
20000003,Jane,,Doe,F,1981-06-25T00:00:00.000Z,567-89-0123,89900


In [0]:
# Read the people_10m table into a DataFrame and render it, with no filter applied.
people_df = spark.read.table("hexa_files.default.people_10m")
display(people_df)

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
9999999,Elias,Cyril,Leadbetter,M,1984-05-22T00:00:00.000Z,906-51-2137,48500
9999998,Billy,Tommie,Luppitt,M,1992-09-17T00:00:00.000Z,953-38-9452,55250
10000000,Joshua,Chas,Broggio,M,1968-07-22T00:00:00.000Z,988-61-6247,90000
20000002,Mary,,Smith,F,1982-10-29T00:00:00.000Z,456-78-9012,98250
20000001,John,,Doe,M,1978-01-14T00:00:00.000Z,345-67-8901,55500
20000003,Jane,,Doe,F,1981-06-25T00:00:00.000Z,567-89-0123,89900


In [0]:
# Append the rows of `df` to the existing table (mode "append" adds rows and
# leaves the rows already in the table untouched).
# NOTE(review): when the cells run top to bottom, `df` was last assigned by
# spark.read.table("hexa_files.default.people_10m"), i.e. it is the current
# contents of this same table, so this write adds a second copy of every row.
# Confirm this is intended; otherwise append a DataFrame holding only new rows.
df.write.mode("append").saveAsTable("hexa_files.default.people_10m")

In [0]:
from delta.tables import *
from pyspark.sql.functions import *

deltaTable = DeltaTable.forName(spark, "hexa_files.default.people_10m")

# Variant 1: predicate and new value are both SQL-formatted strings, so the new
# value carries its own SQL quotes.
female_predicate = "gender = 'F'"
female_assignment = {"gender": "'Female'"}
deltaTable.update(condition=female_predicate, set=female_assignment)

# Variant 2: predicate and new value are Spark SQL column expressions.
male_predicate = col("gender") == "M"
male_assignment = {"gender": lit("Male")}
deltaTable.update(condition=male_predicate, set=male_assignment)

DataFrame[num_affected_rows: bigint]

In [0]:
from delta.tables import *
from pyspark.sql.functions import *

deltaTable = DeltaTable.forName(spark, "hexa_files.default.people_10m")

# Variant 1: the delete predicate is a SQL-formatted string.
sql_cutoff_predicate = "birthDate < '1955-01-01'"
deltaTable.delete(sql_cutoff_predicate)

# Variant 2: the delete predicate is a Spark SQL column expression.
# The cutoff here is 1960-01-01, which is later than the 1955-01-01 used above.
column_cutoff_predicate = col("birthDate") < "1960-01-01"
deltaTable.delete(column_cutoff_predicate)

DataFrame[num_affected_rows: bigint]

In [0]:
from delta.tables import *

# Look up the Delta table by name and render its commit history.
deltaTable = DeltaTable.forName(spark, "hexa_files.default.people_10m")
table_history = deltaTable.history()
display(table_history)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
9,2025-08-12T16:40:12.000Z,144409217163733,azuser4019_mml.local@techademy.com,DELETE,"Map(predicate -> [""(birthDate#12733 < 1960-01-01 00:00:00)""])",,,0812-152710-goyeqgho-v2n,8.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 158, numDeletionVectorsUpdated -> 0, numDeletedRows -> 0, scanTimeMs -> 158, numAddedFiles -> 0, numAddedBytes -> 0, rewriteTimeMs -> 0)",,Databricks-Runtime/17.0.x-photon-scala2.13
8,2025-08-12T16:40:10.000Z,144409217163733,azuser4019_mml.local@techademy.com,DELETE,"Map(predicate -> [""(birthDate#12637 < 1955-01-01 00:00:00)""])",,,0812-152710-goyeqgho-v2n,7.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 167, numDeletionVectorsUpdated -> 0, numDeletedRows -> 0, scanTimeMs -> 165, numAddedFiles -> 0, numAddedBytes -> 0, rewriteTimeMs -> 0)",,Databricks-Runtime/17.0.x-photon-scala2.13
7,2025-08-12T16:39:41.000Z,144409217163733,azuser4019_mml.local@techademy.com,OPTIMIZE,"Map(predicate -> [], auto -> true, clusterBy -> [], zOrderBy -> [], batchId -> 0)",,,0812-152710-goyeqgho-v2n,6.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 2, numRemovedBytes -> 5001, p25FileSize -> 2561, numDeletionVectorsRemoved -> 1, minFileSize -> 2561, numAddedFiles -> 1, maxFileSize -> 2561, p75FileSize -> 2561, p50FileSize -> 2561, numAddedBytes -> 2561)",,Databricks-Runtime/17.0.x-photon-scala2.13
6,2025-08-12T16:39:39.000Z,144409217163733,azuser4019_mml.local@techademy.com,UPDATE,"Map(predicate -> [""(gender#11925 = M)""])",,,0812-152710-goyeqgho-v2n,4.0,WriteSerializable,False,"Map(numRemovedFiles -> 5, numRemovedBytes -> 10474, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 1, numAddedChangeFiles -> 0, executionTimeMs -> 1565, conflictDetectionTimeMs -> 322, numDeletionVectorsUpdated -> 0, scanTimeMs -> 637, numAddedFiles -> 1, numUpdatedRows -> 8, numAddedBytes -> 2449, rewriteTimeMs -> 928)",,Databricks-Runtime/17.0.x-photon-scala2.13
5,2025-08-12T16:39:37.000Z,144409217163733,azuser4019_mml.local@techademy.com,OPTIMIZE,"Map(predicate -> [], auto -> true, clusterBy -> [], zOrderBy -> [], batchId -> 0)",,,0812-152710-goyeqgho-v2n,4.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 6, numRemovedBytes -> 12771, p25FileSize -> 2552, numDeletionVectorsRemoved -> 1, minFileSize -> 2552, numAddedFiles -> 1, maxFileSize -> 2552, p75FileSize -> 2552, p50FileSize -> 2552, numAddedBytes -> 2552)",,Databricks-Runtime/17.0.x-photon-scala2.13
4,2025-08-12T16:39:36.000Z,144409217163733,azuser4019_mml.local@techademy.com,UPDATE,"Map(predicate -> [""(gender#11439 = F)""])",,,0812-152710-goyeqgho-v2n,3.0,WriteSerializable,False,"Map(numRemovedFiles -> 2, numRemovedBytes -> 3975, numCopiedRows -> 0, numDeletionVectorsAdded -> 1, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 2477, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1096, numAddedFiles -> 1, numUpdatedRows -> 4, numAddedBytes -> 2297, rewriteTimeMs -> 1375)",,Databricks-Runtime/17.0.x-photon-scala2.13
3,2025-08-12T16:39:06.000Z,144409217163733,azuser4019_mml.local@techademy.com,WRITE,"Map(mode -> Append, statsOnLoad -> true, partitionBy -> [])",,,0812-152710-goyeqgho-v2n,2.0,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 6, numOutputBytes -> 2368)",,Databricks-Runtime/17.0.x-photon-scala2.13
2,2025-08-12T16:35:49.000Z,144409217163733,azuser4019_mml.local@techademy.com,MERGE,"Map(predicate -> [""(id#10601 = id#10633)""], clusterBy -> [], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> true, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""actionType"":""insert""}])",,,0812-152710-goyeqgho-v2n,1.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 6, numTargetBytesAdded -> 12081, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 0, numTargetRowsMatchedUpdated -> 0, executionTimeMs -> 2415, materializeSourceTimeMs -> 242, numTargetRowsInserted -> 6, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 1016, numTargetRowsUpdated -> 0, numOutputRows -> 6, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 6, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 1106)",,Databricks-Runtime/17.0.x-photon-scala2.13
1,2025-08-12T16:33:08.000Z,144409217163733,azuser4019_mml.local@techademy.com,CREATE OR REPLACE TABLE,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> false)",,,0812-152710-goyeqgho-v2n,0.0,WriteSerializable,False,Map(),,Databricks-Runtime/17.0.x-photon-scala2.13
0,2025-08-12T16:30:37.000Z,144409217163733,azuser4019_mml.local@techademy.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,,0812-152710-goyeqgho-v2n,,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 0, numRemovedBytes -> 0, numOutputRows -> 1000, numOutputBytes -> 46030)",,Databricks-Runtime/17.0.x-photon-scala2.13
