In [1]:
import sys

# Record which Python interpreter this kernel is running.
print(sys.version)

3.10.12 (main, May 27 2025, 17:12:29) [GCC 11.4.0]


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import lit

# Build (or reuse) a SparkSession for this notebook:
#   * local[*] master, with the driver bound to the loopback address
#   * the Delta Lake SQL extension and Delta catalog registered so that
#     format("delta") reads/writes and DeltaTable operations are available
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SparkDeltaTest")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/03 12:35:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Write a delta table

In [6]:
data = spark.range(0, 5)

(data
  .write
  .format("delta")
  .save("./tmp/delta-table")
)

### Read a delta table

In [7]:
df = (spark
        .read
        .format("delta")
        .load("./tmp/delta-table")
        .orderBy("id")
      )

df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



### Overwrite a delta table

In [8]:
# New contents for the table: ids 5 through 9.
data = spark.range(5, 10)

# Overwrite mode replaces whatever is currently stored at the path.
data.write.mode("overwrite").format("delta").save("/tmp/delta-table")

In [9]:
# Load the table from its path and sort by id before printing it.
df = spark.read.format("delta").load("/tmp/delta-table").orderBy("id")
df.show()

+---+
| id|
+---+
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



### Delta Lake and ACID

In [10]:
from delta.tables import *
from pyspark.sql.functions import *

# Handle to the Delta table stored at this path; the update below goes through it.
delta_table = DeltaTable.forPath(spark, "/tmp/delta-table")

# Add 100 to every row whose id is even.
delta_table.update(
    condition=expr("id % 2 == 0"),
    set={"id": expr("id + 100")},
)

# Show the table after the update, sorted by id.
delta_table.toDF().orderBy("id").show()

25/07/03 12:38:15 WARN UpdateCommand: Could not validate number of records due to missing statistics.


+---+
| id|
+---+
|  5|
|  7|
|  9|
|106|
|108|
+---+



### Delete

In [11]:
# Delete every even value
(delta_table
  .delete(
    condition = expr("id % 2 == 0")
  )
)

(delta_table
  .toDF()
  .orderBy("id")
  .show()
)

25/07/03 12:40:03 WARN DeleteCommand: Could not validate number of records due to missing statistics.


+---+
| id|
+---+
|  5|
|  7|
|  9|
+---+

