In [1]:
import pyspark
from delta import configure_spark_with_delta_pip

# Build a session with the Delta Lake SQL extension and catalog enabled.
# These are static settings: they are only honoured when the session is first
# created, so this cell has to run before any other cell starts a session.
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip registers the delta-spark package as a
# dependency of the session (resolved through Ivy at startup).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/usr/local/Homebrew/Caskroom/miniforge/base/envs/delta_lake/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/hamzaharunamohammed/.ivy2/cache
The jars for the packages stored in: /Users/hamzaharunamohammed/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c9756fce-43e4-4da8-b771-11ee261f651f;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.1.0 in central
	found io.delta#delta-storage;3.1.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 285ms :: artifacts dl 14ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.1.0 from central in [default]
	io.delta#delta-storage;3.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|   

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 63018)
Traceback (most recent call last):
  File "/usr/local/Homebrew/Caskroom/miniforge/base/envs/delta_lake/lib/python3.9/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/local/Homebrew/Caskroom/miniforge/base/envs/delta_lake/lib/python3.9/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/local/Homebrew/Caskroom/miniforge/base/envs/delta_lake/lib/python3.9/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/local/Homebrew/Caskroom/miniforge/base/envs/delta_lake/lib/python3.9/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/Homebrew/Caskroom/miniforge/base/envs/delta_lake/lib/python3.9/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_

In [3]:
# Create a table
# mode("overwrite") keeps this cell re-runnable: the default save mode raises
# an error as soon as /tmp/delta-table exists from an earlier run.
data = spark.range(0, 5)
data.write.format("delta").mode("overwrite").save("/tmp/delta-table")

24/06/02 16:52:53 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [4]:
# Load the Delta table from disk and print its rows
df = spark.read.load("/tmp/delta-table", format="delta")
df.show()

                                                                                

+---+
| id|
+---+
|  4|
|  1|
|  0|
|  2|
|  3|
+---+



In [5]:
# Replace the table contents with a new range of ids (5 through 9)
data = spark.range(5, 10)
data.write.save("/tmp/delta-table", format="delta", mode="overwrite")

                                                                                

In [6]:
# Read the table back to see the overwritten contents
df = spark.read.load("/tmp/delta-table", format="delta")
df.show()

+---+
| id|
+---+
|  6|
|  7|
|  5|
|  8|
|  9|
+---+



In [7]:
# Conditional update, delete and upsert without overwriting the whole table.
# Explicit imports: a wildcard import of pyspark.sql.functions would shadow
# Python built-ins such as sum, min, max and round for the rest of the kernel.
from delta.tables import DeltaTable
from pyspark.sql.functions import col, expr

deltaTable = DeltaTable.forPath(spark, "/tmp/delta-table")

# Update every even value by adding 100 to it
deltaTable.update(
    condition=expr("id % 2 == 0"),
    set={"id": expr("id + 100")},
)

# Delete every even value (the rows updated above are still even, so they are
# removed here as well)
deltaTable.delete(condition=expr("id % 2 == 0"))

# Upsert (merge) new data: ids already in the table are updated in place,
# ids missing from the table are inserted
newData = spark.range(0, 20)

(
    deltaTable.alias("oldData")
    .merge(newData.alias("newData"), "oldData.id = newData.id")
    .whenMatchedUpdate(set={"id": col("newData.id")})
    .whenNotMatchedInsert(values={"id": col("newData.id")})
    .execute()
)

deltaTable.toDF().show()

                                                                                

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



In [8]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session.
# The builder is wrapped with configure_spark_with_delta_pip so that, when this
# cell is the first one to start a session, the Delta Lake package is attached.
# If a session already exists, getOrCreate() returns it and the static configs
# below are ignored (Spark logs a warning saying so).
spark = configure_spark_with_delta_pip(
    SparkSession.builder
    .appName("DeltaLakeExample")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
).getOrCreate()

# Create a DataFrame of (Name, Age) rows
data = [("Alice", 34), ("Bob", 45), ("Catherine", 29)]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, columns)
df

24/06/02 17:02:24 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


DataFrame[Name: string, Age: bigint]

In [13]:
# Write DataFrame to Delta table
# mode("overwrite") keeps this cell re-runnable: the default save mode raises
# an error once /tmp/delta-table-bob-alice exists from an earlier run.
df.write.format("delta").mode("overwrite").save("/tmp/delta-table-bob-alice")

# Read from Delta table
delta_df = spark.read.format("delta").load("/tmp/delta-table-bob-alice")
delta_df.show()

                                                                                

+---------+---+
|     Name|Age|
+---------+---+
|Catherine| 29|
|    Alice| 34|
|      Bob| 45|
+---------+---+



In [16]:
# Update the Delta table in place with SQL: set Alice's age to 35
spark.sql("UPDATE delta.`/tmp/delta-table-bob-alice` SET Age = 35 WHERE Name = 'Alice'")

# Time travel: load table version 0 instead of the latest version
old_df = spark.read.load("/tmp/delta-table-bob-alice", format="delta", versionAsOf=0)
old_df.show()


                                                                                

+---------+---+
|     Name|Age|
+---------+---+
|Catherine| 29|
|    Alice| 34|
|      Bob| 45|
+---------+---+



24/06/03 10:38:32 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 512183 ms exceeds timeout 120000 ms
24/06/03 10:38:32 WARN SparkContext: Killing executors is not supported by current scheduler.
24/06/03 10:38:36 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$