In [1]:
from pyspark.sql.types import *
from delta.tables import *
from pyspark.sql.functions import col

In [2]:
# Seed rows for the demo table, laid out as (id, country, capital).
data = [(1, "United Kingdom", "London"), (2, "Canada", "Toronto")]
schema = ["id", "country", "capital"]
df = spark.createDataFrame(data, schema=schema)

# First write: creates the Delta table at this path.
# NOTE(review): no save mode is set, so the writer's default applies; re-running
# this cell against an already-existing table is expected to fail — confirm.
df.write.format("delta").save("./delta-lake/countries")

24/11/17 15:23:09 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [3]:
# Same column layout as the table written in the previous cell.
schema = ["id", "country", "capital"]
data = [(3, "United States", "Washington, D.C.")]
df = spark.createDataFrame(data, schema=schema)

# Append the new row to the existing Delta table instead of replacing it.
df.write.format("delta").mode("append").save("./delta-lake/countries")

                                                                                

In [4]:
# Load the Delta table from disk as a DataFrame and print its rows.
delta_table = spark.read.format("delta").load("./delta-lake/countries")
delta_table.show()

+---+--------------+----------------+
| id|       country|         capital|
+---+--------------+----------------+
|  3| United States|Washington, D.C.|
|  1|United Kingdom|          London|
|  2|        Canada|         Toronto|
+---+--------------+----------------+



In [5]:
# Keep only the rows whose capital is London and print them.
delta_table.where(delta_table["capital"] == "London").show()

+---+--------------+-------+
| id|       country|capital|
+---+--------------+-------+
|  1|United Kingdom| London|
+---+--------------+-------+



In [6]:
# Project the table down to the id and capital columns and print the result.
delta_table.select(["id", "capital"]).show()

+---+----------------+
| id|         capital|
+---+----------------+
|  3|Washington, D.C.|
|  1|          London|
|  2|         Toronto|
+---+----------------+



In [7]:
# Rebind `delta_table` to a DeltaTable handle on the same path; the cells below
# call DeltaTable methods (update, history, ...) on it.
delta_table = DeltaTable.forPath(spark, "./delta-lake/countries")

# Rename the country of the row with id 1. The new value is passed as a SQL
# expression string, which is why the literal carries its own inner quotes.
delta_table.update(condition="id = 1", set={"country": "'U.K.'"})

# Print the table contents after the update.
delta_table.toDF().show()

                                                                                

+---+-------------+----------------+
| id|      country|         capital|
+---+-------------+----------------+
|  3|United States|Washington, D.C.|
|  2|       Canada|         Toronto|
|  1|         U.K.|          London|
+---+-------------+----------------+



In [8]:
# Fetch the table's commit history as a DataFrame; the next cell prints a subset
# of its columns.
fullHistoryDF = delta_table.history()

In [9]:
# Print only the history columns of interest for this walkthrough.
fullHistoryDF.select(
    "version", "timestamp", "operation", "readVersion", "isBlindAppend"
).show()

+-------+--------------------+---------+-----------+-------------+
|version|           timestamp|operation|readVersion|isBlindAppend|
+-------+--------------------+---------+-----------+-------------+
|      2|2024-11-17 15:23:...|   UPDATE|          1|        false|
|      1|2024-11-17 15:23:...|    WRITE|          0|         true|
|      0|2024-11-17 15:23:...|    WRITE|       NULL|         true|
+-------+--------------------+---------+-----------+-------------+



In [10]:
# Time travel by version number: read the table as it was at version 2.
df = spark.read.load(
    "./delta-lake/countries",
    format="delta",
    versionAsOf=2,
)
df.show()

+---+-------------+----------------+
| id|      country|         capital|
+---+-------------+----------------+
|  3|United States|Washington, D.C.|
|  2|       Canada|         Toronto|
|  1|         U.K.|          London|
+---+-------------+----------------+



In [14]:
# Time travel by timestamp.
#
# The timestamp is taken from the table's own history (the commit time of
# version 1, i.e. the append that preceded the update) instead of a hard-coded
# wall-clock string. A literal such as "2024-11-17 15:23:20" is only valid for
# the run in which it was typed; on any later run it would predate the table or
# point at a different version.
#
# The cast to STRING is done inside Spark so the value is rendered and later
# parsed under the same session time zone.
version_1_timestamp = (
    DeltaTable.forPath(spark, "./delta-lake/countries")
    .history()
    .filter("version = 1")
    .selectExpr("CAST(timestamp AS STRING) AS commit_time")
    .first()["commit_time"]
)

df = (
    spark.read.format("delta")
    .option("timestampAsOf", version_1_timestamp)
    .load("./delta-lake/countries")
)
df.show()

+---+--------------+----------------+
| id|       country|         capital|
+---+--------------+----------------+
|  3| United States|Washington, D.C.|
|  1|United Kingdom|          London|
|  2|        Canada|         Toronto|
+---+--------------+----------------+



In [15]:
# `col` is imported in the notebook's top import cell rather than here, so that
# all dependencies are declared in one place.

# Get a DeltaTable handle on the table and delete the row whose id is 2.
delta_table = DeltaTable.forPath(spark, "./delta-lake/countries")
delta_table.delete(col("id") == 2)

# Print the table contents after the delete.
delta_table_df = delta_table.toDF()
delta_table_df.show()

+---+-------------+----------------+
| id|      country|         capital|
+---+-------------+----------------+
|  3|United States|Washington, D.C.|
|  1|         U.K.|          London|
+---+-------------+----------------+



In [16]:
# Refresh the commit history and print the columns of interest.
fullHistoryDF = delta_table.history()
fullHistoryDF.select(
    "version", "timestamp", "operation", "readVersion", "isBlindAppend"
).show()

+-------+--------------------+---------+-----------+-------------+
|version|           timestamp|operation|readVersion|isBlindAppend|
+-------+--------------------+---------+-----------+-------------+
|      3|2024-11-17 15:24:...|   DELETE|          2|        false|
|      2|2024-11-17 15:23:...|   UPDATE|          1|        false|
|      1|2024-11-17 15:23:...|    WRITE|          0|         true|
|      0|2024-11-17 15:23:...|    WRITE|       NULL|         true|
+-------+--------------------+---------+-----------+-------------+



In [17]:
# Restore the table to version 3.
# NOTE(review): the history printed in the previous cell lists version 3 as the
# latest commit, and the table printed below is unchanged, so this call looks
# like a no-op here — confirm whether an earlier version was intended.
delta_table.restoreToVersion(3)
# Print the table contents after the restore call.
delta_table_df = delta_table.toDF()
delta_table_df.show()

+---+-------------+----------------+
| id|      country|         capital|
+---+-------------+----------------+
|  3|United States|Washington, D.C.|
|  1|         U.K.|          London|
+---+-------------+----------------+



In [18]:
# Replacement rows, laid out as (id, country, capital).
df = spark.createDataFrame(
    [(1, "India", "New Delhi"), (4, "Australia", "Canberra")],
    schema=["id", "country", "capital"],
)

# Overwrite mode: the table's current contents are replaced by these rows.
df.write.format("delta").mode("overwrite").save("./delta-lake/countries")

# Re-open the table as a DeltaTable, then print its contents via a DataFrame.
delta_table = DeltaTable.forPath(spark, "./delta-lake/countries")
delta_table_df = delta_table.toDF()
delta_table_df.show()

+---+---------+---------+
| id|  country|  capital|
+---+---------+---------+
|  4|Australia| Canberra|
|  1|    India|New Delhi|
+---+---------+---------+



In [19]:
# Roll the table back to version 3, undoing the overwrite from the previous cell.
delta_table.restoreToVersion(3)

# Build a single extra row using the table's column layout.
schema = ["id", "country", "capital"]
data = [(4, "India", "New D")]
df = spark.createDataFrame(data, schema=schema)

# Append the row on top of the restored table.
df.write.format("delta").mode("append").save("./delta-lake/countries")

# Print the table contents after the restore and the append.
delta_table_df = delta_table.toDF()
delta_table_df.show()

24/11/17 15:24:46 WARN DAGScheduler: Broadcasting large task binary with size 1072.6 KiB


+---+-------------+----------------+
| id|      country|         capital|
+---+-------------+----------------+
|  3|United States|Washington, D.C.|
|  4|        India|           New D|
|  1|         U.K.|          London|
+---+-------------+----------------+



In [20]:
# Incoming rows to merge into the table, laid out as (id, country, capital).
idf = spark.createDataFrame(
    [(4, "India", "New Delhi"), (5, "Australia", "Canberra")],
    schema=["id", "country", "capital"],
)

# Upsert keyed on id: a target row with a matching id gets its country and
# capital updated from the source; a source row with no match is inserted.
(
    delta_table.alias("target")
    .merge(source=idf.alias("source"), condition="source.id = target.id")
    .whenMatchedUpdate(
        set={"country": "source.country", "capital": "source.capital"}
    )
    .whenNotMatchedInsert(
        values={
            "id": "source.id",
            "country": "source.country",
            "capital": "source.capital",
        }
    )
    .execute()
)

# Print the table contents after the merge.
delta_table_df = delta_table.toDF()
delta_table_df.show()

+---+-------------+----------------+
| id|      country|         capital|
+---+-------------+----------------+
|  3|United States|Washington, D.C.|
|  4|        India|       New Delhi|
|  5|    Australia|        Canberra|
|  1|         U.K.|          London|
+---+-------------+----------------+



In [21]:
# Print a subset of the table-level detail columns.
delta_table.detail().select(
    "format",
    "createdAt",
    "lastModified",
    "numFiles",
    "sizeInBytes",
    "minReaderVersion",
    "minWriterVersion",
).show()

+------+--------------------+--------------------+--------+-----------+----------------+----------------+
|format|           createdAt|        lastModified|numFiles|sizeInBytes|minReaderVersion|minWriterVersion|
+------+--------------------+--------------------+--------+-----------+----------------+----------------+
| delta|2024-11-17 15:23:...|2024-11-17 15:24:...|       3|       3095|               1|               2|
+------+--------------------+--------------------+--------+-----------+----------------+----------------+



In [22]:
# Refresh the commit history once more and print the columns of interest.
fullHistoryDF = delta_table.history()
fullHistoryDF.select(
    "version", "timestamp", "operation", "readVersion", "isBlindAppend"
).show()

+-------+--------------------+---------+-----------+-------------+
|version|           timestamp|operation|readVersion|isBlindAppend|
+-------+--------------------+---------+-----------+-------------+
|      7|2024-11-17 15:24:...|    MERGE|          6|        false|
|      6|2024-11-17 15:24:...|    WRITE|          5|         true|
|      5|2024-11-17 15:24:...|  RESTORE|          4|        false|
|      4|2024-11-17 15:24:...|    WRITE|          3|        false|
|      3|2024-11-17 15:24:...|   DELETE|          2|        false|
|      2|2024-11-17 15:23:...|   UPDATE|          1|        false|
|      1|2024-11-17 15:23:...|    WRITE|          0|         true|
|      0|2024-11-17 15:23:...|    WRITE|       NULL|         true|
+-------+--------------------+---------+-----------+-------------+



In [1]:
# Stop the Spark session now that the walkthrough is finished.
spark.stop()