##Create Table
####Lets demonstrate time travel in delta

In [2]:
from pyspark.sql.functions import expr
from pyspark.sql.types import *

raw_data = spark.range(100000) \
  .selectExpr("if(id % 2 = 0, 'Open', 'Close') as action") \
  .withColumn("date", expr("cast(concat('2019-04-', cast(rand(5) * 30 as int) + 1) as date)")) \
  .withColumn("device_id", expr("cast(rand(5) * 100 as int)"))

In [3]:
PARQUET_PATH="/tmp/delta_tutorial/parquet_table"
DELTA_SILVER_PATH="/tmp/delta_tutorial/delta_table"
DELTA_GOLD_PATH="/tmp/delta_tutorial/delta_agg_table"

# Reset Env
dbutils.fs.rm(PARQUET_PATH, True)
dbutils.fs.rm(DELTA_SILVER_PATH, True)
dbutils.fs.rm(DELTA_GOLD_PATH, True)

# Make some configurations small-scale friendly
sql("set spark.sql.shuffle.partitions = 1")
sql("set spark.databricks.delta.snapshotPartitions = 1")

In [4]:
stream_data = spark.readStream.format("rate").option("rowsPerSecond", 100).load() \
  .selectExpr("'Open' as action") \
  .withColumn("date", expr("cast(concat('2019-04-', cast(rand(5) * 30 as int) + 1) as date)")) \
  .withColumn("device_id", expr("cast(rand(5) * 500 as int)"))

In [5]:
raw_data.write.format("delta").partitionBy("date").save(DELTA_SILVER_PATH)

In [6]:
display(spark.read.format("delta").load(DELTA_SILVER_PATH).groupBy("action").count())

action,count
Open,50000
Close,50000


In [7]:
stream_data.writeStream.format("delta").partitionBy("date").outputMode("append") \
  .trigger(processingTime='5 seconds').option('checkpointLocation', DELTA_SILVER_PATH + "/_chk").start(DELTA_SILVER_PATH)

In [8]:
display(spark.read.format("delta").load(DELTA_SILVER_PATH).groupBy("action").count())

action,count
Open,51200
Close,50000


#####Using our Delta Lake as a Source

In [10]:
delta_data_stream = spark.readStream \
  .option("maxFilesPerTrigger", "10") \
  .format("delta") \
  .load(DELTA_SILVER_PATH)
  

In [11]:
delta_data_stream.groupBy("action", "date", "device_id") \
  .count() \
  .writeStream \
  .format("delta") \
  .option("checkpointLocation", DELTA_GOLD_PATH + "/_checkpoint") \
  .partitionBy("date") \
  .outputMode("complete") \
  .start(DELTA_GOLD_PATH)

In [12]:
display(spark.read.format("delta").load(DELTA_GOLD_PATH).orderBy("date", "device_id", "action"))

action,date,device_id,count
Close,2019-04-01,0,501
Open,2019-04-01,0,506
Close,2019-04-01,1,500
Open,2019-04-01,1,502
Close,2019-04-01,2,489
Open,2019-04-01,2,515
Close,2019-04-01,3,176
Open,2019-04-01,3,176
Close,2019-04-02,3,92
Open,2019-04-02,3,107


####Schema Evolution

In [14]:
# Now we have more users, so let's add the user_id column to our table
new_data_with_new_col = spark.range(1000) \
  .selectExpr("'Open' as action","cast(concat('2019-04-', cast(rand(5) * 3 as int) + 1) as date) as date") \
  .withColumn("device_id", expr("cast(rand(5) * 100 as int)")) \
  .withColumn("user_name", expr("cast(rand(10) * 100 as int)"))
  
new_data_with_new_col.write.format("delta").partitionBy("date").mode("append").save(DELTA_SILVER_PATH)

In [15]:
# Add the mergeSchema option
new_data_with_new_col.write.option("mergeSchema","true").format("delta").partitionBy("date").mode("append").save(DELTA_SILVER_PATH)

####Time Travel

The transaction log is stored along with the data under the `_delta_log` directory.

In [18]:
print("\n".join([f.name for f in dbutils.fs.ls(DELTA_SILVER_PATH + "/_delta_log") if f.name.endswith('json')]))

In [19]:
# latest version - 2, because in latest version - 1 we added the user_id column
version_before_schema_change = 10

In [20]:
display(spark.read.format("delta").load(DELTA_SILVER_PATH))

action,date,device_id,user_id,user_name
Open,2019-04-01,19,76.0,
Open,2019-04-01,15,83.0,
Open,2019-04-01,0,98.0,
Open,2019-04-01,6,81.0,
Open,2019-04-01,4,36.0,
Open,2019-04-01,12,13.0,
Open,2019-04-01,21,92.0,
Open,2019-04-01,13,19.0,
Open,2019-04-01,20,67.0,
Open,2019-04-01,14,38.0,


In [21]:
spark.read.format("delta").load(DELTA_SILVER_PATH).count()

In [22]:
display(spark.read.format("delta").option("versionAsOf", version_before_schema_change).load(DELTA_SILVER_PATH))

action,date,device_id
Open,2019-04-01,1
Close,2019-04-01,2
Close,2019-04-01,2
Close,2019-04-01,0
Close,2019-04-01,1
Open,2019-04-01,0
Open,2019-04-01,1
Close,2019-04-01,0
Close,2019-04-01,2
Close,2019-04-01,3


In [23]:
spark.read.format("delta").option("versionAsOf", version_before_schema_change).load(DELTA_SILVER_PATH).count()