# Batch Operations - Append

Databricks Delta allows you to read, write and query data in data lakes in an efficient manner.

In [2]:
%run ./Reference/Setup

In [3]:
# Source CSV holding the small outdoor-products sample that is loaded below.
miniDataInputPath = "/mnt/training/online_retail/outdoor-products/outdoor-products-mini.csv"

# Output locations rooted at `userhome`. `userhome` is not defined in this
# cell -- presumably a string set by ./Reference/Setup; confirm.
genericDataPath = "{}/generic/customer-data/".format(userhome)
deltaDataPath = "{}/delta/customer-data/".format(userhome)
deltaIotPath = "{}/delta/iot-pipeline/".format(userhome)

# Echo the Delta location so it can be checked against the table details later.
print(deltaDataPath)

Here, we add new data to the consumer product data.

Before we load data into non-Databricks Delta and Databricks Delta tables, do a simple pre-processing step:

* The column `StockCode` should be of type `String`.

In [5]:
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType
from pyspark.sql.functions import col

# Explicit schema for the product CSV as (column name, Spark type) pairs.
# StockCode is declared as a string; every field is nullable.
inputColumns = [
  ("InvoiceNo", IntegerType()),
  ("StockCode", StringType()),
  ("Description", StringType()),
  ("Quantity", IntegerType()),
  ("InvoiceDate", StringType()),
  ("UnitPrice", DoubleType()),
  ("CustomerID", IntegerType()),
  ("Country", StringType()),
]

inputSchema = StructType([
  StructField(columnName, columnType, True)
  for columnName, columnType in inputColumns
])

# Read the mini CSV with its header row, applying the explicit schema
# instead of letting Spark infer column types.
newDataDF = spark.read.csv(
  miniDataInputPath,
  schema=inputSchema,
  header="true",
)

In [6]:
# Drop any existing customer_data_delta table, then register it again as a
# Delta table over the files stored at deltaDataPath.
# NOTE(review): this cell runs before the cells that delete and rewrite
# deltaDataPath, so it presumably relies on Delta data already being present
# at that location (e.g. from an earlier run) -- confirm the intended cell order.
dropCustomerTableSql = """
  DROP TABLE IF EXISTS customer_data_delta
"""
createCustomerTableSql = """
  CREATE TABLE customer_data_delta 
  USING DELTA 
  LOCATION '{}' 
""".format(deltaDataPath)

spark.sql(dropCustomerTableSql)
spark.sql(createCustomerTableSql)

In [7]:
%sql
-- Show the table's detail row: format, location, partition columns, file count and size.
DESCRIBE DETAIL customer_data_delta

format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion
delta,94802b34-d687-4379-a628-5df6bc4ee13f,nagaraj_sengodan_hotmail_com_db.customer_data_delta,,dbfs:/user/nagaraj.sengodan@hotmail.com/delta/customer-data,2019-10-16T04:53:07.630+0000,2019-10-16T12:09:17.000+0000,List(Country),39,621086,Map(),1,2


## APPEND Using Databricks Delta Pipeline

Next, append the new data to `deltaDataPath` in Databricks Delta format, partitioned by `Country`.

In [9]:
# Recursively delete whatever is already stored at deltaDataPath so the
# write that follows starts from an empty location.
dbutils.fs.rm(deltaDataPath, recurse=True)

In [10]:
# Append newDataDF to deltaDataPath in Delta format, partitioned by Country.
newDataDF.write.save(
  deltaDataPath,
  format="delta",
  mode="append",
  partitionBy="Country",
)

In [11]:
%sql
-- Count the rows visible through the customer_data_delta table.
SELECT count(*) FROM customer_data_delta

count(1)
36


In [12]:
%sql
-- Show the table's detail row (location, partition columns, numFiles, sizeInBytes).
DESCRIBE DETAIL customer_data_delta

format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion
delta,1b7d5d8e-c87b-47d5-9a19-fcd0c440e21d,nagaraj_sengodan_hotmail_com_db.customer_data_delta,,dbfs:/user/nagaraj.sengodan@hotmail.com/delta/customer-data,2019-10-16T12:18:23.338+0000,2019-10-16T12:18:24.000+0000,List(Country),2,5078,Map(),1,2


### Streaming Data

In [14]:

from pyspark.sql.functions import expr, col, from_unixtime, to_date
# Directory of JSON event files read below as a batch DataFrame.
streamingEventPath = "/mnt/training/structured-streaming/events/"

# Derived columns, named up front so the read chain stays short:
#  - eventDate: the `time` column (cast to long, read as Unix seconds) as a date
#  - randomDeviceId: seeded random integer in [0, 100)
eventDate = to_date(from_unixtime(col('time').cast('Long'), 'yyyy-MM-dd'))
randomDeviceId = expr("cast(rand(5) * 100 as int)")

rawDataDF = (spark.read
  .option("inferSchema", "true")
  .json(streamingEventPath)
  .withColumn("date", eventDate)
  .withColumn("deviceId", randomDeviceId)
  .repartition(200)
)

Write `rawDataDF` out in Databricks Delta format:

* partition by `date`
* save to `deltaIotPath`

In [16]:

# Replace whatever is stored at deltaIotPath with rawDataDF, written in
# Delta format and partitioned by date.
rawDataDF.write.save(
  deltaIotPath,
  format="delta",
  mode="overwrite",
  partitionBy="date",
)

# Drop any existing demo_iot_data_delta table, then register it again as a
# Delta table over the files at deltaIotPath.
dropIotTableSql = """
    DROP TABLE IF EXISTS demo_iot_data_delta
  """
createIotTableSql = """
    CREATE TABLE demo_iot_data_delta
    USING DELTA 
    LOCATION '{}' 
  """.format(deltaIotPath)

spark.sql(dropIotTableSql)
spark.sql(createIotTableSql)

In [17]:
%sql

-- Preview the rows of the IoT Delta table.
SELECT * FROM demo_iot_data_delta

action,time,date,deviceId
Open,1469646786,2016-07-27,20
Open,1469644815,2016-07-27,64
Open,1469645211,2016-07-27,30
Close,1469583345,2016-07-27,97
Open,1469634825,2016-07-27,48
Open,1469633646,2016-07-27,14
Open,1469585269,2016-07-27,5
Close,1469636224,2016-07-27,44
Open,1469584369,2016-07-27,84
Close,1469613025,2016-07-27,73


In [18]:
%sql

-- Count the rows in the IoT Delta table.
SELECT count(*) FROM demo_iot_data_delta

count(1)
100000


Create a new DataFrame with columns `action`, `time`, `date` and `deviceId`. The columns contain the following data:

* `action` contains the value `Open`
* `time` contains the Unix time cast into a long integer `cast(1529091520 as bigint)`
* `date` contains `cast('2018-06-01' as date)`
* `deviceId` contains a random number from 0 to 499 given by `expr("cast(rand(5) * 500 as int)")`

In [20]:
from pyspark.sql.functions import expr, from_unixtime
from pyspark.sql.types import LongType

# Constant columns shared by every synthetic event row.
newEventColumns = [
  "'Open' as action",
  "cast(1529091520 as bigint) as time",
  "cast('2018-06-01' as date) as date",
]

# Build 10000 synthetic rows across 200 partitions, then add a seeded random
# deviceId in [0, 500).
# Note: this rebinds newDataDF, which earlier in the notebook held the CSV
# product data.
newDataDF = (spark.range(10000)
  .repartition(200)
  .selectExpr(*newEventColumns)
  .withColumn("deviceId", expr("cast(rand(5) * 500 as int)"))
)

In [21]:
# Append the synthetic rows to the Delta data at deltaIotPath, partitioned by date.
newDataDF.write.save(
  deltaIotPath,
  format="delta",
  mode="append",
  partitionBy="date",
)

In [22]:
from pyspark.sql.types import Row
# Despite its name, numFiles holds the number of rows in demo_iot_data_delta
# (the result of count(*)), not a number of files.
rowCountResult = spark.sql("SELECT count(*) as total FROM demo_iot_data_delta").collect()
numFiles = rowCountResult[0][0]
print(numFiles)