In [1]:
// Evaluate the kernel-provided session handle; the first reference waits for the Spark session to start.
spark

Waiting for a Spark session to start...

org.apache.spark.sql.SparkSession@5503153

In [2]:
// Look up the address of this application's Spark UI from the session's runtime configuration.
spark.conf.get("spark.driver.appUIAddress")

Waiting for a Spark session to start...

http://gw02.itversity.com:4044

In [3]:
/**
 * Returns the runtime type name of a value, as reported by `Class.getTypeName`.
 *
 * @param o any value; primitives are boxed when passed as `Any`, so `1` reports `java.lang.Integer`
 * @return the type name of the value's runtime class, or `"null"` when `o` is `null`
 *         (calling `getClass` on `null` would otherwise throw a NullPointerException)
 */
def getType(o: Any): String =
  Option(o).fold("null")(_.getClass.getTypeName)

getType: (o: Any)String


In [4]:
// Operating-system name as reported by the JVM.
val os_name = System.getProperty("os.name")

// Segment at index 2 of $HOME when split on "/" (index 0 is the empty string before the
// leading slash). Option(...) guards against HOME being unset, in which case getenv
// returns null and a direct .split would throw a NullPointerException.
val envHome: Option[String] =
  Option(System.getenv("HOME")).flatMap(_.split("/").lift(2))

// Use the per-user dataset location when that HOME segment exists; otherwise fall back to
// the Databricks sample-dataset location.
val path =
  if (envHome.isDefined) "/user/kkdosapati/dataSets/spark-guide/retail-data/by-day/*.csv"
  else "/databricks-datasets/definitive-guide/data/retail-data/by-day/*.csv"

os_name = Linux
envHome = Some(kkdosapati)
path = /user/kkdosapati/dataSets/spark-guide/retail-data/by-day/*.csv


/user/kkdosapati/dataSets/spark-guide/retail-data/by-day/*.csv

In [5]:
// Batch-read every by-day retail CSV under `path`, taking column names from the header row
// and letting Spark infer the column types.
val staticDataFrame = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .format("csv")
  .load(path)

staticDataFrame = [InvoiceNo: string, StockCode: string ... 6 more fields]


[InvoiceNo: string, StockCode: string ... 6 more fields]

In [6]:
// Expose the batch data to SQL under the name `retail_data`.
staticDataFrame.createOrReplaceTempView(viewName = "retail_data")
// Keep the inferred schema so the streaming reader can reuse it.
val staticSchema: org.apache.spark.sql.types.StructType = staticDataFrame.schema

staticSchema = StructType(StructField(InvoiceNo,StringType,true), StructField(StockCode,StringType,true), StructField(Description,StringType,true), StructField(Quantity,IntegerType,true), StructField(InvoiceDate,TimestampType,true), StructField(UnitPrice,DoubleType,true), StructField(CustomerID,DoubleType,true), StructField(Country,StringType,true))


StructType(StructField(InvoiceNo,StringType,true), StructField(StockCode,StringType,true), StructField(Description,StringType,true), StructField(Quantity,IntegerType,true), StructField(InvoiceDate,TimestampType,true), StructField(UnitPrice,DoubleType,true), StructField(CustomerID,DoubleType,true), StructField(Country,StringType,true))

In [7]:
// Print the inferred schema of the batch DataFrame as a tree.
staticDataFrame.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [8]:
// Preview the first five rows without truncating wide cell values.
staticDataFrame.show(5, false)

+---------+---------+-------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                    |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+-------------------------------+--------+-------------------+---------+----------+--------------+
|580538   |23084    |RABBIT NIGHT LIGHT             |48      |2011-12-05 08:38:00|1.79     |14075.0   |United Kingdom|
|580538   |23077    |DOUGHNUT LIP GLOSS             |20      |2011-12-05 08:38:00|1.25     |14075.0   |United Kingdom|
|580538   |22906    |12 MESSAGE CARDS WITH ENVELOPES|24      |2011-12-05 08:38:00|1.65     |14075.0   |United Kingdom|
|580538   |21914    |BLUE HARMONICA IN BOX          |24      |2011-12-05 08:38:00|1.25     |14075.0   |United Kingdom|
|580538   |22467    |GUMBALL COAT RACK              |6       |2011-12-05 08:38:00|2.55     |14075.0   |United Kingdom|
+---------+---------+---------------------------

In [9]:
// Report the Spark version this kernel is running against.
sc.version

2.3.0.2.6.5.0-292

In [10]:
import org.apache.spark.sql.functions.{window, column, desc, col}

// Per-customer spend, summed over 1-day windows of InvoiceDate (batch version).
val t1 = {
  // One row per invoice line with its line total.
  val lineTotals = staticDataFrame.selectExpr(
    "CustomerId",
    "(UnitPrice * Quantity) as total_cost",
    "InvoiceDate")
  val dailyWindow = window(col("InvoiceDate"), "1 day")
  lineTotals
    .groupBy(col("CustomerId"), dailyWindow)
    .sum("total_cost")
}

t1 = [CustomerId: double, window: struct<start: timestamp, end: timestamp> ... 1 more field]


[CustomerId: double, window: struct<start: timestamp, end: timestamp> ... 1 more field]

In [11]:
 // Preview five aggregated rows with the full window struct visible.
 t1.show(5, false)

+----------+------------------------------------------+------------------+
|CustomerId|window                                    |sum(total_cost)   |
+----------+------------------------------------------+------------------+
|14799.0   |[2011-10-18 20:00:00, 2011-10-19 20:00:00]|157.7             |
|16210.0   |[2011-09-01 20:00:00, 2011-09-02 20:00:00]|344.88            |
|17790.0   |[2010-12-12 19:00:00, 2010-12-13 19:00:00]|154.8             |
|14541.0   |[2011-10-11 20:00:00, 2011-10-12 20:00:00]|146.0             |
|12682.0   |[2011-07-03 20:00:00, 2011-07-04 20:00:00]|474.36000000000007|
+----------+------------------------------------------+------------------+
only showing top 5 rows



In [12]:
// Inspect the raw invoice lines of customer 16057 that fall inside one daily window
// (bounds inclusive on both ends).
staticDataFrame
  .selectExpr("CustomerId", "UnitPrice", "Quantity", "(UnitPrice * Quantity) as total_cost", "InvoiceDate")
  .filter("CustomerId == 16057.0")
  .filter(col("InvoiceDate").between("2011-12-04 19:00:00", "2011-12-05 19:00:00"))
  .show()

+----------+---------+--------+----------+-------------------+
|CustomerId|UnitPrice|Quantity|total_cost|        InvoiceDate|
+----------+---------+--------+----------+-------------------+
|   16057.0|    12.75|      -1|    -12.75|2011-12-05 16:36:00|
|   16057.0|     9.95|      -2|     -19.9|2011-12-05 16:36:00|
|   16057.0|     4.95|      -1|     -4.95|2011-12-05 16:36:00|
+----------+---------+--------+----------+-------------------+



In [13]:
// Fetch a single aggregated row as an Array[Row].
t1.take(1)

Array([14799.0,[2011-10-18 20:00:00.0,2011-10-19 20:00:00.0],157.7])

In [14]:
// Print the physical plan Spark will use for the windowed aggregation.
t1.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[CustomerId#16, window#128], functions=[sum(total_cost#60)])
+- Exchange hashpartitioning(CustomerId#16, window#128, 200)
   +- *(1) HashAggregate(keys=[CustomerId#16, window#128], functions=[partial_sum(total_cost#60)])
      +- *(1) Project [named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(InvoiceDate#14, TimestampType, LongType) - 0) as double) / 8.64E10)) as double) = (cast((precisetimestampconversion(InvoiceDate#14, TimestampType, LongType) - 0) as double) / 8.64E10)) THEN (CEIL((cast((precisetimestampconversion(InvoiceDate#14, TimestampType, LongType) - 0) as double) / 8.64E10)) + 1) ELSE CEIL((cast((precisetimestampconversion(InvoiceDate#14, TimestampType, LongType) - 0) as double) / 8.64E10)) END + 0) - 1) * 86400000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(InvoiceDate#14, TimestampType, LongTy

In [15]:
// Print the schema of the aggregate: customer id, window struct (start/end) and summed cost.
t1.printSchema()

root
 |-- CustomerId: double (nullable = true)
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- sum(total_cost): double (nullable = true)



In [16]:
// Reduce the number of shuffle partitions to 5 (the plan printed earlier shows 200).
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [17]:
// Streaming read of the same CSV files, reusing the schema captured from the batch read
// and consuming at most 15 files per trigger.
val streamingDataFrame = spark.readStream
    .format("csv")
    .schema(staticSchema)
    .option("header", "true")
    .option("maxFilesPerTrigger", 15)
    .load(path)

streamingDataFrame = [InvoiceNo: string, StockCode: string ... 6 more fields]


[InvoiceNo: string, StockCode: string ... 6 more fields]

In [18]:
// Print the physical plan of the streaming source.
streamingDataFrame.explain() 

== Physical Plan ==
StreamingRelation FileSource[/user/kkdosapati/dataSets/spark-guide/retail-data/by-day/*.csv], [InvoiceNo#129, StockCode#130, Description#131, Quantity#132, InvoiceDate#133, UnitPrice#134, CustomerID#135, Country#136]


In [19]:
// Streaming aggregation: per-customer spend summed over 1-day windows of InvoiceDate.
val purchaseByCustomerPerDay = {
  // One row per invoice line with its line total.
  val lineTotals = streamingDataFrame.selectExpr(
    "CustomerId",
    "(UnitPrice * Quantity) as total_cost",
    "InvoiceDate")
  val dailyWindow = window(col("InvoiceDate"), "1 day")
  lineTotals
    .groupBy(col("CustomerId"), dailyWindow)
    .sum("total_cost")
}

purchaseByCustomerPerDay = [CustomerId: double, window: struct<start: timestamp, end: timestamp> ... 1 more field]


[CustomerId: double, window: struct<start: timestamp, end: timestamp> ... 1 more field]

In [20]:
// Batch aggregation with the same shape as the streaming one: per-customer spend summed
// over 1-day windows of InvoiceDate.
val staticDFPurchaseByCustomerPerDay = {
  // One row per invoice line with its line total.
  val lineTotals = staticDataFrame.selectExpr(
    "CustomerId",
    "(UnitPrice * Quantity) as total_cost",
    "InvoiceDate")
  val dailyWindow = window(col("InvoiceDate"), "1 day")
  lineTotals
    .groupBy(col("CustomerId"), dailyWindow)
    .sum("total_cost")
}

staticDFPurchaseByCustomerPerDay = [CustomerId: double, window: struct<start: timestamp, end: timestamp> ... 1 more field]


[CustomerId: double, window: struct<start: timestamp, end: timestamp> ... 1 more field]

In [21]:
// Start the streaming aggregation, writing the complete result on every trigger to the
// in-memory table `customer_purchases` so it can be queried through spark.sql.
purchaseByCustomerPerDay.writeStream
    .queryName("customer_purchases")
    .outputMode("complete")
    .format("memory")
    .start()

org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@182e46a5

In [22]:
// Query the in-memory streaming table, highest spend first, and print up to 20 rows.
spark.sql("""
  SELECT *
  FROM customer_purchases
  ORDER BY `sum(total_cost)` DESC
  """)
  .show(numRows = 20)

+----------+------+---------------+
|CustomerId|window|sum(total_cost)|
+----------+------+---------------+
+----------+------+---------------+



In [23]:
// Confirm that this DataFrame is backed by a streaming source.
streamingDataFrame.isStreaming

true

In [24]:
// All rows of the in-memory streaming table, newest window first and, within a window,
// highest spend first. The query is re-evaluated each time an action runs on it.
val streamCount = spark.sql("""
  SELECT *
  FROM customer_purchases
  """).sort(col("window").desc, col("sum(total_cost)").desc)

streamCount: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [CustomerId: double, window: struct<start: timestamp, end: timestamp> ... 1 more field]


In [25]:
// Re-check that the source DataFrame is a streaming one.
streamingDataFrame.isStreaming

lastException: Throwable = null


true

In [26]:
// Row count of the batch aggregate; the streaming result is later polled until it reaches this total.
staticDFPurchaseByCustomerPerDay.count

19576

In [27]:
/**
 * Polls the row count of `streamCount` until the streaming aggregation has caught up,
 * printing each observed count.
 *
 * Each poll sleeps first and then counts, so the first count is taken after one interval.
 *
 * @param expected   row count at which polling stops; defaults to 19576, the count observed
 *                   for the batch aggregate in this notebook
 * @param pollMillis pause before each count, in milliseconds
 * @return the first observed count that is at least `expected`
 */
def loop(expected: Long = 19576L, pollMillis: Long = 5000L): Long = {
  // Local helper so the recursion is guaranteed to compile to a loop, regardless of how
  // the notebook wraps top-level definitions.
  @scala.annotation.tailrec
  def poll(): Long = {
    Thread.sleep(pollMillis)
    val cnt = streamCount.count
    println(cnt)
    // `<` rather than `!=`: stop even if the count ever exceeds the target, instead of
    // polling forever.
    if (cnt < expected) poll() else cnt
  }
  poll()
}

loop: ()Long


In [28]:
// Print the schema of the batch aggregate for comparison with the streaming result.
staticDFPurchaseByCustomerPerDay.printSchema

root
 |-- CustomerId: double (nullable = true)
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- sum(total_cost): double (nullable = true)



In [29]:
// Preview the batch aggregate with the default row limit and cell truncation.
staticDFPurchaseByCustomerPerDay.show

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|   14075.0|[2011-12-04 19:00...|316.78000000000003|
|   18180.0|[2011-12-04 19:00...|            310.73|
|   15358.0|[2011-12-04 19:00...| 830.0600000000003|
|   15392.0|[2011-12-04 19:00...|304.40999999999997|
|   15290.0|[2011-12-04 19:00...|263.02000000000004|
|   16811.0|[2011-12-04 19:00...|             232.3|
|   12748.0|[2011-12-04 19:00...| 363.7899999999999|
|   16500.0|[2011-12-04 19:00...| 52.74000000000001|
|   16873.0|[2011-12-04 19:00...|1854.8300000000002|
|   14060.0|[2011-12-04 19:00...|297.47999999999996|
|   14649.0|[2011-12-04 19:00...| 513.9899999999998|
|   16904.0|[2011-12-04 19:00...| 349.0200000000001|
|   17857.0|[2011-12-04 19:00...|            2979.6|
|   14083.0|[2011-12-04 19:00...| 446.5700000000001|
|   14777.0|[2011-12-04 19:00...|             -2.95|
|   16684.0|[2011-12-04 19:00...| 5401.9799999

In [30]:
// Block until the streaming table reaches the expected row count, printing each polled count.
loop()

3301
4822
6594
8520
10106
19576


19576

In [31]:
//purchaseByCustomerPerDay.writeStream
//     .format("console")
//     .queryName("customer_purchases_2")
//     .outputMode("complete")
//     .start()

// Check the query's output on the console.

Name: Syntax Error.
Message: 
StackTrace: 

In [32]:
// Verify once more that the source DataFrame is a streaming one.
streamingDataFrame.isStreaming

true

In [33]:
// Show the streaming result, newest window first then highest spend, without truncating cells.
streamCount
  .sort(col("window").desc, col("sum(total_cost)").desc)
  .show(truncate = false)

+----------+------------------------------------------+------------------+
|CustomerId|window                                    |sum(total_cost)   |
+----------+------------------------------------------+------------------+
|null      |[2011-12-08 19:00:00, 2011-12-09 19:00:00]|16571.320000000036|
|12433.0   |[2011-12-08 19:00:00, 2011-12-09 19:00:00]|2638.690000000001 |
|14051.0   |[2011-12-08 19:00:00, 2011-12-09 19:00:00]|1203.9            |
|17581.0   |[2011-12-08 19:00:00, 2011-12-09 19:00:00]|984.6800000000001 |
|12713.0   |[2011-12-08 19:00:00, 2011-12-09 19:00:00]|848.5499999999998 |
|17389.0   |[2011-12-08 19:00:00, 2011-12-09 19:00:00]|741.9             |
|17490.0   |[2011-12-08 19:00:00, 2011-12-09 19:00:00]|730.7             |
|16558.0   |[2011-12-08 19:00:00, 2011-12-09 19:00:00]|598.9700000000001 |
|12985.0   |[2011-12-08 19:00:00, 2011-12-09 19:00:00]|485.0             |
|18102.0   |[2011-12-08 19:00:00, 2011-12-09 19:00:00]|469.43999999999994|
|15796.0   |[2011-12-08 1