In [1]:
// Streaming source: subscribe to the Kafka topic "orders" on the local broker.
// The resulting DataFrame carries the binary `key`/`value` of each record
// plus the Kafka metadata columns (topic, partition, offset, timestamp, ...).
//
// Declared as `val` because the reference is never reassigned.
//
// Kafka consumer properties are only forwarded to the consumer when the option
// key carries the "kafka." prefix, so the consumer group is set through
// "kafka.group.id" (a bare "group.id" option never reaches the consumer).
// NOTE(review): with a fixed group id, do not run several queries/sessions
// with the same id at the same time — they would interfere with each other.
val ordersDf = spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "orders")
    .option("kafka.group.id", "orders-5-min-jsc")
    .load()

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.80.128:4040
SparkContext available as 'sc' (version = 3.1.3, master = local[*], app id = local-1648503120436)
SparkSession available as 'spark'


ordersDf: org.apache.spark.sql.DataFrame = [key: binary, value: binary ... 5 more fields]


In [2]:
// Print the columns exposed by the Kafka source before any transformation.
ordersDf.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [3]:
// Keep only the record payload (decoded from binary to a string) and the
// Kafka record timestamp; the remaining Kafka metadata columns are dropped.
val ticksDf = {
    val projection = Seq("CAST(value AS STRING)", "timestamp")
    ordersDf.selectExpr(projection: _*)
}
ticksDf.printSchema()

root
 |-- value: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



ticksDf: org.apache.spark.sql.DataFrame = [value: string, timestamp: timestamp]


In [4]:
import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, TimestampType,  StringType, StructField, StructType}
import org.apache.spark.sql.functions._

// Expected layout of the JSON document carried in the Kafka `value` column.
// Every field is nullable (the default of `StructType.add(name, dataType)`).
// NOTE(review): field names/types are assumed to match what the producer
// emits (e.g. `unitprice` as an integer, `timestamp` as a long) — confirm
// against the producer, since mismatches surface as null values downstream.
val schema = new StructType()
    .add("orderId", IntegerType)
    .add("itemId", StringType)
    .add("quantity", IntegerType)
    .add("unitprice", IntegerType)
    .add("state", StringType)
    .add("timestamp", LongType)

import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, TimestampType, StringType, StructField, StructType}
import org.apache.spark.sql.functions._
schema: org.apache.spark.sql.types.StructType = StructType(StructField(orderId,IntegerType,true), StructField(itemId,StringType,true), StructField(quantity,IntegerType,true), StructField(unitprice,IntegerType,true), StructField(state,StringType,true), StructField(timestamp,LongType,true))


In [5]:
// Replace the JSON text in `value` with a struct parsed according to `schema`.
// The Kafka record `timestamp` column is left untouched next to it.
// NOTE(review): the console output of this notebook shows rows that are
// entirely null, which suggests the payload does not match `schema` — verify.
val jsonDf = {
    val parsedPayload = from_json(col("value"), schema)
    ticksDf.withColumn("value", parsedPayload)
}
jsonDf.printSchema()

root
 |-- value: struct (nullable = true)
 |    |-- orderId: integer (nullable = true)
 |    |-- itemId: string (nullable = true)
 |    |-- quantity: integer (nullable = true)
 |    |-- unitprice: integer (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- timestamp: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)



jsonDf: org.apache.spark.sql.DataFrame = [value: struct<orderId: int, itemId: string ... 4 more fields>, timestamp: timestamp]


In [6]:
// Flatten the parsed struct: each field of `value` becomes a top-level column.
// Only the struct's fields are selected, so the Kafka record timestamp is not
// carried over; `timestamp` from here on is the field from the payload.
// Kept as `var` because later cells reassign it with derived columns.
var stockTickDf = jsonDf.select("value.*")
stockTickDf.printSchema()

root
 |-- orderId: integer (nullable = true)
 |-- itemId: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- unitprice: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- timestamp: long (nullable = true)



stockTickDf: org.apache.spark.sql.DataFrame = [orderId: int, itemId: string ... 4 more fields]


In [7]:
// Append `total`: the value of the order line (unit price times quantity).
stockTickDf = {
    val orderValue = col("unitprice").multiply(col("quantity"))
    stockTickDf.withColumn("total", orderValue)
}
stockTickDf.printSchema()

root
 |-- orderId: integer (nullable = true)
 |-- itemId: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- unitprice: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- total: integer (nullable = true)



stockTickDf: org.apache.spark.sql.DataFrame = [orderId: int, itemId: string ... 5 more fields]


In [8]:
// Replace the numeric `timestamp` field with a real timestamp truncated to
// the minute. The value is divided by 1000 before the cast, i.e. it is
// treated as milliseconds (presumably epoch millis — confirm with producer).
// The new column is appended first and the old one dropped afterwards, so the
// resulting `timestamp` ends up as the last column of the frame.
stockTickDf = {
    val eventTime = (col("timestamp") / 1000).cast("timestamp")
    val minuteBucket = date_trunc("minute", eventTime)
    stockTickDf
        .withColumn("minute_bucket", minuteBucket)
        .drop("timestamp")
        .withColumnRenamed("minute_bucket", "timestamp")
}

stockTickDf: org.apache.spark.sql.DataFrame = [orderId: int, itemId: string ... 5 more fields]


In [9]:
// Show the final layout, then start a streaming query that prints every
// micro-batch to the console. In "append" mode only newly arrived rows are
// emitted per batch. `start()` returns the StreamingQuery handle, which is the
// value of this cell.
stockTickDf.printSchema()

{
    val consoleSink = stockTickDf.writeStream
        .outputMode("append")
        .format("console")
    consoleSink.start()
}

root
 |-- orderId: integer (nullable = true)
 |-- itemId: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- unitprice: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- total: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)



res5: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@30f7716b


In [10]:
//import org.apache.spark.sql.expressions.Window
//import org.apache.spark.sql.Column

//val stockTickDf5Min = stockTickDf.groupBy("symbol", window("timestamp", "300 seconds"))
                            
//stockTickDf5Min.printSchema()

In [11]:
// val echoOnconsole = stockTickDf
//                 .writeStream
//                 .outputMode("update")
//                 .format("console")
//                 .option("truncate", false)
//                 .start()

In [12]:
// val stockTickDf5MinKafka = stockTickDf5Min.selectExpr("to_json(struct(*)) AS value")

In [13]:
// stockTickDf5MinKafka
//             .writeStream
//              .format("kafka")
//             .outputMode("update")
//              .option("kafka.bootstrap.servers", "localhost:9092")
//             .option("topic", "statewise_earning")
//             .option("checkpointLocation", "file:///tmp/spark3")
//             .start()

In [14]:
// Question 2 Part b. publish to Amazon RDS table using JDBC in append mode
// import org.apache.spark.sql._

// def processBatchData(ordersBatchDf: DataFrame, batch_id: Long) = {
//     print ("process batch called", batch_id, "writing ", ordersBatchDf.count())

//      val ordersBatchFinalDf = (ordersBatchDf
//         .select(col("state"), col("total"))
//         .write
//         .mode("append")
//         .format("jdbc")
//         .option("url", "jdbc:mysql-database-1.cgioa4qvqncf.us-east-2.rds.amazonaws.com")
//         .option("driver", "com.mysql.jdbc.Driver")
//         .option("user", "admin")
//         .option("password", sys.env("RDS_PASSWORD")) // read the secret from the environment; never commit credentials
//         .option("dbtable", "orders_5min")
//         .save()
//     )
// }
// stockTickDf.writeStream.outputMode("append").foreachBatch(processBatchData).start()

-------------------------------------------
Batch: 0
-------------------------------------------
+-------+------+--------+---------+-----+-----+---------+
|orderId|itemId|quantity|unitprice|state|total|timestamp|
+-------+------+--------+---------+-----+-----+---------+
+-------+------+--------+---------+-----+-----+---------+

-------------------------------------------
Batch: 1
-------------------------------------------
+-------+------+--------+---------+-----+-----+---------+
|orderId|itemId|quantity|unitprice|state|total|timestamp|
+-------+------+--------+---------+-----+-----+---------+
|   null|  null|    null|     null| null| null|     null|
+-------+------+--------+---------+-----+-----+---------+

-------------------------------------------
Batch: 2
-------------------------------------------
+-------+------+--------+---------+-----+-----+---------+
|orderId|itemId|quantity|unitprice|state|total|timestamp|
+-------+------+--------+---------+-----+-----+---------+
|   null|  