In [1]:
// Streaming source: subscribe to the `stock-ticks` Kafka topic on hadoop-vm:9092.
// Spark only forwards options that carry the `kafka.` prefix to the Kafka consumer,
// so a bare `group.id` is silently ignored; `kafka.group.id` (Spark 3.0+) is the
// supported way to pin the consumer group.
// NOTE: queries that share one group id can interfere with each other, so use a
// distinct id for every concurrently running query.
val stocksDf = spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "hadoop-vm:9092")
    .option("subscribe", "stock-ticks")
    .option("kafka.group.id", "stock-ticks-group-JSCScala1234")
    .load()

Initializing Scala interpreter ...

Spark Web UI available at http://192.168.80.128:4040
SparkContext available as 'sc' (version = 3.1.3, master = local[*], app id = local-1647895902400)
SparkSession available as 'spark'


stocksDf: org.apache.spark.sql.DataFrame = [key: binary, value: binary ... 5 more fields]


In [2]:
// Show the columns exposed by the Kafka source as a schema tree.
println(stocksDf.schema.treeString)

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [3]:
// Decode the binary Kafka payload to text and keep the record timestamp next to it.
val ticksDf = {
  val projection = Seq("CAST(value AS STRING)", "timestamp")
  stocksDf.selectExpr(projection: _*)
}
println(ticksDf.schema.treeString)

root
 |-- value: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



ticksDf: org.apache.spark.sql.DataFrame = [value: string, timestamp: timestamp]


In [4]:
import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, TimestampType,  StringType, StructField, StructType}
import  org.apache.spark.sql.functions._

// Layout of one JSON tick message; every field is nullable.
// `timestamp` is read as a long here and turned into a real timestamp further down.
val schema = new StructType()
  .add("symbol", StringType, nullable = true)
  .add("price", DoubleType, nullable = true)
  .add("volume", LongType, nullable = true)
  .add("timestamp", LongType, nullable = true)

import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, TimestampType, StringType, StructField, StructType}
import org.apache.spark.sql.functions._
schema: org.apache.spark.sql.types.StructType = StructType(StructField(symbol,StringType,true), StructField(price,DoubleType,true), StructField(volume,LongType,true), StructField(timestamp,LongType,true))


In [5]:
// Replace the JSON text in `value` with a struct parsed according to `schema`.
val jsonDf = ticksDf.withColumn("value", from_json(col("value"), schema))
println(jsonDf.schema.treeString)

root
 |-- value: struct (nullable = true)
 |    |-- symbol: string (nullable = true)
 |    |-- price: double (nullable = true)
 |    |-- volume: long (nullable = true)
 |    |-- timestamp: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)



jsonDf: org.apache.spark.sql.DataFrame = [value: struct<symbol: string, price: double ... 2 more fields>, timestamp: timestamp]


In [6]:
// Flatten the parsed struct into top-level columns; the Kafka record timestamp is
// not selected. Declared as `var` because the following cells reassign it.
var stockTickDf = jsonDf.select("value.*")
println(stockTickDf.schema.treeString)

root
 |-- symbol: string (nullable = true)
 |-- price: double (nullable = true)
 |-- volume: long (nullable = true)
 |-- timestamp: long (nullable = true)



stockTickDf: org.apache.spark.sql.DataFrame = [symbol: string, price: double ... 2 more fields]


In [7]:
// Add the traded value of each tick: price multiplied by volume.
stockTickDf = stockTickDf.withColumn("traded_value", col("price").multiply(col("volume")))
println(stockTickDf.schema.treeString)

root
 |-- symbol: string (nullable = true)
 |-- price: double (nullable = true)
 |-- volume: long (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- traded_value: double (nullable = true)



stockTickDf: org.apache.spark.sql.DataFrame = [symbol: string, price: double ... 3 more fields]


In [8]:
// Swap the numeric `timestamp` for a minute-truncated timestamp column of the same
// name. The value is divided by 1000 before the cast (presumably epoch milliseconds;
// confirm against the producer). The rebuilt column ends up last in the schema.
stockTickDf = {
  val tickTime = (col("timestamp") / 1000).cast("timestamp")
  stockTickDf
    .withColumn("trade_time", date_trunc("minute", tickTime))
    .drop("timestamp")
    .withColumnRenamed("trade_time", "timestamp")
}

stockTickDf: org.apache.spark.sql.DataFrame = [symbol: string, price: double ... 3 more fields]


In [9]:
// import org.apache.spark.sql.streaming.Trigger


// stockTickDf
//      .withColumn("year", date_format(col("timestamp"), "yyyy"))
//      .withColumn("month", date_format(col("timestamp"), "MM"))
//      .withColumn("day", date_format(col("timestamp"), "dd"))  
//      .withColumn("hour", date_format(col("timestamp"), "HH"))   
//      .withColumn("_symbol", col("symbol"))   
//      .writeStream
//      .trigger(Trigger.ProcessingTime("65 seconds"))
//      .queryName("Write Ticks to CSV trigger by 1 min hour")
//      .format("csv")
//      .option("path", "hdfs://localhost:9000/dump-scala-csv-trigger-hourly-1min")
//      .option("header", true)
//      .option("checkpointLocation", "hdfs://localhost:9000/checkpoint/tickscsvtohdfs6")
//      .partitionBy("year", "month", "day", "hour", "_symbol")
//      .option("truncate", false)
//      .start()

In [11]:
import org.apache.spark.sql._

/** Writes one micro-batch as CSV under hdfs://localhost:9000/layers/raw/scala,
  * partitioned by year/month/day/hour/_symbol.
  *
  * The partition columns are derived here when the batch does not already carry
  * them (year/month/day/hour from `timestamp`, `_symbol` from `symbol`); without
  * them the partitioned write fails because the columns are not in the schema.
  *
  * @param candleBatchDf rows of the micro-batch; must contain `timestamp` and `symbol`
  *                      unless it already has all five partition columns
  * @param batch_id      micro-batch id handed over by `foreachBatch`
  */
def processBatchData(candleBatchDf: DataFrame, batch_id: Long): Unit = {
    // Partition column name -> expression used to derive it when it is missing.
    val partitionColumns = Seq(
      "year"    -> date_format(col("timestamp"), "yyyy"),
      "month"   -> date_format(col("timestamp"), "MM"),
      "day"     -> date_format(col("timestamp"), "dd"),
      "hour"    -> date_format(col("timestamp"), "HH"),
      "_symbol" -> col("symbol")
    )

    // The batch is evaluated twice (count + write); cache it so it is computed once.
    candleBatchDf.persist()
    try {
      // Explicit tuple keeps the log output format "(process batch called,<id>,writing ,<n>)".
      print(("process batch called", batch_id, "writing ", candleBatchDf.count()))

      // Add only the partition columns the caller did not already provide.
      val partitionedDf = partitionColumns.foldLeft(candleBatchDf) {
        case (df, (name, value)) =>
          if (df.columns.contains(name)) df else df.withColumn(name, value)
      }

      partitionedDf
        .coalesce(1) // one output file per partition directory and batch
        .write
        .mode("append")
        .format("csv")
        .partitionBy(partitionColumns.map(_._1): _*)
        .option("header", true)
        .save("hdfs://localhost:9000/layers/raw/scala")
    } finally {
      candleBatchDf.unpersist()
    }
}
    
// Start the streaming query: each micro-batch is handed to processBatchData.
stockTickDf
  .writeStream
  .outputMode("update")
  .foreachBatch(processBatchData _)
  .start()

import org.apache.spark.sql._
processBatchData: (candleBatchDf: org.apache.spark.sql.DataFrame, batch_id: Long)Unit
res7: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@52e80259


(process batch called,0,writing ,0)