# Spark Structured Streaming
**Andrey Titov**
Senior Spark Engineer @ NVIDIA

## Agenda
+ Rate stream creation
+ Writing streaming data to console
+ Writing data to kafka
+ Persisting stream state using checkpoints

In [None]:
# Shared settings referenced by the notebook cells.

# Input side: file loaded by the batch read and directory watched by the
# file stream.
json_file = "cities.json"
streaming_dir = "tmp/streaming_dir"

# Kafka connection settings.
kafka_servers = "localhost:9092"
kafka_topic = "test_topic"

# Checkpoint location handed to the streaming writer.
checkpoint_dir = "tmp/chk/chk_1"

# Parquet output path (not referenced by the cells visible here).
output_parquet_agg = "tmp/agg0.parquet"

### Creating rate >> console stream
+ rate source generates test rows (a `timestamp` and an incrementing `value`) at a fixed rate
+ console sink is used to print data to console

In [None]:
# Streaming DataFrame backed by the built-in "rate" source.
rate_sdf = (
    spark.readStream
    .format("rate")
    .load()
)
rate_sdf.printSchema()

# Start a query that sends the stream to the "console" sink; the returned
# handle is kept so the query can be stopped later.
rate_sq = (
    rate_sdf.writeStream
    .format("console")
    .start()
)

In [None]:
# Stop the streaming query referenced by rate_sq.
rate_sq.stop()

### Creating text >> console stream
+ text source listens for new files in folder
+ new files are detected by name

In [None]:
# Streaming DataFrame that reads text files from streaming_dir.
file_sdf = (
    spark.readStream
    .format("text")
    .option("path", streaming_dir)
    .load()
)
file_sdf.printSchema()

# Start a query that sends the file stream to the "console" sink.
file_sq = (
    file_sdf.writeStream
    .format("console")
    .start()
)

In [None]:
# Stop the streaming query referenced by file_sq.
file_sq.stop()

### Writing a file to Kafka

In [None]:
# Batch-read the source file using the "text" format.
cities = (
    spark.read
    .format("text")
    .load(json_file)
)

# Connection options shared by the Kafka reader/writer calls.
kafka_params = {"kafka.bootstrap.servers": kafka_servers}

# Batch-write the DataFrame to the Kafka topic.
(
    cities.write
    .format("kafka")
    .options(**kafka_params)
    .option("topic", kafka_topic)
    .save()
)

### Creating kafka >> console stream
+ The Kafka API is the same for batch and streaming modes
+ Dataframe contains the following columns:
 - `key`
 - `value`
 - `topic`
 - `partition`
 - `offset`
 - `timestamp`
 - `timestampType`

In [None]:
# Streaming DataFrame subscribed to kafka_topic, starting from the earliest
# available offsets.
kafka_sdf = (
    spark.readStream
    .format("kafka")
    .options(**kafka_params)
    .option("subscribe", kafka_topic)
    .option("startingOffsets", "earliest")
    .load()
)

kafka_sdf.printSchema()

# Start a query that sends the Kafka stream to the "console" sink with the
# "truncate" option set to "true".
kafka_sq = (
    kafka_sdf.writeStream
    .format("console")
    .option("truncate", "true")
    .start()
)

In [None]:
# Stop the streaming query referenced by kafka_sq.
kafka_sq.stop()

### Enabling checkpointing

In [None]:
# Streaming DataFrame that reads text files from streaming_dir.
file_sdf = (
    spark.readStream
    .format("text")
    .option("path", streaming_dir)
    .load()
)

# Kafka writer options: broker list plus the destination topic.
kafka_params = {"kafka.bootstrap.servers": kafka_servers, "topic": kafka_topic}

# Start a query that writes the file stream to Kafka, with the
# "checkpointLocation" option pointing at checkpoint_dir.
kafka_sq = (
    file_sdf.writeStream
    .format("kafka")
    .options(**kafka_params)
    .option("checkpointLocation", checkpoint_dir)
    .start()
)

In [None]:
# Stop the streaming query referenced by kafka_sq.
kafka_sq.stop()

### Get stream statistics

In [None]:
# Check if the stream is active.
# NOTE(review): the cell just above calls kafka_sq.stop(); if the cells are run
# in order this inspects an already-stopped query - confirm that is intended.
kafka_sq.isActive

In [None]:
# Get progress information for the last batch of the kafka_sq query.
kafka_sq.lastProgress

In [None]:
# Get the current status of the kafka_sq query.
kafka_sq.status

In [None]:
# Stream the Kafka topic and print it to the console.

# All Kafka reader options are collected in one dict this time: broker list,
# subscribed topic, and starting offsets.
kafka_params = {
    "kafka.bootstrap.servers": kafka_servers,
    "subscribe": kafka_topic,
    "startingOffsets": "earliest",
}

kafka_sdf = (
    spark.readStream
    .format("kafka")
    .options(**kafka_params)
    .load()
)

console_sq = (
    kafka_sdf.writeStream
    .format("console")
    .start()
)

In [None]:
# Show the current status of the console_sq query.
console_sq.status

In [None]:
# Clean up: stop every query listed in spark.streams.active.
for s in spark.streams.active:
    s.stop()