# Generating the input data for the realtime dashboard

In this exercise, we'll use Spark Structured Streaming to generate the input data for the realtime dashboard.

In [None]:
%%bash
# Install the kafka-python package into the python3 environment via pip.
# (The trailing "# type: ignore" is an ordinary shell comment and does not affect the command.)
python3 -m pip install kafka-python  # type: ignore

In [None]:

from time import sleep
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0 pyspark-shell'

from IPython.display import display, clear_output

import pyspark 
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *


def test_query(sdf, mode="append", rows=None, wait=2, sort=None):
    # If a query with the same name exists, stop it.
    query_name = "test_query"
    query = None
    for q in spark.streams.active:
        if (q.name == query_name):
            query = q
    if query is not None:
        query.stop()

    try:
        tq = (
            # Create an output stream
            sdf.writeStream               
            # Only write new rows to the output
            .outputMode(mode)           
            # Write output stream to an in-memory Spark table (a DataFrame)
            .format("memory")               
            # The name of the output table will be the same as the name of the query
            .queryName(query_name)
            # Submit the query to Spark and execute it
            .start()
        )

        tq.processAllAvailable()

        sleep(wait)
        while(tq.status.get("isTriggerActive") == True):
            print(f"DataAvailable: {tq.status['isDataAvailable']},\tTriggerActive: {tq.status['isTriggerActive']}\t{tq.status['message']}")
            sleep(wait)

        # When the status says "Waiting for data to arrive", that means the query
        # has finished its current iteration and is waiting for new messages from
        # Kafka.
        print(f"DataAvailable: {tq.status['isDataAvailable']},\tTriggerActive: {tq.status['isTriggerActive']}\t{tq.status['message']}")

        memory_sink = spark.table(query_name)

        if sort:
            memory_sink = memory_sink.sort(*sort)

        # Show result table in Jupyter Notebook. Since Jupyter Notebooks have native support for showing pandas tables,
        # we convert the Spark DataFrame.
        if rows:
            display(memory_sink)
            display(memory_sink.take(10))
        else:
            display(memory_sink)
            display(memory_sink.toPandas())

    finally:
        # Always try to stop the query but it doesn't matter if it fails.
        try:
            tq.stop()
        except:
            pass


Create a Spark session. The Spark-Kafka connector package is requested through the `PYSPARK_SUBMIT_ARGS` environment variable that was set in the import cell above.

In [None]:
# Reuse the running Spark session if there is one; otherwise start Spark in
# local mode with two worker threads.
spark = (
    SparkSession.builder
    .master('local[2]')
    .getOrCreate()
)

Create a streaming DataFrame that represents the events received from the Kafka topic `clicks-cleaned`.

In [None]:
# Streaming DataFrame that represents the events of the Kafka topic
input = (
    spark.readStream
    .format("kafka")
    .options(**{
        # Address of the Kafka server: localhost, port 9092
        "kafka.bootstrap.servers": "localhost:9092",
        # Topic to subscribe to
        "subscribe": "clicks-cleaned",
        # "earliest" starts at the beginning of the topic and therefore reads all
        # historical data; use "latest" to process only _new_ events.
        "startingOffsets": "earliest",
    })
    .load()
)

# Preview the raw Kafka records with the helper
test_query(input, mode="append")

Parse the JSON payload into separate DataFrame columns. Make sure to use `TimestampType` for `ts_ingest`, since it was already converted in the `clean` notebook.

In [None]:
# Schema of the JSON document stored in the Kafka message value:
# one (column name, Spark type) pair per field.
schema = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in (
        ("visitor_platform", StringType),
        ("ts_ingest", TimestampType),
        ("article_title", StringType),
        ("visitor_country", StringType),
        ("visitor_os", StringType),
        ("article", StringType),
        ("visitor_browser", StringType),
        ("visitor_page_timer", IntegerType),
        ("visitor_page_height", IntegerType),
    )
])

# Read the message value as a string and parse it into a struct column "clicks"
dfs = (
    input
    .selectExpr("CAST(value AS STRING)")
    .select(from_json(col("value"), schema).alias("clicks"))
)

# Promote the fields of the struct to top-level columns
df_data = dfs.select("clicks.*")

# Preview the parsed events with the helper
test_query(df_data, mode="append")

Generate the values you want to show in your dashboard. You are free to choose which values and aggregations to show. As an example, you can group by article title and use a 10-second window to show how many views each article received.

In [None]:
# Number of events per article title and 10-second window of the ingest timestamp.
# The watermark on "ts_ingest" uses a delay threshold of 1 second.
df_data_grouped = (
    df_data
    .withWatermark("ts_ingest", "1 second")
    .groupBy(col('article_title'), window(col('ts_ingest'), "10 seconds"))
    .count()
)

# Preview the aggregated stream with the helper
test_query(df_data_grouped, mode="append")

Finally, write the results to a Kafka topic.

In [None]:
import shutil

# Folder in which this query stores its state
checkpoint_dir = "checkpoints-dashboard"

# Stop a still-running instance of this query (e.g. from an earlier run of this
# cell) so that its checkpoint can be removed and the query can be started again.
for active_query in spark.streams.active:
    if active_query.name == "test_query":
        active_query.stop()

# Remove the old checkpoint dir including its content. os.rmdir() can only
# delete empty directories, so a populated checkpoint would be left in place.
try:
    shutil.rmtree(checkpoint_dir)
except FileNotFoundError:
    # Nothing to clean up: the checkpoint dir does not exist yet
    pass

# Prepare df for Kafka and write to kafka
tq = (
    df_data_grouped
    # Kafka expects the payload in a column named "value": serialise every row as JSON
    .selectExpr("to_json(struct(*)) as value")
    # Write stream to kafka
    .writeStream.format("kafka")
    # The kafka server is available on localhost port 9092
    .option("kafka.bootstrap.servers", "localhost:9092")
    # Write to the topic "clicks-cleaned"
    # NOTE(review): this is the same topic the input stream subscribes to, so the
    # aggregates are fed back into this notebook's own input - confirm the intended
    # output topic.
    .option("topic", "clicks-cleaned")
    # Use the folder "checkpoints-dashboard" to write the state of this query
    .option("checkpointLocation", checkpoint_dir)
    # You can use the queryname to later refer to the query
    # NOTE(review): the test_query() helper stops every active query with this name,
    # so calling it after this cell also stops this Kafka writer - consider a
    # dedicated name.
    .queryName("test_query")
    # Start the query
    .start()
)

# Sleep two seconds
sleep(2)

# Show the status of the query
display(tq.status)

## Spark helpers

The following code stops all running queries.

In [None]:
# Go through every streaming query that is still active, report it and stop it
for running_query in spark.streams.active:
    message = "Stopping query '{}' with name '{}'".format(running_query.id, running_query.name)
    print(message)
    running_query.stop()
