In [11]:
from IPython.display import display, clear_output
from datetime import datetime
import time
import pytz
from pathlib import Path

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DataType
import pyspark.sql.functions as sf
from pyspark.sql.functions import lit, date_format, col, udf

In [2]:
# Obtain the notebook's Spark session (an already-running session is reused),
# registered under the application name 'kafka'.
spark = (
    SparkSession.builder
    .appName('kafka')
    .getOrCreate()
)

In [3]:
## Schema of the JSON payload carried in the Kafka `value` field of the
## Car Park Facility topic. Every attribute is declared as a nullable string;
## the order of the names below is the order of the fields in the struct.
schema_car_park_facility_stream = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "facility_id",
        "tsn",
        "park_id",
        "facility_name",
        "time",
        "spots",
        "message_date",
        "tfnsw_facility_id",
        "facility_occupancy_loop",
        "facility_occupancy_total",
        "facility_occupancy_monthlies",
        "facility_occupancy_open_gate",
        "facility_occupancy_transients",
    )
])

In [4]:
### Subscribe to the topic "nsw_car_park_facility" on the Kafka broker and build
### the streaming dataframe car_park_facility_stream_df, starting from the
### earliest available offsets. The chain below:
###   1. casts the Kafka key and value columns to strings,
###   2. parses the value string as JSON using schema_car_park_facility_stream,
###   3. flattens the parsed fields and renames key, topic, timestamp to
###      event_key, event_topic, event_timestamp respectively,
###   4. drops duplicate records sharing the same facility_id and message_date.
### NOTE(review): no watermark is defined before dropDuplicates, so the
### deduplication state presumably grows for the lifetime of the query -- confirm
### this is acceptable for the expected data volume.
car_park_facility_stream_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker:29092")
    .option("startingOffsets", "earliest")
    .option("subscribe", "nsw_car_park_facility")
    .load()
    .withColumn("key", F.col("key").cast(StringType()))
    .withColumn("value", F.col("value").cast(StringType()))
    .withColumn("value", F.from_json("value", schema_car_park_facility_stream))
    .select(
        F.col("key").alias("event_key"),
        F.col("topic").alias("event_topic"),
        F.col("timestamp").alias("event_timestamp"),
        *[
            F.col(f"value.{field_name}")
            for field_name in (
                "facility_id",
                "tsn",
                "park_id",
                "facility_name",
                "time",
                "spots",
                "message_date",
                "tfnsw_facility_id",
                "facility_occupancy_loop",
                "facility_occupancy_total",
                "facility_occupancy_monthlies",
                "facility_occupancy_open_gate",
                "facility_occupancy_transients",
            )
        ],
    )
    .dropDuplicates(["facility_id", "message_date"])
)

### Print the schema of car_park_facility_stream_df
car_park_facility_stream_df.printSchema()

root
 |-- event_key: string (nullable = true)
 |-- event_topic: string (nullable = true)
 |-- event_timestamp: timestamp (nullable = true)
 |-- facility_id: string (nullable = true)
 |-- tsn: string (nullable = true)
 |-- park_id: string (nullable = true)
 |-- facility_name: string (nullable = true)
 |-- time: string (nullable = true)
 |-- spots: string (nullable = true)
 |-- message_date: string (nullable = true)
 |-- tfnsw_facility_id: string (nullable = true)
 |-- facility_occupancy_loop: string (nullable = true)
 |-- facility_occupancy_total: string (nullable = true)
 |-- facility_occupancy_monthlies: string (nullable = true)
 |-- facility_occupancy_open_gate: string (nullable = true)
 |-- facility_occupancy_transients: string (nullable = true)



In [5]:
## Schema of the JSON payload carried in the Kafka `value` field of the
## Car Park Zone topic. Every attribute is declared as a nullable string;
## the order of the names below is the order of the fields in the struct.
schema_car_park_zone_struct = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "zone_id",
        "facility_id",
        "message_date",
        "zone_name",
        "spots",
        "parent_zone_id",
        "zone_occupancy_loop",
        "zone_occupancy_total",
        "zone_occupancy_monthlies",
        "zone_occupancy_open_gate",
        "zone_occupancy_transients",
    )
])

In [6]:
### Subscribe to the topic "nsw_car_park_zone" on the Kafka broker and build
### the streaming dataframe car_park_zone_stream_df, starting from the
### earliest available offsets. The chain below:
###   1. casts the Kafka key and value columns to strings,
###   2. parses the value string as JSON using schema_car_park_zone_struct,
###   3. flattens the parsed fields and renames key, topic, timestamp to
###      event_key, event_topic, event_timestamp respectively,
###   4. drops duplicate records sharing the same facility_id, zone_id and
###      message_date.
### NOTE(review): no watermark is defined before dropDuplicates, so the
### deduplication state presumably grows for the lifetime of the query -- confirm
### this is acceptable for the expected data volume.
car_park_zone_stream_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker:29092")
    .option("startingOffsets", "earliest")
    .option("subscribe", "nsw_car_park_zone")
    .load()
    .withColumn("key", F.col("key").cast(StringType()))
    .withColumn("value", F.col("value").cast(StringType()))
    .withColumn("value", F.from_json("value", schema_car_park_zone_struct))
    .select(
        F.col("key").alias("event_key"),
        F.col("topic").alias("event_topic"),
        F.col("timestamp").alias("event_timestamp"),
        *[
            F.col(f"value.{field_name}")
            for field_name in (
                "zone_id",
                "facility_id",
                "message_date",
                "zone_name",
                "spots",
                "parent_zone_id",
                "zone_occupancy_loop",
                "zone_occupancy_total",
                "zone_occupancy_monthlies",
                "zone_occupancy_open_gate",
                "zone_occupancy_transients",
            )
        ],
    )
    .dropDuplicates(["facility_id", "zone_id", "message_date"])
)

### Print the schema of car_park_zone_stream_df
car_park_zone_stream_df.printSchema()

root
 |-- event_key: string (nullable = true)
 |-- event_topic: string (nullable = true)
 |-- event_timestamp: timestamp (nullable = true)
 |-- zone_id: string (nullable = true)
 |-- facility_id: string (nullable = true)
 |-- message_date: string (nullable = true)
 |-- zone_name: string (nullable = true)
 |-- spots: string (nullable = true)
 |-- parent_zone_id: string (nullable = true)
 |-- zone_occupancy_loop: string (nullable = true)
 |-- zone_occupancy_total: string (nullable = true)
 |-- zone_occupancy_monthlies: string (nullable = true)
 |-- zone_occupancy_open_gate: string (nullable = true)
 |-- zone_occupancy_transients: string (nullable = true)



In [7]:
# Start one streaming query per dataframe, each writing to the "memory" sink.
# The query names (nsw_car_park_facility_view, nsw_car_park_zone_view) are the
# table names used by the spark.sql(...) statement in the monitoring cell.
car_park_facility_stream = (
    car_park_facility_stream_df.writeStream
    .format("memory")
    .queryName("nsw_car_park_facility_view")
    .start()
)

car_park_zone_stream = (
    car_park_zone_stream_df.writeStream
    .format("memory")
    .queryName("nsw_car_park_zone_view")
    .start()
)

In [28]:
# Staleness monitor: once per polling interval, list every facility and zone
# whose most recent message is older than the staleness threshold. The loop
# runs until the kernel is interrupted.
STALE_AFTER_MINUTES = 30     # a latest record older than this is reported
POLL_INTERVAL_SECONDS = 60   # pause between two consecutive checks

# Loop-invariant objects are built once instead of on every iteration.
tz = pytz.timezone('Australia/Sydney')

# Take the latest record per facility and per (facility, zone), union both
# sets, and keep only the rows older than the threshold. Facility-level rows
# carry an empty string as zone_id.
# NOTE(review): the cutoff shifts current_timestamp() to Sydney wall-clock time
# while message_date is parsed without any zone information; this looks like it
# assumes a UTC Spark session time zone and Sydney-local message_date values --
# confirm against the cluster configuration and the producer.
stale_records_query = f"""
    select
        facility_id, zone_id, message_datetime
    from
    (
        (
            SELECT
                facility_id, "" as zone_id, message_date, to_timestamp(message_date, "yyyy-MM-dd'T'HH:mm:ss") as message_datetime
            FROM (
                SELECT
                    *,
                    row_number() over (partition by facility_id order by message_date desc) as rn
                FROM nsw_car_park_facility_view
            ) t1
            WHERE rn = 1
        )
        UNION
        (
            SELECT
                facility_id, zone_id, message_date, to_timestamp(message_date, "yyyy-MM-dd'T'HH:mm:ss") as message_datetime
            FROM (
                SELECT
                    *,
                    row_number() over (partition by facility_id, zone_id order by message_date desc) as rn
                FROM nsw_car_park_zone_view
            ) t2
            WHERE rn = 1
        )
    ) t3
    where message_datetime < (from_utc_timestamp(current_timestamp(),'Australia/Sydney') - INTERVAL {STALE_AFTER_MINUTES} minutes)
"""

try:
    while True:
        clear_output(wait=True)

        # Print the current Sydney wall-clock time as a heartbeat
        now = datetime.now(tz)
        current_time = now.strftime("%H:%M:%S")
        print(f"Current Time : {current_time}")

        # Issue the query on every pass so it reads the current content of
        # the in-memory views.
        new_car_park_zone_df = spark.sql(stale_records_query)

        # Collect exactly once. Calling count() and then toPandas() would run
        # the query twice, and because the views keep receiving data the two
        # runs could disagree with each other.
        new_car_park_zone_pd_df = new_car_park_zone_df.toPandas()

        if new_car_park_zone_pd_df.empty:
            print("All records are latest.")
        else:
            display(new_car_park_zone_pd_df)

        time.sleep(POLL_INTERVAL_SECONDS)

except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to end the monitor.
    pass


Current Time : 03:25:46
All records are latest.


In [None]:
# Stop both streaming queries that feed the in-memory views
for running_query in (car_park_facility_stream, car_park_zone_stream):
    running_query.stop()

In [None]:
# Stop the Spark session created at the top of the notebook.
# Run this only after the streaming queries have been stopped.
spark.stop()