UTS, MDSI, Autumn 2021 , Big Data Engineering(94693)

Assignment 3 - Streaming with Kafka and Spark Streaming, 13 June 2021

Group Project by
* Irfan
* Ming 
* Jedo
* Hnin Pwint Tin / 13738339  

AT3_Part4_Machine_Learning_Model.ipynb

This script intends to 
* build Machine Learning- Linear Regression Model on Historical Car Park Occupancy Data 
* make prediction of future car park occupancy on Real-time Occupancy Stream Data
* Stream the prediction result by Kafka Producer to Kafka Broker

Prior to running this script, AT3_Producer.ipnyb has to be run.

In [1]:
from IPython.display import display, clear_output
from datetime import datetime
import time
from pathlib import Path

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DataType, FloatType,TimestampType
import pyspark.sql.functions as sf
from pyspark.sql.functions import lit, date_format, col, udf, struct

from pyspark.sql.window import Window

## Create Spark Session

In [2]:
# Create SparkSession
spark = SparkSession.builder \
        .appName('kafka') \
        .config("spark.driver.memory", "15g") \
        .getOrCreate()

## Subscribe the Zone Occupancy Live Stream

In [3]:
## Create the schema of the value field for Car Park Zone
schema_car_park_zone_struct = StructType([
    StructField("zone_id", StringType(),  True),
    StructField("facility_id", StringType(),  True),
    StructField("message_date", StringType(),  True),
    StructField("zone_name", StringType(),  True),
    StructField("spots", StringType(),  True),
    StructField("parent_zone_id", StringType(),  True),
    StructField("zone_occupancy_loop", StringType(), True),
    StructField("zone_occupancy_total", StringType(), True),
    StructField("zone_occupancy_monthlies", StringType(), True),
    StructField("zone_occupancy_open_gate", StringType(),  True),
    StructField("zone_occupancy_transients", StringType(), True),
])

In [4]:
#### Subscribe the topic "nsw_car_park_zone" from the Kafka broker and 
### Read the earlierst data into the Spark dataframe called car_park_zone_stream_df
car_park_zone_stream_df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "broker:29092") \
  .option("startingOffsets", "earliest") \
  .option("subscribe", "nsw_car_park_zone") \
  .load()

### Convert the columns key and value from car_park_zone_stream_df to string and 
### Save the results into the dataframe again
car_park_zone_stream_df = car_park_zone_stream_df \
    .withColumn("key", car_park_zone_stream_df["key"].cast(StringType())) \
    .withColumn("value", car_park_zone_stream_df["value"].cast(StringType()))

### Convert the column value of string_stream_df to JSON and
### Save the results to the dataframe again 
car_park_zone_stream_df = car_park_zone_stream_df \
    .withColumn("value", F.from_json("value", schema_car_park_zone_struct))

### Flatten the columns from value and rename the columns key,
### topic, timestamp to respectively event_key, event_topic, event_timestamp
car_park_zone_stream_df = car_park_zone_stream_df \
    .select( \
        F.col("key").alias("event_key"), \
        F.col("topic").alias("event_topic"), \
        F.col("timestamp").alias("event_timestamp"), \
        "value.zone_id", \
        "value.facility_id", \
        "value.message_date", \
        "value.zone_name", \
        "value.spots", \
        "value.parent_zone_id", \
        "value.zone_occupancy_loop", \
        "value.zone_occupancy_total", \
        "value.zone_occupancy_monthlies", \
        "value.zone_occupancy_open_gate", \
        "value.zone_occupancy_transients"
    )

### Print the schema of car_park_zone_stream_df
car_park_zone_stream_df.printSchema()

root
 |-- event_key: string (nullable = true)
 |-- event_topic: string (nullable = true)
 |-- event_timestamp: timestamp (nullable = true)
 |-- zone_id: string (nullable = true)
 |-- facility_id: string (nullable = true)
 |-- message_date: string (nullable = true)
 |-- zone_name: string (nullable = true)
 |-- spots: string (nullable = true)
 |-- parent_zone_id: string (nullable = true)
 |-- zone_occupancy_loop: string (nullable = true)
 |-- zone_occupancy_total: string (nullable = true)
 |-- zone_occupancy_monthlies: string (nullable = true)
 |-- zone_occupancy_open_gate: string (nullable = true)
 |-- zone_occupancy_transients: string (nullable = true)



In [5]:
car_park_zone_stream = car_park_zone_stream_df \
    .writeStream \
    .format("memory") \
    .queryName("nsw_car_park_zone_view") \
    .start()

In [6]:
nsw_car_park_zone_df = spark.sql('SELECT * FROM nsw_car_park_zone_view')

In [7]:
nsw_car_park_zone_df.printSchema()

root
 |-- event_key: string (nullable = true)
 |-- event_topic: string (nullable = true)
 |-- event_timestamp: timestamp (nullable = true)
 |-- zone_id: string (nullable = true)
 |-- facility_id: string (nullable = true)
 |-- message_date: string (nullable = true)
 |-- zone_name: string (nullable = true)
 |-- spots: string (nullable = true)
 |-- parent_zone_id: string (nullable = true)
 |-- zone_occupancy_loop: string (nullable = true)
 |-- zone_occupancy_total: string (nullable = true)
 |-- zone_occupancy_monthlies: string (nullable = true)
 |-- zone_occupancy_open_gate: string (nullable = true)
 |-- zone_occupancy_transients: string (nullable = true)



### Initail Data Quality Check Performed on Stream Data

In [8]:
### Identify negative value of zone occupancy
nsw_car_park_zone_df.filter(F.col('zone_occupancy_total') < 0)\
.select('zone_name', 'zone_id', 'zone_occupancy_total', 'spots').limit(5).toPandas()

Unnamed: 0,zone_name,zone_id,zone_occupancy_total,spots
0,Kiama Car Park,1,-1,42
1,Kiama Car Park,1,-1,42
2,Kiama Car Park,1,-1,42
3,Kiama Car Park,1,-1,42
4,Kiama Car Park,1,-1,42


In [84]:
### Zone Occupancy Greater Than Capacity
## The result may be incorrect - the values are String
nsw_car_park_zone_df.filter(F.col('zone_occupancy_total') > (F.col('spots')) * 1.1)\
.select('zone_name', 'zone_id', 'zone_occupancy_total', 'spots').toPandas()

Unnamed: 0,zone_name,zone_id,zone_occupancy_total,spots


In [83]:
nsw_car_park_zone_df.filter(F.col('zone_occupancy_total').isNull())\
.select('zone_name', 'zone_id', 'zone_occupancy_total', 'spots').limit(5).toPandas()

Unnamed: 0,zone_name,zone_id,zone_occupancy_total,spots
0,SYD318 Ashfield Park and Ride,1,,180
1,SYD319 Kogarah Park and Ride,2,,259
2,Multi Level,4,,874
3,on grade,5,,739
4,SYD326 Manly Vale Park and Ride,1,,142


In [12]:
nsw_car_park_zone_df.filter(F.col('zone_occupancy_total').isNotNull() & (F.col('zone_occupancy_total') >= 0) & \
(F.col('zone_occupancy_total') < (F.col('spots')) * 100) ).limit(2).toPandas()

Unnamed: 0,event_key,event_topic,event_timestamp,zone_id,facility_id,message_date,zone_name,spots,parent_zone_id,zone_occupancy_loop,zone_occupancy_total,zone_occupancy_monthlies,zone_occupancy_open_gate,zone_occupancy_transients
0,CPS-CUD1,nsw_car_park_zone,2021-06-01 10:56:47.866,CPS-CUD1,1,2021-06-01T20:56:46,Tallawong Station At-Grade A Car Park,152,0,,22,,,
1,CPS-CUD2,nsw_car_park_zone,2021-06-01 10:56:47.867,CPS-CUD2,1,2021-06-01T20:56:46,Tallawong Station At-Grade B Car Park,455,0,,9,,,


## Load the Saved Historical Streamed Records in Parquet Format

In [7]:
# Constant Parquet File Path
CAR_PARK_ZONE_FILEPATH = "data/car_park_zone.parquet"

In [8]:
saved_car_park_zone_df = spark.read.parquet(CAR_PARK_ZONE_FILEPATH)

In [None]:
saved_car_park_zone_df.show(2)

### Data Quality Investigation

In [10]:
saved_car_park_zone_df.filter(F.col('zone_occupancy_total') < 0)\
.select('zone_name', 'zone_id', 'zone_occupancy_total', 'spots')\
.select('zone_name').distinct().toPandas()

Unnamed: 0,zone_name
0,Kiama Car Park
1,Gordon Henry St North


In [19]:
saved_car_park_zone_df.filter(F.col('zone_occupancy_total').isNull())\
.select('zone_name', 'zone_id', 'zone_occupancy_total', 'spots')\
.select('zone_name').distinct().toPandas()

Unnamed: 0,zone_name
0,SYD326 Manly Vale Park and Ride
1,Multi Level
2,SYD319 Kogarah Park and Ride
3,on grade
4,SYD318 Ashfield Park and Ride


In [21]:
saved_car_park_zone_df.printSchema()

root
 |-- event_key: string (nullable = true)
 |-- event_topic: string (nullable = true)
 |-- event_timestamp: timestamp (nullable = true)
 |-- zone_id: string (nullable = true)
 |-- facility_id: string (nullable = true)
 |-- message_date: string (nullable = true)
 |-- zone_name: string (nullable = true)
 |-- spots: string (nullable = true)
 |-- parent_zone_id: string (nullable = true)
 |-- zone_occupancy_loop: string (nullable = true)
 |-- zone_occupancy_total: string (nullable = true)
 |-- zone_occupancy_monthlies: string (nullable = true)
 |-- zone_occupancy_open_gate: string (nullable = true)
 |-- zone_occupancy_transients: string (nullable = true)
 |-- rn: integer (nullable = true)
 |-- message_datetime: timestamp (nullable = true)
 |-- year: integer (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (n

### Transform Data Types

In [11]:
## Change Categorical Features to String Type
car_park_zone_df = saved_car_park_zone_df.withColumn('facility_id', F.col('facility_id').astype(StringType())).\
        withColumn('dayofweek', F.col('dayofweek').astype(StringType())).\
        withColumn('hour', F.col('hour').astype(StringType())).\
        withColumn('minute', F.col('minute').astype(StringType())).\
        withColumn('zone_occupancy_total', F.col('zone_occupancy_total').astype(IntegerType())).\
        withColumn('spots', F.col('spots').astype(IntegerType()))

In [12]:
## Subset Columns for interested features for modelling
cols = ['zone_id', 'spots', 'zone_occupancy_total', 'dayofweek', 'hour', 'minute', 'facility_id', 'message_date']
car_park_zone_df = car_park_zone_df.select(cols)

In [13]:
# car_park_zone_df.printSchema()
car_park_zone_df.show(2)

+-------+-----+--------------------+---------+----+------+-----------+-------------------+
|zone_id|spots|zone_occupancy_total|dayofweek|hour|minute|facility_id|       message_date|
+-------+-----+--------------------+---------+----+------+-----------+-------------------+
|      1|   42|                  -1|        7|   4|    33|          7|2021-06-05T04:33:28|
|      1|  213|                   5|        7|   4|    33|          6|2021-06-05T04:33:36|
+-------+-----+--------------------+---------+----+------+-----------+-------------------+
only showing top 2 rows



### Perform Data Cleansing - filtering out negative / NULL / occupancy > capacity

In [14]:
### Perform mathematical conditional filtering after type change
zone_new_stream_cleaned = car_park_zone_df.filter(F.col('zone_occupancy_total').isNotNull() & \
                                                         (F.col('zone_occupancy_total') >= 0) & \
                                                         (F.col('zone_occupancy_total') < (col('spots')) * 1.1) )

In [15]:
## Confirm the result of above cleansing task - condition 2
zone_new_stream_cleaned.filter(F.col('zone_occupancy_total') < 0)\
.select('zone_id', 'zone_occupancy_total', 'spots')\
.select('zone_id').distinct().toPandas()

Unnamed: 0,zone_id


In [23]:
## Confirm the result of above cleansing task - condition  1
zone_new_stream_cleaned.filter(F.col('zone_occupancy_total').isNull())\
.select('zone_id', 'zone_occupancy_total', 'spots')\
.select('zone_id').distinct().toPandas()

Unnamed: 0,zone_id


In [24]:
## Confirm the result of above cleansing task - condition 3
zone_new_stream_cleaned.filter((F.col('zone_occupancy_total') > (col('spots')) * 1.1))\
.select('zone_id', 'zone_occupancy_total', 'spots')\
.select('zone_id').distinct().toPandas()

Unnamed: 0,zone_id


In [12]:
zone_new_stream_cleaned.printSchema()

root
 |-- zone_id: string (nullable = true)
 |-- spots: integer (nullable = true)
 |-- zone_occupancy_total: integer (nullable = true)
 |-- dayofweek: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- minute: string (nullable = true)
 |-- facility_id: string (nullable = true)
 |-- message_date: string (nullable = true)



### Add Occupancy Rate Feature by calculating

In [16]:
car_park_zone_cleaned = zone_new_stream_cleaned.withColumn('zone_occupancy_rate', F.round(((F.col('zone_occupancy_total')/ F.col('spots')) * 100),2))

In [17]:
# car_park_zone_cleaned.printSchema()
car_park_zone_cleaned.show(2)

+-------+-----+--------------------+---------+----+------+-----------+-------------------+-------------------+
|zone_id|spots|zone_occupancy_total|dayofweek|hour|minute|facility_id|       message_date|zone_occupancy_rate|
+-------+-----+--------------------+---------+----+------+-----------+-------------------+-------------------+
|      1|  213|                   5|        7|   4|    33|          6|2021-06-05T04:33:36|               2.35|
|      1|  213|                   5|        7|   4|    33|          6|2021-06-05T04:33:51|               2.35|
+-------+-----+--------------------+---------+----+------+-----------+-------------------+-------------------+
only showing top 2 rows



### Add Feature of Future Occupancy Rate by  Window LEAD function

In [18]:
# Define Window
wlead = Window.partitionBy([car_park_zone_cleaned.zone_id, car_park_zone_cleaned.facility_id]).orderBy(car_park_zone_cleaned.message_date)

In [19]:
car_park_zone_cleaned = car_park_zone_cleaned.withColumn('future_zone_occupancy_rate',F.lead(F.col('zone_occupancy_rate'),1).over(wlead))
car_park_zone_cleaned.show(5)

+--------+-----+--------------------+---------+----+------+-----------+-------------------+-------------------+--------------------------+
| zone_id|spots|zone_occupancy_total|dayofweek|hour|minute|facility_id|       message_date|zone_occupancy_rate|future_zone_occupancy_rate|
+--------+-----+--------------------+---------+----+------+-----------+-------------------+-------------------+--------------------------+
|CPS-KVE2| 1006|                  18|        1|   0|     0|          2|2021-05-09T00:00:55|               1.79|                      1.79|
|CPS-KVE2| 1006|                  18|        1|   0|     1|          2|2021-05-09T00:01:55|               1.79|                      1.79|
|CPS-KVE2| 1006|                  18|        1|   0|     2|          2|2021-05-09T00:02:55|               1.79|                      1.59|
|CPS-KVE2| 1006|                  16|        1|   0|     3|          2|2021-05-09T00:03:55|               1.59|                      1.29|
|CPS-KVE2| 1006|           

In [20]:
### Final Feature List for modelling , without "message_date"
columns = ['zone_id', 'dayofweek', 'hour', 'minute', 'facility_id', 'zone_occupancy_rate', 'future_zone_occupancy_rate']
car_park_zone_train_df = car_park_zone_cleaned.select(columns)

car_park_zone_train_df.printSchema()
car_park_zone_train_df.show(2)

root
 |-- zone_id: string (nullable = true)
 |-- dayofweek: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- minute: string (nullable = true)
 |-- facility_id: string (nullable = true)
 |-- zone_occupancy_rate: float (nullable = true)
 |-- future_zone_occupancy_rate: float (nullable = true)

+--------+---------+----+------+-----------+-------------------+--------------------------+
| zone_id|dayofweek|hour|minute|facility_id|zone_occupancy_rate|future_zone_occupancy_rate|
+--------+---------+----+------+-----------+-------------------+--------------------------+
|CPS-KVE2|        1|   0|     0|          2|               1.79|                      1.79|
|CPS-KVE2|        1|   0|     1|          2|               1.79|                      1.79|
+--------+---------+----+------+-----------+-------------------+--------------------------+
only showing top 2 rows



### Build Pipeline for Data Processing - Categorical Encoding

In [21]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder

cat_cols = ['zone_id', 'dayofweek','hour', 'minute', 'facility_id']
stages = []

for cat_col in cat_cols:
    col_indexer = StringIndexer(inputCol=cat_col, outputCol=f"{cat_col}_ind")
    col_encoder = OneHotEncoder(inputCols=[f"{cat_col}_ind"], outputCols=[f"{cat_col}_ohe"])
    stages += [col_indexer, col_encoder]
cat_cols_ohe = [f"{cat_col}_ohe" for cat_col in cat_cols]

In [22]:
## Vectorize the Numerical Features 

num_cols = ['zone_occupancy_rate', 'future_zone_occupancy_rate']

assembler = VectorAssembler(inputCols=cat_cols_ohe + num_cols, outputCol="features")
assembler.setHandleInvalid("keep")

VectorAssembler_7b5759e45f31

In [23]:
stages += [assembler]

pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(car_park_zone_train_df)

In [24]:
pipelined_train_df = pipeline_model.transform(car_park_zone_train_df)

In [25]:
pipelined_train_df.show(2)

+--------+---------+----+------+-----------+-------------------+--------------------------+-----------+-------------+-------------+-------------+--------+---------------+----------+---------------+---------------+---------------+--------------------+
| zone_id|dayofweek|hour|minute|facility_id|zone_occupancy_rate|future_zone_occupancy_rate|zone_id_ind|  zone_id_ohe|dayofweek_ind|dayofweek_ohe|hour_ind|       hour_ohe|minute_ind|     minute_ohe|facility_id_ind|facility_id_ohe|            features|
+--------+---------+----+------+-----------+-------------------+--------------------------+-----------+-------------+-------------+-------------+--------+---------------+----------+---------------+---------------+---------------+--------------------+
|CPS-KVE2|        1|   0|     0|          2|               1.79|                      1.79|        7.0|(8,[7],[1.0])|          4.0|(6,[4],[1.0])|    19.0|(23,[19],[1.0])|      59.0|     (59,[],[])|            3.0|  (6,[3],[1.0])|(104,[7,12,33,99,.

In [38]:
pipelined_train_df.printSchema()

root
 |-- zone_id: string (nullable = true)
 |-- dayofweek: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- minute: string (nullable = true)
 |-- facility_id: string (nullable = true)
 |-- zone_occupancy_rate: float (nullable = true)
 |-- future_zone_occupancy_rate: float (nullable = true)
 |-- zone_id_ind: double (nullable = false)
 |-- zone_id_ohe: vector (nullable = true)
 |-- dayofweek_ind: double (nullable = false)
 |-- dayofweek_ohe: vector (nullable = true)
 |-- hour_ind: double (nullable = false)
 |-- hour_ohe: vector (nullable = true)
 |-- minute_ind: double (nullable = false)
 |-- minute_ohe: vector (nullable = true)
 |-- facility_id_ind: double (nullable = false)
 |-- facility_id_ohe: vector (nullable = true)
 |-- features: vector (nullable = true)



In [26]:
pipelined_train_df = pipelined_train_df.select(['features'] + columns)

In [27]:
pipelined_train_df.show(2)

+--------------------+--------+---------+----+------+-----------+-------------------+--------------------------+
|            features| zone_id|dayofweek|hour|minute|facility_id|zone_occupancy_rate|future_zone_occupancy_rate|
+--------------------+--------+---------+----+------+-----------+-------------------+--------------------------+
|(104,[7,12,33,99,...|CPS-KVE2|        1|   0|     0|          2|               1.79|                      1.79|
|(104,[7,12,33,57,...|CPS-KVE2|        1|   0|     1|          2|               1.79|                      1.79|
+--------------------+--------+---------+----+------+-----------+-------------------+--------------------------+
only showing top 2 rows



In [41]:
pipelined_train_df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- zone_id: string (nullable = true)
 |-- dayofweek: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- minute: string (nullable = true)
 |-- facility_id: string (nullable = true)
 |-- zone_occupancy_rate: float (nullable = true)
 |-- future_zone_occupancy_rate: float (nullable = true)



#### Check Again for NULL

In [50]:
#Exmine if any rows with NULL value
print(pipelined_train_df.where(col("future_zone_occupancy_rate").isNull()).count())
print(pipelined_train_df.where(col("zone_occupancy_rate").isNull()).count())

10
0


In [28]:
## If no LEAD records are found, "future_zone_occupancy_rate" column will have NULL values
## THose need to be dropped before proceeding into the model training

pipelined_final_train_df = pipelined_train_df.na.drop(subset=["future_zone_occupancy_rate"])

In [27]:
## Ensure the final data without NULL 

print(pipelined_final_train_df.where(col("future_zone_occupancy_rate").isNull()).count())

0


### Model Training - Linear Regression

In [29]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol = 'features', labelCol='future_zone_occupancy_rate', maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [30]:
lr_model = lr.fit(pipelined_final_train_df)

In [31]:
lr_predictions = lr_model.transform(pipelined_final_train_df)
lr_predictions.select("prediction","future_zone_occupancy_rate","features").show(20)

+------------------+--------------------------+--------------------+
|        prediction|future_zone_occupancy_rate|            features|
+------------------+--------------------------+--------------------+
|2.0139840006759093|                      1.79|(104,[7,12,33,99,...|
|2.0139840006759093|                      1.79|(104,[7,12,33,57,...|
|1.9143664067103072|                      1.59|(104,[7,12,33,76,...|
|1.6660303633593734|                      1.29|(104,[7,12,33,95,...|
|1.4678571626267234|                      1.19|(104,[7,12,33,58,...|
|1.4184024104353663|                      1.19|(104,[7,12,33,52,...|
|1.4184024104353663|                      1.19|(104,[7,12,33,86,...|
| 1.368593583764198|                      1.09|(104,[7,12,33,47,...|
| 1.319138772618126|                      1.09|(104,[7,12,33,87,...|
| 1.319138772618126|                      1.09|(104,[7,12,33,64,...|
| 1.319138772618126|                      1.09|(104,[7,12,33,45,...|
| 1.319138772618126|              

### Model Prediction Accurancy on Training Data

In [32]:
from pyspark.ml.evaluation import RegressionEvaluator

lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="future_zone_occupancy_rate", metricName="rmse")
rmse = lr_evaluator.evaluate(lr_predictions)
print("Root Mean Squared Error (RMSE)  = %g" % rmse)

Root Mean Squared Error (RMSE)  = 0.470734


## Model Prediction of Occupancy Rate on Real-Time Stream Data
## Stream the Prediction Results - Strategy I

In [34]:
import json
from confluent_kafka import Producer

p = Producer({'bootstrap.servers': 'broker:29092'})

def convert_to_json(row):
    return json.dumps(row).encode('utf-8')

def callback(error, message):
    if error:
        print(f"Error: {message.value()}: {error.str()}")
    else:
        print(f"Sucess: {message.value()}")

In [None]:
topic = "Occupancy_Rate_Prediction"
try:
    clear_output(wait=True)
    zone_new_stream_data = spark.sql("""SELECT 
                        zone_id, facility_id,spots, zone_occupancy_total,
                        dayofweek(message_date) as dayofweek,
                        hour(message_date) as hour,
                        minute(message_date) as minute, message_date
                       FROM nsw_car_park_zone_view""")

    ### Transform Data types
    zone_new_stream_df = zone_new_stream_data.withColumn('facility_id', F.col('facility_id').astype(StringType())).\
    withColumn('dayofweek', F.col('dayofweek').astype(StringType())).\
    withColumn('hour', F.col('hour').astype(StringType())).\
    withColumn('minute', F.col('minute').astype(StringType())).\
    withColumn('zone_occupancy_total', F.col('zone_occupancy_total').astype(IntegerType())).\
    withColumn('spots', F.col('spots').astype(IntegerType()))

    ## Omit invalid values - NULL, Negative
    zone_new_stream_cleaned = zone_new_stream_df.filter(col('zone_occupancy_total').isNotNull() & \
                                                     (col('zone_occupancy_total') >= 0) & \
                                                     (col('zone_occupancy_total') < (col('spots')) * 1.1) )
    ## Add feature -  Current Occupancy Rate
    zone_new_stream_df = zone_new_stream_cleaned.withColumn('zone_occupancy_rate', F.round(((F.col('zone_occupancy_total')/ F.col('spots')) * 100),2))
    ## Drop if any NULL
    zone_new_stream_df_cleaned_again = zone_new_stream_df.na.drop(subset=["zone_occupancy_rate"])

    ### Define Window Lead function
    wlead = Window.partitionBy([zone_new_stream_df_cleaned_again.zone_id, zone_new_stream_df_cleaned_again.facility_id]).orderBy(zone_new_stream_df_cleaned_again.message_date)
    ## Shift the engineered featured "zone_occupancy_rate" value of next row to the current row 
    zone_new_stream_cleaned_final = zone_new_stream_df_cleaned_again.withColumn('future_zone_occupancy_rate',F.lead(F.col('zone_occupancy_rate'),1).over(wlead))

    ### Subset for features which will be used in Modelling
    zone_new_stream_cleaned_final = zone_new_stream_cleaned_final.select(columns)

    ## Fit the cleaned new stream data into the previously built Pipeline during Model Training
    pipeline_model_new_data = pipeline.fit(zone_new_stream_cleaned_final)

    ## Trigger the data in the pipeline to active the transformation process
    pipelined_cleaned_new_df = pipeline_model_new_data.transform(zone_new_stream_cleaned_final)

    ### Drop the unkown lead occupancy means dropping rows with no Target values
    pipelined_cleaned_final_df = pipelined_cleaned_new_df.na.drop(subset=["future_zone_occupancy_rate"])

    ## Feed into the pipeline
    pipelined_final_new_stream = pipelined_cleaned_final_df.select(['features'] + columns)

    ## Make Prediction
    lr_predictions_new_stream = lr_model.transform(pipelined_final_new_stream)


    df = lr_predictions_new_stream.select('zone_id', "facility_id", "hour", "minute", "prediction" )
    df = df.withColumn("prediction_time",F.current_timestamp().astype(StringType()))
    
    df2 = df.toPandas()
    
    ### Streaming the prediction Starts 
    for i, row in df2.iterrows():
            send_value = convert_to_json(row.to_dict())
            clear_output(wait=True)
            print(f"========{topic}=======")
            print(send_value)
            p.produce(topic, key=row[0], value=send_value, callback=callback)
            p.poll(0) 
    time.sleep(10)
except KeyboardInterrupt:
    pass

b'{"zone_id": "CPS-CUD1", "facility_id": "1", "hour": "9", "minute": "32", "prediction": 13.955396517972801, "prediction_time": "2021-06-13 08:31:16.72974"}'
Sucess: b'{"zone_id": "CPS-CUD1", "facility_id": "1", "hour": "9", "minute": "31", "prediction": 13.628994917690985, "prediction_time": "2021-06-13 08:31:16.72974"}'


In [None]:
car_park_zone_stream.stop()

In [94]:
spark.stop()