In [32]:
import pyspark
from pyspark.sql import SparkSession, functions as F, types as T
from pyspark.sql.functions import from_json, col, udf
import json


pyspark_version = pyspark.__version__
kafka_jar_package = f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version}"

spark = (
    SparkSession.builder.master("local[*]")
    .appName("GreenTripsConsumer")
    .config("spark.jars.packages", kafka_jar_package)
    .getOrCreate()
)

In [33]:
green_stream = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "green-trips")
    .option("startingOffsets", "earliest")
    .load()
)

In order to test that we can consume from the stream, let's see what will be the first record there.

In Spark streaming, the stream is represented as a sequence of small batches, each batch being a small RDD (or a small dataframe).

So we can execute a function over each mini-batch. Let's run take(1) there to see what do we have in the stream:


In [34]:
# def peek(mini_batch, batch_id):
#     first_row = mini_batch.take(1)

#     if first_row:
#         print(first_row[0])


# query = green_stream.writeStream.foreachBatch(peek).start()

In [35]:
# green_stream.select(col('value')).writeStream.foreachBatch(peek).start()

The data is JSON, but currently it's in binary format. We need to parse it and turn it into a streaming dataframe with proper columns.

Similarly to PySpark, we define the schema


In [36]:
from pyspark.sql import types as T

schema = T.StructType(
    [
        T.StructField("lpep_pickup_datetime", T.StringType(), True),
        T.StructField("lpep_dropoff_datetime", T.StringType(), True),
        T.StructField("PULocationID", T.IntegerType(), True),
        T.StructField("DOLocationID", T.IntegerType(), True),
        T.StructField("passenger_count", T.DoubleType(), True),
        T.StructField("trip_distance", T.DoubleType(), True),
        T.StructField("tip_amount", T.DoubleType(), True),
    ]
)

In [37]:
from pyspark.sql import functions as F

green_stream = green_stream.select(
    F.from_json(F.col("value").cast("STRING"), schema).alias("data")
).select("data.*")

# green_stream.writeStream.foreachBatch(peek).start()

### Streaming Analytics|


In [38]:
popular_destination = green_stream.withColumn("timestamp", F.current_timestamp()) \
    .groupBy(F.window(col("timestamp"), "5 minutes"),col("DOLocationID")) \
    .agg(F.count(col('DOLocationID')).alias('count')) \
    .orderBy(col('count').desc()) 

In [39]:
query = popular_destination \
    .writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("truncate", "false") \
    .start()

query.awaitTermination()

24/03/21 16:58:04 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-0782dbe4-09de-4a30-a39e-4734a74de73d. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/03/21 16:58:04 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/03/21 16:58:04 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.

-------------------------------------------
Batch: 0
-------------------------------------------
+------+------------+-----+
|window|DOLocationID|count|
+------+------------+-----+
+------+------------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+------------+-----+
|window                                    |DOLocationID|count|
+------------------------------------------+------------+-----+
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|74          |4630 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|42          |4127 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|41          |3760 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|75          |3341 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|129         |3008 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|166         |2975 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|7           |2941 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|236         |2042 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|238         |2010 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|223         |1963 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|82          |1724 |
|{2024-

                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+------------+-----+
|window                                    |DOLocationID|count|
+------------------------------------------+------------+-----+
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|74          |13442|
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|42          |11687|
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|41          |10573|
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|75          |9595 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|129         |9030 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|7           |8751 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|166         |8525 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|236         |6045 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|223         |5873 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|238         |5618 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|82          |5307 |
|{2024-

                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------------------------------------------+------------+-----+
|window                                    |DOLocationID|count|
+------------------------------------------+------------+-----+
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|74          |13442|
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|42          |11687|
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|41          |10573|
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|75          |9595 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|129         |9030 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|7           |8751 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|166         |8525 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|236         |6045 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|223         |5873 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|238         |5618 |
|{2024-03-21 16:55:00, 2024-03-21 17:00:00}|82          |5307 |
|{2024-

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/codespace/.python/current/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/codespace/.python/current/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/home/codespace/.python/current/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [40]:
query.stop()