In [43]:
import pyspark
from pyspark.sql import SparkSession

# The Kafka connector artifact is requested at the same version as the
# installed PySpark package.
pyspark_version = pyspark.__version__
kafka_jar_package = "org.apache.spark:spark-sql-kafka-0-10_2.12:" + pyspark_version

# Build (or reuse) a local session; the connector is requested through
# the spark.jars.packages setting.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("GreenTripsConsumer")
    .config("spark.jars.packages", kafka_jar_package)
    .getOrCreate()
)

In [44]:
# Streaming source: subscribe to the "green-trips" topic on the local broker,
# starting from the earliest available offsets.
green_stream = (
    spark.readStream
    .format("kafka")
    .options(
        **{
            "kafka.bootstrap.servers": "localhost:9092",
            "subscribe": "green-trips",
            "startingOffsets": "earliest",
        }
    )
    .load()
)

To check that we can consume from the stream, let's look at the first record in it.

In Spark streaming, the stream is represented as a sequence of small batches, each batch being a small RDD (or a small dataframe).

So we can execute a function over each mini-batch. Let's run `take(1)` on each one to see what we have in the stream:


In [45]:
def peek(mini_batch, batch_id):
    """Print the first row of a mini-batch; print nothing if it has no rows.

    Args:
        mini_batch: The mini-batch to inspect; only its ``take`` method is used.
        batch_id: Identifier supplied by the caller; not used here.
    """
    rows = mini_batch.take(1)
    if not rows:
        return

    print(rows[0])


# Start a streaming query that hands every mini-batch to `peek`.
query = (
    green_stream.writeStream
    .foreachBatch(peek)
    .start()
)

24/03/19 07:52:57 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-3325b383-f10c-4833-b32a-f5076ce0fa18. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/03/19 07:52:57 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


The data is JSON, but currently it's in binary format. We need to parse it and turn it into a streaming dataframe with proper columns.

As with regular (batch) PySpark DataFrames, we first define the schema:


In [46]:
from pyspark.sql import types

# Expected fields of one JSON trip record; the two datetime fields are kept
# as plain strings.
schema = types.StructType(
    [
        types.StructField("Index", types.IntegerType()),
        types.StructField("lpep_pickup_datetime", types.StringType()),
        types.StructField("lpep_dropoff_datetime", types.StringType()),
        types.StructField("PULocationID", types.IntegerType()),
        types.StructField("DOLocationID", types.IntegerType()),
        types.StructField("passenger_count", types.DoubleType()),
        types.StructField("trip_distance", types.DoubleType()),
        types.StructField("tip_amount", types.DoubleType()),
    ]
)

In [47]:
from pyspark.sql import functions as F

# Inspect the schema of the raw Kafka payload column.
# printSchema() only prints and returns None, so its result must not be
# assigned back to `green_stream` -- doing so would replace the stream with
# None and break every later cell that uses it.
green_stream.select(F.col("value")).printSchema()

root
 |-- value: binary (nullable = true)



24/03/19 07:52:58 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


Row(key=None, value=bytearray(b'"{\\"Index\\": 0, \\"lpep_pickup_datetime\\": \\"2019-10-01 00:26:02\\", \\"lpep_dropoff_datetime\\": \\"2019-10-01 00:39:58\\", \\"PULocationID\\": 112, \\"DOLocationID\\": 196, \\"passenger_count\\": 1.0, \\"trip_distance\\": 5.88, \\"tip_amount\\": 0.0}"'), topic='green-trips', partition=0, offset=0, timestamp=datetime.datetime(2024, 3, 19, 3, 11, 15, 100000), timestampType=0)


In [None]:
from pyspark.sql import functions as F

# Cast the Kafka payload to a string, parse it as JSON using `schema`, and
# promote the parsed struct's fields to top-level columns.
# NOTE(review): the sample record printed earlier in this notebook appears to
# be a JSON string wrapping the JSON object (double-encoded); if so, from_json
# may produce nulls here -- confirm against the producer.
payload_text = F.col("value").cast("string")
parsed_trip = F.from_json(payload_text, schema).alias("data")
green_stream = green_stream.select(parsed_trip).select("data.*")

# NOTE(review): this rebinds `query`; the streaming query started earlier
# under the same name is not stopped before being replaced.
query = (
    green_stream.writeStream
    .foreachBatch(peek)
    .start()
)

In [None]:
# Stop the streaming query currently bound to `query`.
query.stop()