In [1]:
from pyspark.sql import SparkSession

import pyspark.sql.functions as F

spark = (
    SparkSession.builder.appName("Streaming")  # type: ignore
    .master("spark://spark:7077")
    # .config(conf)
    .getOrCreate()
)
# Set configuration for Notebook display
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark.sparkContext.setLogLevel("WARN")

:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /tmp/.ivy/cache
The jars for the packages stored in: /tmp/.ivy/jars
org.apache.spark#spark-avro_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-50b464c6-19b6-4750-9968-184838b0fe66;1.0
	confs: [default]
	found org.apache.spark#spark-avro_2.12;3.5.0 in central
	found org.tukaani#xz;1.9 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.0 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 

In [2]:
# Read data from the topic as a DataFrame.
raw_topic_df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:19092")
    .option("subscribe", "OpenSeaRawEvents")
    .load()
)
raw_topic_df.printSchema()
parsed_df = raw_topic_df.selectExpr(
    "CAST(value as string) as json_value",
    "timestamp as processed_at",
)
nft_data = parsed_df.selectExpr(
    "get_json_object(json_value, '$.payload.payload.collection.slug') as collection_slug",
    "to_timestamp(get_json_object(json_value, '$.payload.sent_at')) as sent_at",
)
agg_df = (
    nft_data.withWatermark(
        "sent_at", "1 minute"
    )  # Define watermark to handle late data
    .groupBy(
        F.window(
            "sent_at", "1 minute", "1 minute"
        ),  # Every 1 hour window, with 1 minute slide (Slide is the interval at which the window is updated)
        "collection_slug",
    )
    .count()
)
agg_df = agg_df.select(
    F.col("window.start").alias("window_start"),
    F.col("window.end").alias("window_end"),
    "collection_slug",
    "count",
)
agg_df.printSchema()
# query = agg_df.writeStream.format("console").outputMode("complete")
# query.start().awaitTermination()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)

root
 |-- window_start: timestamp (nullable = true)
 |-- window_end: timestamp (nullable = true)
 |-- collection_slug: string (nullable = true)
 |-- count: long (nullable = false)



In [4]:
import os
# Start the Structured Streaming query
# Write to s3a://processed-data every 5 seconds, in parquet format
# Set the S3 bucket host to the localstack host 'minio', and use the access key and secret key
# Parquet format does not support the 'complete' output mode, so we use 'append' instead
# 'append' mode will write the new data to the existing data in the parquet file,
query = (
    agg_df.select(F.to_json(F.struct("*")).alias("value"))
    .writeStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:19092")
    .option(
        "checkpointLocation",
        "s3a://processed-data/checkpoints/topics/OpenSeaEnrichedEvents/",
    )
    .option("topic", "OpenSeaEnrichedEvents")
)
query.start().awaitTermination()

24/06/11 18:09:21 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/06/11 18:09:22 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/06/11 18:09:23 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
[Stage 108:>                                                        (0 + 1) / 1]

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/bitnami/python/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/python/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/python/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
24/06/11 18:19:47 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 54, writer: org.apache.spark.sql.kafka010.KafkaStreamingWrite@51150c46] is aborting.
24/06/11 18:19:47 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 54, writer: org.apache.spark.sql.kafka010.KafkaStreami

KeyboardInterrupt: 

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/bitnami/python/lib/python3.11/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/bitnami/python/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/python/lib/python3.11/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


In [None]:
# Read the files in batch:
data = spark.read.parquet("s3a://processed-data/stream")
data.orderBy("count", ascending=False)

                                                                                

window_start,window_end,collection_slug,count
2024-05-27 20:35:00,2024-05-27 20:36:00,datacube-2,51
2024-05-27 20:35:00,2024-05-27 20:36:00,game-item-11,38
2024-05-27 20:34:00,2024-05-27 20:35:00,game-item-10,24
2024-05-27 20:33:00,2024-05-27 20:34:00,game-item-10,23
2024-05-27 20:33:00,2024-05-27 20:34:00,datacube-1,20
2024-05-27 20:32:00,2024-05-27 20:33:00,game-item-10,19
2024-05-27 20:28:00,2024-05-27 20:29:00,game-item-10,16
2024-05-27 20:30:00,2024-05-27 20:31:00,game-item-10,16
2024-05-27 20:35:00,2024-05-27 20:36:00,game-item-10,15
2024-05-27 20:28:00,2024-05-27 20:29:00,datacube-1,15
