In [1]:
from pyspark.sql import SparkSession

import pyspark.sql.functions as F

# Build (or reuse) the notebook's Spark session against the master at
# spark://spark:7077.
session_builder = SparkSession.builder.appName("Streaming")  # type: ignore
session_builder = session_builder.master("spark://spark:7077")
# session_builder = session_builder.config(conf)
spark = session_builder.getOrCreate()

# Keep driver logging down to warnings and above.
spark.sparkContext.setLogLevel("WARN")
# Notebook display: with eager evaluation enabled, a DataFrame that is the last
# expression of a cell is rendered as a table.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /tmp/.ivy/cache
The jars for the packages stored in: /tmp/.ivy/jars
org.apache.spark#spark-avro_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-753294f7-469b-4e23-9569-8be84bdcc849;1.0
	confs: [default]
	found org.apache.spark#spark-avro_2.12;3.5.0 in central
	found org.tukaani#xz;1.9 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.0 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 

In [2]:
# Subscribe to the Kafka topic and expose it as a streaming DataFrame.
raw_topic_df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:19092")
    .option("subscribe", "OpenSeaRawEvents")
    .load()
)
raw_topic_df.printSchema()

# Kafka delivers `value` as binary: decode it to a JSON string and keep the
# Kafka record timestamp next to it. Note that `processed_at` lives only on
# `parsed_df`; it is not selected into `nft_data` below.
parsed_df = raw_topic_df.selectExpr(
    "CAST(value as string) as json_value",
    "timestamp as processed_at",
)

# Extract the two fields the aggregation needs from the JSON document.
nft_data = parsed_df.selectExpr(
    "get_json_object(json_value, '$.payload.payload.collection.slug') as collection_slug",
    "to_timestamp(get_json_object(json_value, '$.payload.sent_at')) as sent_at",
)

# Count events per collection in 1-minute windows. The window length and the
# slide are both 1 minute, so the windows are tumbling (non-overlapping).
windowed_counts_df = (
    nft_data.withWatermark("sent_at", "1 minute")  # tolerate up to 1 minute of lateness
    .groupBy(
        F.window("sent_at", "1 minute", "1 minute"),
        "collection_slug",
    )
    .count()
)

# Flatten the `window` struct into plain start/end columns.
agg_df = windowed_counts_df.select(
    F.col("window.start").alias("window_start"),
    F.col("window.end").alias("window_end"),
    "collection_slug",
    "count",
)
agg_df.printSchema()

# Console sink in 'complete' mode: the whole result table is printed again on
# every micro-batch. Complete mode keeps all aggregate state, so the watermark
# above does not expire old windows for this query.
# NOTE: `query` is the (unstarted) DataStreamWriter; the running query handle
# is `console_query`.
query = agg_df.writeStream.format("console").outputMode("complete")
console_query = query.start()
try:
    # Blocks the cell until the query fails or the kernel is interrupted.
    console_query.awaitTermination()
finally:
    # Without this, interrupting the cell leaves the streaming query running
    # on the cluster in the background.
    console_query.stop()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- collection_slug: string (nullable = true)
 |-- count: long (nullable = false)

root
 |-- window_start: timestamp (nullable = true)
 |-- window_end: timestamp (nullable = true)
 |-- collection_slug: string (nullable = true)
 |-- count: long (nullable = false)



24/06/09 19:53:55 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-ef9e1f1b-cac4-4757-bb4a-cc27634cd916. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/06/09 19:53:55 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/06/09 19:53:57 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------------+----------+---------------+-----+
|window_start|window_end|collection_slug|count|
+------------+----------+---------------+-----+
+------------+----------+---------------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-------------------+-------------------+--------------------+-----+
|       window_start|         window_end|     collection_slug|count|
+-------------------+-------------------+--------------------+-----+
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-2|    6|
|2024-06-09 19:54:00|2024-06-09 19:55:00|the-sandbox-s-ass...|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-11|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|    juicyposition-13|    6|
|2024-06-09 19:53:00|2024-06-09 19:54:00|pink-blockstylers...|    3|
+-------------------+-------------------+--------------------+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-------------------+-------------------+--------------------+-----+
|       window_start|         window_end|     collection_slug|count|
+-------------------+-------------------+--------------------+-----+
|2024-06-09 19:54:00|2024-06-09 19:55:00|the-sandbox-s-ass...|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-2|    8|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-11|    3|
|2024-06-09 19:54:00|2024-06-09 19:55:00|    juicyposition-13|    8|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-1|    2|
|2024-06-09 19:53:00|2024-06-09 19:54:00|pink-blockstylers...|    3|
|2024-06-09 19:54:00|2024-06-09 19:55:00|       imports-pin-1|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|     blaqua-tech-nft|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|           scores-50|    6|
+-------------------+-------------------+--------------------+-----+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+-------------------+-------------------+--------------------+-----+
|       window_start|         window_end|     collection_slug|count|
+-------------------+-------------------+--------------------+-----+
|2024-06-09 19:54:00|2024-06-09 19:55:00|the-sandbox-s-ass...|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-2|   10|
|2024-06-09 19:54:00|2024-06-09 19:55:00|control-structure...|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-11|    3|
|2024-06-09 19:54:00|2024-06-09 19:55:00|    juicyposition-13|    8|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-10|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-1|    3|
|2024-06-09 19:53:00|2024-06-09 19:54:00|pink-blockstylers...|    3|
|2024-06-09 19:54:00|2024-06-09 19:55:00|       imports-pin-1|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|     blaqua-tech-nft|    1|
|2024-

                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+-------------------+-------------------+--------------------+-----+
|       window_start|         window_end|     collection_slug|count|
+-------------------+-------------------+--------------------+-----+
|2024-06-09 19:54:00|2024-06-09 19:55:00|the-sandbox-s-ass...|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-2|   17|
|2024-06-09 19:54:00|2024-06-09 19:55:00|control-structure...|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-11|    3|
|2024-06-09 19:54:00|2024-06-09 19:55:00|    juicyposition-13|    8|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-10|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-1|    3|
|2024-06-09 19:53:00|2024-06-09 19:54:00|pink-blockstylers...|    3|
|2024-06-09 19:54:00|2024-06-09 19:55:00|       imports-pin-1|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|     blaqua-tech-nft|    2|
|2024-

                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+-------------------+-------------------+--------------------+-----+
|       window_start|         window_end|     collection_slug|count|
+-------------------+-------------------+--------------------+-----+
|2024-06-09 19:54:00|2024-06-09 19:55:00|the-sandbox-s-ass...|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-2|   17|
|2024-06-09 19:54:00|2024-06-09 19:55:00|control-structure...|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-cards-4|    9|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-11|    4|
|2024-06-09 19:54:00|2024-06-09 19:55:00|    juicyposition-13|    8|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-10|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-1|    4|
|2024-06-09 19:53:00|2024-06-09 19:54:00|pink-blockstylers...|    3|
|2024-06-09 19:54:00|2024-06-09 19:55:00|       imports-pin-1|    1|
|2024-

                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+-------------------+-------------------+--------------------+-----+
|       window_start|         window_end|     collection_slug|count|
+-------------------+-------------------+--------------------+-----+
|2024-06-09 19:54:00|2024-06-09 19:55:00|the-sandbox-s-ass...|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-2|   24|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-cards-4|    9|
|2024-06-09 19:54:00|2024-06-09 19:55:00|control-structure...|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-11|    6|
|2024-06-09 19:54:00|2024-06-09 19:55:00|    juicyposition-13|    8|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-10|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-1|    4|
|2024-06-09 19:53:00|2024-06-09 19:54:00|pink-blockstylers...|    3|
|2024-06-09 19:54:00|2024-06-09 19:55:00|       imports-pin-1|    1|
|2024-

                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+-------------------+-------------------+--------------------+-----+
|       window_start|         window_end|     collection_slug|count|
+-------------------+-------------------+--------------------+-----+
|2024-06-09 19:54:00|2024-06-09 19:55:00|the-sandbox-s-ass...|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-2|   29|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-cards-4|    9|
|2024-06-09 19:54:00|2024-06-09 19:55:00|control-structure...|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-11|    7|
|2024-06-09 19:54:00|2024-06-09 19:55:00|    juicyposition-13|    8|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-10|    4|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-1|    6|
|2024-06-09 19:53:00|2024-06-09 19:54:00|pink-blockstylers...|    3|
|2024-06-09 19:54:00|2024-06-09 19:55:00|       imports-pin-1|    1|
|2024-

                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+-------------------+-------------------+--------------------+-----+
|       window_start|         window_end|     collection_slug|count|
+-------------------+-------------------+--------------------+-----+
|2024-06-09 19:54:00|2024-06-09 19:55:00|the-sandbox-s-ass...|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-2|   39|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-cards-4|    9|
|2024-06-09 19:54:00|2024-06-09 19:55:00|control-structure...|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-11|   11|
|2024-06-09 19:54:00|2024-06-09 19:55:00|    juicyposition-13|    8|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-10|    6|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-1|    8|
|2024-06-09 19:53:00|2024-06-09 19:54:00|pink-blockstylers...|    3|
|2024-06-09 19:54:00|2024-06-09 19:55:00|       imports-pin-1|    1|
|2024-

                                                                                

-------------------------------------------
Batch: 9
-------------------------------------------
+-------------------+-------------------+--------------------+-----+
|       window_start|         window_end|     collection_slug|count|
+-------------------+-------------------+--------------------+-----+
|2024-06-09 19:54:00|2024-06-09 19:55:00|the-sandbox-s-ass...|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-2|   40|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-cards-4|    9|
|2024-06-09 19:54:00|2024-06-09 19:55:00|control-structure...|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-11|   12|
|2024-06-09 19:54:00|2024-06-09 19:55:00|    juicyposition-13|    8|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-10|    6|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-1|    8|
|2024-06-09 19:53:00|2024-06-09 19:54:00|pink-blockstylers...|    3|
|2024-06-09 19:54:00|2024-06-09 19:55:00|       imports-pin-1|    1|
|2024-

                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+-------------------+-------------------+--------------------+-----+
|       window_start|         window_end|     collection_slug|count|
+-------------------+-------------------+--------------------+-----+
|2024-06-09 19:54:00|2024-06-09 19:55:00|the-sandbox-s-ass...|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-2|   40|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-cards-4|    9|
|2024-06-09 19:54:00|2024-06-09 19:55:00|control-structure...|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-11|   12|
|2024-06-09 19:55:00|2024-06-09 19:56:00|          datacube-1|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|    juicyposition-13|    8|
|2024-06-09 19:55:00|2024-06-09 19:56:00|        game-item-11|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-10|    6|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-1|    8|
|2024

                                                                                

-------------------------------------------
Batch: 11
-------------------------------------------
+-------------------+-------------------+--------------------+-----+
|       window_start|         window_end|     collection_slug|count|
+-------------------+-------------------+--------------------+-----+
|2024-06-09 19:54:00|2024-06-09 19:55:00|the-sandbox-s-ass...|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-2|   40|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-cards-4|    9|
|2024-06-09 19:54:00|2024-06-09 19:55:00|control-structure...|    1|
|2024-06-09 19:55:00|2024-06-09 19:56:00|       storage-pin-2|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-11|   12|
|2024-06-09 19:55:00|2024-06-09 19:56:00|           scores-50|    1|
|2024-06-09 19:55:00|2024-06-09 19:56:00|          bbqperks-2|    2|
|2024-06-09 19:55:00|2024-06-09 19:56:00|          datacube-1|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|    juicyposition-13|    8|
|2024

                                                                                

-------------------------------------------
Batch: 12
-------------------------------------------
+-------------------+-------------------+--------------------+-----+
|       window_start|         window_end|     collection_slug|count|
+-------------------+-------------------+--------------------+-----+
|2024-06-09 19:54:00|2024-06-09 19:55:00|the-sandbox-s-ass...|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-2|   40|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-cards-4|    9|
|2024-06-09 19:54:00|2024-06-09 19:55:00|control-structure...|    1|
|2024-06-09 19:55:00|2024-06-09 19:56:00|       storage-pin-2|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-11|   12|
|2024-06-09 19:55:00|2024-06-09 19:56:00|        game-item-10|    3|
|2024-06-09 19:55:00|2024-06-09 19:56:00|           scores-50|    1|
|2024-06-09 19:55:00|2024-06-09 19:56:00|          bbqperks-2|    2|
|2024-06-09 19:55:00|2024-06-09 19:56:00|          datacube-1|    2|
|2024

                                                                                

-------------------------------------------
Batch: 13
-------------------------------------------
+-------------------+-------------------+--------------------+-----+
|       window_start|         window_end|     collection_slug|count|
+-------------------+-------------------+--------------------+-----+
|2024-06-09 19:54:00|2024-06-09 19:55:00|the-sandbox-s-ass...|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-2|   40|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-cards-4|    9|
|2024-06-09 19:54:00|2024-06-09 19:55:00|control-structure...|    1|
|2024-06-09 19:55:00|2024-06-09 19:56:00|       storage-pin-2|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-11|   12|
|2024-06-09 19:55:00|2024-06-09 19:56:00|      mappings-pin-1|    1|
|2024-06-09 19:55:00|2024-06-09 19:56:00|        game-item-10|    4|
|2024-06-09 19:55:00|2024-06-09 19:56:00|           scores-50|    1|
|2024-06-09 19:55:00|2024-06-09 19:56:00|          bbqperks-2|    2|
|2024

                                                                                

-------------------------------------------
Batch: 14
-------------------------------------------
+-------------------+-------------------+--------------------+-----+
|       window_start|         window_end|     collection_slug|count|
+-------------------+-------------------+--------------------+-----+
|2024-06-09 19:54:00|2024-06-09 19:55:00|the-sandbox-s-ass...|    2|
|2024-06-09 19:54:00|2024-06-09 19:55:00|          datacube-2|   40|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-cards-4|    9|
|2024-06-09 19:54:00|2024-06-09 19:55:00|control-structure...|    1|
|2024-06-09 19:55:00|2024-06-09 19:56:00|       storage-pin-2|    1|
|2024-06-09 19:54:00|2024-06-09 19:55:00|        game-item-11|   12|
|2024-06-09 19:55:00|2024-06-09 19:56:00|      mappings-pin-1|    1|
|2024-06-09 19:55:00|2024-06-09 19:56:00|        game-item-10|    4|
|2024-06-09 19:55:00|2024-06-09 19:56:00|           scores-50|    1|
|2024-06-09 19:55:00|2024-06-09 19:56:00|          bbqperks-2|    2|
|2024

In [None]:
import os
# Persist the windowed counts as Parquet files under s3a://processed-data/stream.
# The Parquet (file) sink does not support the 'complete' output mode, so 'append'
# is used instead: new result rows are added as new files under the path.
# The 5-second processing-time trigger is left commented out, so the default
# trigger applies.
# NOTE(review): the S3A endpoint (the 'minio' host) and the access/secret keys
# are NOT configured in this cell; they have to come from the Spark/Hadoop
# configuration. The recorded run of this cell failed with "Connection refused".
# NOTE(review): the checkpoint location is the local path /tmp/checkpoint --
# confirm it is reachable from wherever the query's tasks run.
query = (
    agg_df.writeStream.format("parquet")
    .outputMode("append")
    .option("path", "s3a://processed-data/stream")
    .option("checkpointLocation", "/tmp/checkpoint")
    # .trigger(processingTime="5 seconds")
)
parquet_query = query.start()
try:
    # Blocks the cell until the query fails or the kernel is interrupted.
    parquet_query.awaitTermination()
finally:
    # Always release the streaming query, including on interrupt or failure.
    parquet_query.stop()

24/06/09 17:58:59 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/06/09 17:59:01 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/06/09 17:59:01 ERROR MicroBatchExecution: Query [id = 1015a98e-b110-431f-b874-65756c3fdaf5, runId = bb09bd5a-3b24-4b2f-991e-016d8540f86d] terminated with error
java.net.ConnectException: Connection refused
	at java.base/sun.nio.ch.Net.connect0(Native Method)
	at java.base/sun.nio.ch.Net.connect(Net.java:579)
	at java.base/sun.nio.ch.Net.connect(Net.java:568)
	at java.base/sun.nio.ch.NioSocketImpl.connect(NioSocketImpl.java:593)
	at java.base/java.net.SocksSocketImpl.connect(SocksSocketImpl.java:327)
	at java.base/java.net.Socket.connect(Socket.java:633)
	at java.base/java.net.Socket.connect(Socket.java:583)
	at java.base/java.net.Socket.<init>(Socket.java:507)
	at java.base/java.net.Socket.<init>(Sock

StreamingQueryException: [STREAM_FAILED] Query [id = 1015a98e-b110-431f-b874-65756c3fdaf5, runId = bb09bd5a-3b24-4b2f-991e-016d8540f86d] terminated with exception: Connection refused

In [None]:
# Batch-read the Parquet output of the streaming job and show the rows with
# the highest counts first (displayed as the cell's last expression).
data = spark.read.format("parquet").load("s3a://processed-data/stream")
data.sort(F.col("count").desc())

                                                                                

window_start,window_end,collection_slug,count
2024-05-27 20:35:00,2024-05-27 20:36:00,datacube-2,51
2024-05-27 20:35:00,2024-05-27 20:36:00,game-item-11,38
2024-05-27 20:34:00,2024-05-27 20:35:00,game-item-10,24
2024-05-27 20:33:00,2024-05-27 20:34:00,game-item-10,23
2024-05-27 20:33:00,2024-05-27 20:34:00,datacube-1,20
2024-05-27 20:32:00,2024-05-27 20:33:00,game-item-10,19
2024-05-27 20:28:00,2024-05-27 20:29:00,game-item-10,16
2024-05-27 20:30:00,2024-05-27 20:31:00,game-item-10,16
2024-05-27 20:35:00,2024-05-27 20:36:00,game-item-10,15
2024-05-27 20:28:00,2024-05-27 20:29:00,datacube-1,15
