In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

import pyspark.sql.functions as F

# conf = SparkConf()
# conf.set("spark.jars.ivy","/tmp/.ivy")

# from pyspark.sql.types import *

# Attach to the standalone Spark master at spark://spark:7077 under the
# application name "Streaming" (getOrCreate reuses a live session if any).
spark = (
    SparkSession.builder  # type: ignore
    .master("spark://spark:7077")
    .appName("Streaming")
    # .config(conf)
    .getOrCreate()
)

# Notebook ergonomics: keep driver logging at WARN and let a DataFrame that is
# the last expression of a cell render itself eagerly.
spark.sparkContext.setLogLevel("WARN")
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /tmp/.ivy/cache
The jars for the packages stored in: /tmp/.ivy/jars
org.apache.spark#spark-avro_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8983fb61-5e13-4f16-b1ee-f790677f1d7d;1.0
	confs: [default]
	found org.apache.spark#spark-avro_2.12;3.5.0 in central
	found org.tukaani#xz;1.9 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.0 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 

In [2]:

# Stream text lines from the TCP socket at stream:9999; each received line is
# exposed in the `value` column used below.
socket_df = (
    spark.readStream
    .format("socket")
    .option("host", "stream")
    .option("port", 9999)
    .load()
)

# Pull the fields of interest straight out of the JSON text with JSON-path
# lookups (no explicit schema is applied):
#   event_json      - the raw event line
#   collection_slug - $.payload.payload.collection.slug
#   sent_at         - $.payload.sent_at parsed as a timestamp (event time)
parsed_df = socket_df.selectExpr(
    "CAST(value AS STRING) as event_json",
    "get_json_object(value, '$.payload.payload.collection.slug') as collection_slug",
    "to_timestamp(get_json_object(value, '$.payload.sent_at')) as sent_at",
)
parsed_df.printSchema()

# Count events per collection_slug in 1-minute windows keyed on `sent_at`.
# The slide equals the window length, so the windows are tumbling
# (non-overlapping). The 1-minute watermark bounds how late an event may
# arrive and still be counted. No ordering is applied here; sort when reading
# the results back in batch.
agg_df = (
    parsed_df
    .withWatermark("sent_at", "1 minute")
    .groupBy(
        F.window("sent_at", "1 minute", "1 minute"),
        "collection_slug",
    )
    .count()
    # Flatten the window struct into plain start/end columns for the sink.
    .select(
        F.col("window.start").alias("window_start"),
        F.col("window.end").alias("window_end"),
        "collection_slug",
        "count",
    )
)
agg_df.printSchema()


24/05/27 20:26:44 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.


root
 |-- event_json: string (nullable = true)
 |-- collection_slug: string (nullable = true)
 |-- sent_at: timestamp (nullable = true)

root
 |-- window_start: timestamp (nullable = true)
 |-- window_end: timestamp (nullable = true)
 |-- collection_slug: string (nullable = true)
 |-- count: long (nullable = false)



In [None]:
import os
# Configure the streaming sink: write the aggregated windows as parquet files
# under s3a://processed-data/stream, checkpointing to /tmp/checkpoint.
# - The parquet (file) sink does not support 'complete' output mode, so
#   'append' is used: new result rows are added as new files in the directory.
# - No trigger is set, so Spark's default trigger applies; the 5-second
#   processing-time trigger is left disabled below.
# - NOTE(review): the S3A endpoint and credentials are not configured in this
#   cell; presumably they come from the cluster's Spark configuration - confirm.
query = (
    agg_df.writeStream
    .format("parquet")
    .outputMode("append")
    .option("path", "s3a://processed-data/stream")
    .option("checkpointLocation", "/tmp/checkpoint")
    # .trigger(processingTime="5 seconds")
)

# Keep the StreamingQuery handle so the stream is always stopped when this
# cell exits - including when the kernel is interrupted - instead of being
# left running with no reference to it. The interrupt/error still propagates.
streaming_query = query.start()
try:
    streaming_query.awaitTermination()
finally:
    streaming_query.stop()

24/05/27 20:27:21 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/bitnami/python/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/python/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/python/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
24/05/27 20:38:03 ERROR FileFormatWriter: Aborting job a466ee85-0d1f-49a8-837e-7594e94073e3.
org.apache.spark.SparkException: Job 129 cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$cleanUpAfterSchedulerStop$1(DAGScheduler.scala:1253)
	at org.apache.spark.scheduler.DAGScheduler.$anonf

KeyboardInterrupt: 

In [3]:
# Batch-load whatever parquet output exists under the stream's output path.
data = spark.read.format("parquet").load("s3a://processed-data/stream")
# Display the rows with the highest counts first.
data.sort("count", ascending=False)

                                                                                

window_start,window_end,collection_slug,count
2024-05-27 20:35:00,2024-05-27 20:36:00,datacube-2,51
2024-05-27 20:35:00,2024-05-27 20:36:00,game-item-11,38
2024-05-27 20:34:00,2024-05-27 20:35:00,game-item-10,24
2024-05-27 20:33:00,2024-05-27 20:34:00,game-item-10,23
2024-05-27 20:33:00,2024-05-27 20:34:00,datacube-1,20
2024-05-27 20:32:00,2024-05-27 20:33:00,game-item-10,19
2024-05-27 20:28:00,2024-05-27 20:29:00,game-item-10,16
2024-05-27 20:30:00,2024-05-27 20:31:00,game-item-10,16
2024-05-27 20:35:00,2024-05-27 20:36:00,game-item-10,15
2024-05-27 20:28:00,2024-05-27 20:29:00,datacube-1,15
