In [1]:
import pyspark

conf = pyspark.SparkConf()
# conf.set("spark.jars.ivy","/tmp/.ivy")

sc = pyspark.SparkContext(conf=conf)

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

import pyspark.sql.functions as F

# from pyspark.sql.types import *

# Connect to remote spark session at port 8080
spark = (
    SparkSession.builder.appName("Streaming")  # type: ignore
    .master("spark://spark:8080")
    .getOrCreate()
)
spark

In [3]:
# Read data from the socket as a DataFrame.
socket_df = spark.readStream \
    .format("socket") \
    .option("host", "stream") \
    .option("port", 9999) \
    .load()

# Parse JSON data using the defined schema
parsed_df = socket_df.selectExpr(
    "CAST(value AS STRING) as event_json",
    "get_json_object(value, '$.payload.payload.collection.slug') as collection_slug",
    "to_timestamp(get_json_object(value, '$.payload.sent_at')) as sent_at",
)
parsed_df.printSchema()
parsed_df.select("*")
# group by collection_slug and window of 1 hour every 30 seconds. Order by count descending
agg_df = parsed_df.groupBy(
    F.window("sent_at", "1 hour", "1 minute"), # Every 10 seconds, aggregate the data from the last 30 seconds
    "collection_slug"
).count()
agg_df.printSchema()


root
 |-- event_json: string (nullable = true)
 |-- collection_slug: string (nullable = true)
 |-- sent_at: timestamp (nullable = true)

root
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- collection_slug: string (nullable = true)
 |-- count: long (nullable = false)



In [4]:
# Write a dummy dataframe to s3 to check if the connection is working:
dummy_df = spark.createDataFrame([("hello", "world")], ["key", "value"])
dummy_df.write.parquet("s3a://processed-data/test-data/test.parquet")

In [None]:
import os
# Start the Structured Streaming query
# Write to s3a://processed-data every 5 seconds, in parquet format
# Set the S3 bucket host to the localstack host 'minio', and use the access key and secret key
# Parquet format does not support the 'complete' output mode, so we use 'append' instead
# 'append' mode will write the new data to the existing data in the parquet file,
query = agg_df.writeStream \
    .format("parquet") \
    .outputMode("append") \
    .option("path", "s3a://processed-data/") \
    .option("checkpointLocation", "/tmp/checkpoint") \
    .trigger(processingTime="5 seconds")
query.start().awaitTermination()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
query

<pyspark.sql.streaming.readwriter.DataStreamWriter at 0xffff8cd6e890>