In [1]:
# Stop the notebook from wrapping <pre> output so that wide text output
# (such as the tables printed further down) scrolls horizontally instead.
from IPython.display import HTML, display

display(
    HTML("<style>pre { white-space: pre !important; }</style>")
)

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [3]:
# Get (or reuse) a session bound to the Spark master at spark://spark:7077.
# `spark.cores.max` is set to 1 for this application.
spark = (
    SparkSession.builder.appName("batch-demo")
    .master("spark://spark:7077")
    .config(key="spark.cores.max", value=1)
    .getOrCreate()
)

In [4]:
# Location of the raw inventory events (JSON files).
input_dir = "s3a://demo-data-agg/raw-inventory"

# Read every JSON record under `input_dir`; the schema is inferred by Spark.
# No "header" option is passed: it is a CSV reader option and has no meaning
# for the JSON source, so setting it only misleads the reader.
raw_df = spark.read.json(input_dir)

# Quick sanity checks: row count, inferred schema, and a sample of rows.
print("Count: ", raw_df.count())
raw_df.printSchema()
raw_df.show()

Count:  16682
root
 |-- inventory: struct (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- productid: long (nullable = true)
 |    |-- quantity: long (nullable = true)
 |-- key: long (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- timestamp_date: string (nullable = true)
 |-- timestamp_date_iso: date (nullable = true)
 |-- timestamp_hour: integer (nullable = true)
 |-- timestamp_minute: integer (nullable = true)

+--------------------+-----+--------------------+--------------+------------------+--------------+----------------+
|           inventory|  key|           timestamp|timestamp_date|timestamp_date_iso|timestamp_hour|timestamp_minute|
+--------------------+-----+--------------------+--------------+------------------+--------------+----------------+
|{14638, 14638, 14...|14638|2023-11-18T16:40:...|    2023-11-18|        2023-11-18|            16|              40|
|{14639, 14639, 14...|14639|2023-11-18T16:40:...|    2023-11-18|        2023-11-18|    

In [5]:
# Flatten the nested `inventory` struct into top-level columns and parse the
# string `timestamp` column into a proper timestamp type.
df = raw_df.select(
    F.col("key"),
    F.col("inventory.id").alias("inventory_id"),
    F.col("inventory.productid").alias("product_id"),
    F.col("inventory.quantity").alias("quantity"),
    F.to_timestamp(F.col("timestamp")).alias("timestamp"),
)
# Show the first 20 rows without truncating cell contents.
df.show(n=20, truncate=False)

+-----+------------+----------+--------+-----------------------+
|key  |inventory_id|product_id|quantity|timestamp              |
+-----+------------+----------+--------+-----------------------+
|14638|14638       |14638     |14638   |2023-11-18 16:40:00.449|
|14639|14639       |14639     |14639   |2023-11-18 16:40:01.104|
|14640|14640       |14640     |14640   |2023-11-18 16:40:01.424|
|14642|14642       |14642     |14642   |2023-11-18 16:40:03.047|
|14643|14643       |14643     |14643   |2023-11-18 16:40:03.429|
|14645|14645       |14645     |14645   |2023-11-18 16:40:05.043|
|14646|14646       |14646     |14646   |2023-11-18 16:40:05.111|
|14648|14648       |14648     |14648   |2023-11-18 16:40:05.883|
|14650|14650       |14650     |14650   |2023-11-18 16:40:06.574|
|14651|14651       |14651     |14651   |2023-11-18 16:40:07.294|
|14653|14653       |14653     |14653   |2023-11-18 16:40:07.791|
|14655|14655       |14655     |14655   |2023-11-18 16:40:08.304|
|14657|14657       |14657

In [6]:
# Sum `quantity` per product within consecutive, non-overlapping 30-minute
# windows (window length and slide are both thirty minutes).
agg_df = (
    df.groupBy(
        F.window(
            "timestamp",
            windowDuration="30 minutes",
            slideDuration="30 minutes",
        ),
        F.col("product_id"),
    )
    .agg(F.sum(F.col("quantity")).alias("quantity"))
)

# Display with the window struct unpacked into explicit start/end columns.
agg_df.select(
    F.col("window.start").alias("window_start"),
    F.col("window.end").alias("window_end"),
    F.col("product_id"),
    F.col("quantity"),
).show(n=20, truncate=False)

+-------------------+-------------------+----------+--------+
|window_start       |window_end         |product_id|quantity|
+-------------------+-------------------+----------+--------+
|2023-11-18 16:30:00|2023-11-18 17:00:00|14734     |14734   |
|2023-11-18 16:00:00|2023-11-18 16:30:00|9950      |9950    |
|2023-11-18 16:30:00|2023-11-18 17:00:00|15198     |15198   |
|2023-11-18 14:30:00|2023-11-18 15:00:00|767       |767     |
|2023-11-18 16:00:00|2023-11-18 16:30:00|10934     |10934   |
|2023-11-18 16:00:00|2023-11-18 16:30:00|10948     |10948   |
|2023-11-18 15:30:00|2023-11-18 16:00:00|8802      |8802    |
|2023-11-18 14:30:00|2023-11-18 15:00:00|2070      |2070    |
|2023-11-18 15:00:00|2023-11-18 15:30:00|4175      |4175    |
|2023-11-18 15:00:00|2023-11-18 15:30:00|4197      |4197    |
|2023-11-18 15:30:00|2023-11-18 16:00:00|7951      |7951    |
|2023-11-18 16:30:00|2023-11-18 17:00:00|13621     |13621   |
|2023-11-18 16:30:00|2023-11-18 17:00:00|15542     |15542   |
|2023-11

In [8]:
# Sum `quantity` over all products for each non-overlapping 30-minute window,
# then list the windows from the largest total to the smallest.
agg_all_products_df = (
    df.groupBy(
        F.window(
            "timestamp",
            windowDuration="30 minutes",
            slideDuration="30 minutes",
        )
    )
    .agg(F.sum(F.col("quantity")).alias("quantity"))
    .select(
        F.col("window.start").alias("window_start"),
        F.col("window.end").alias("window_end"),
        F.col("quantity"),
    )
    .orderBy(F.col("quantity").desc())
)
agg_all_products_df.show(n=20, truncate=False)

+-------------------+-------------------+--------+
|window_start       |window_end         |quantity|
+-------------------+-------------------+--------+
|2023-11-18 16:00:00|2023-11-18 16:30:00|42434651|
|2023-11-18 16:30:00|2023-11-18 17:00:00|34380645|
|2023-11-18 15:30:00|2023-11-18 16:00:00|28385770|
|2023-11-18 15:00:00|2023-11-18 15:30:00|15674109|
|2023-11-18 20:00:00|2023-11-18 20:30:00|14687245|
|2023-11-18 14:30:00|2023-11-18 15:00:00|3573801 |
+-------------------+-------------------+--------+

