# Pipeline 2: Stream Order Monitoring


In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType
from time import sleep

# Connection and resource settings for the monitoring application.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("FastFood_Stream_Monitoring")
    .setAll([
        ("spark.driver.memory", "2g"),
        ("spark.executor.cores", "1"),
        ("spark.driver.cores", "1"),
    ])
)

spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Column layout of the incoming order files; every column is nullable.
# order_time is read as a plain string here and parsed into a timestamp below.
dataSchema = (
    StructType()
    .add("order_id", StringType(), True)
    .add("order_time", StringType(), True)
    .add("city", StringType(), True)
    .add("cuisine_type", StringType(), True)
    .add("order_value", DoubleType(), True)
    .add("delivery_time_minutes", IntegerType(), True)
    .add("payment_method", StringType(), True)
    .add("items_count", IntegerType(), True)
)

# File-based streaming source over the CSV directory, one file per trigger.
# NOTE(review): no "header" option is set, so the files are presumably
# header-less — confirm against the producer of this directory.
sdf_raw = (
    spark.readStream
    .schema(dataSchema)
    .options(maxFilesPerTrigger=1, latestFirst="false")
    .csv("/home/jovyan/data/fast_food_ordering_dataset_stream")
)

# Replace the string order_time with a parsed timestamp so that it can be
# used for event-time windowing by the queries that follow.
order_timestamp = to_timestamp("order_time", "yyyy-MM-dd HH:mm:ss")
sdf = sdf_raw.withColumn("order_time", order_timestamp)

sdf.printSchema()


### Streaming Transformation 1: Real-time Orders by City

Monitor the running count of orders per city in real time.


In [None]:
# Allow this cell to be re-run: stop any active query that already uses the
# name "city_order_counts" before starting a new one.
for active_query in spark.streams.active:
    if active_query.name != "city_order_counts":
        continue
    active_query.stop()

# Running count of orders per city over the whole stream (no time window).
city_counts = sdf.groupBy("city").count()

# Publish the full aggregate to an in-memory table named after the query so
# it can be read with spark.sql(...).
city_query = (
    city_counts.writeStream
    .queryName("city_order_counts")
    .format("memory")
    .outputMode("complete")
    .trigger(processingTime="2 seconds")
    .start()
)


### Streaming Transformation 2: Average Order Value by Cuisine Type

Calculate the average order value, total revenue, and order count per cuisine type in 30-day event-time windows.


In [None]:
# Expose the parsed order timestamp under the name event_time; the windowed
# queries group on this column.
with_event_time = sdf.withColumn("event_time", col("order_time"))

# Allow this cell to be re-run: stop any active query that already uses the
# name "cuisine_order_value" before starting a new one.
for active_query in spark.streams.active:
    if active_query.name != "cuisine_order_value":
        continue
    active_query.stop()

# Per-cuisine metrics over 30-day windows of event_time.
# NOTE(review): the query below runs in "complete" output mode; per the
# Structured Streaming guide, watermarks do not drop aggregation state in
# that mode — confirm the 60-day watermark is intended.
cuisine_metrics = (
    avg("order_value").alias("avg_order_value"),
    sum("order_value").alias("total_revenue"),
    count("*").alias("order_count"),
)
cuisine_order_value = (
    with_event_time
    .withWatermark("event_time", "60 days")
    .groupBy(window("event_time", "30 days"), col("cuisine_type"))
    .agg(*cuisine_metrics)
)

# Publish the full aggregate to an in-memory table named after the query.
cuisine_query = (
    cuisine_order_value.writeStream
    .queryName("cuisine_order_value")
    .format("memory")
    .outputMode("complete")
    .trigger(processingTime="2 seconds")
    .start()
)


### Streaming Transformation 3: Payment Method Distribution

Calculate the order count and total order value per payment method in 30-day event-time windows.


In [None]:
# Allow this cell to be re-run: stop any active query that already uses the
# name "payment_method_distribution" before starting a new one.
for active_query in spark.streams.active:
    if active_query.name != "payment_method_distribution":
        continue
    active_query.stop()

# Order count and summed order value per payment method over 30-day windows
# of event_time (with_event_time comes from the cuisine cell above).
payment_metrics = (
    count("*").alias("order_count"),
    sum("order_value").alias("total_value"),
)
payment_distribution = (
    with_event_time
    .withWatermark("event_time", "60 days")
    .groupBy(window("event_time", "30 days"), col("payment_method"))
    .agg(*payment_metrics)
)

# Publish the full aggregate to an in-memory table named after the query.
payment_query = (
    payment_distribution.writeStream
    .queryName("payment_method_distribution")
    .format("memory")
    .outputMode("complete")
    .trigger(processingTime="2 seconds")
    .start()
)


### Streaming Transformation 4: Total Revenue by City

Calculate the total revenue, average order value, and order count per city in 30-day event-time windows.


In [None]:
# Allow this cell to be re-run: stop any active query that already uses the
# name "city_revenue" before starting a new one.
for active_query in spark.streams.active:
    if active_query.name != "city_revenue":
        continue
    active_query.stop()

# Revenue metrics per city over 30-day windows of event_time
# (with_event_time comes from the cuisine cell above).
revenue_metrics = (
    sum("order_value").alias("total_revenue"),
    avg("order_value").alias("avg_order_value"),
    count("*").alias("order_count"),
)
city_revenue = (
    with_event_time
    .withWatermark("event_time", "60 days")
    .groupBy(window("event_time", "30 days"), col("city"))
    .agg(*revenue_metrics)
)

# Publish the full aggregate to an in-memory table named after the query.
revenue_query = (
    city_revenue.writeStream
    .queryName("city_revenue")
    .format("memory")
    .outputMode("complete")
    .trigger(processingTime="2 seconds")
    .start()
)


### Monitor Streaming Queries

Display results from all four streaming queries. The cell polls the in-memory result tables every 10 seconds for 20 iterations; the tables update as new data arrives.


In [None]:
# Reports shown on every polling iteration. Each entry is
# (heading printed before the table, SQL over the in-memory sink table,
#  keyword arguments forwarded to DataFrame.show()).
_REPORTS = (
    (
        "\n--- City Order Counts (Real-time) ---",
        "SELECT * FROM city_order_counts ORDER BY count DESC",
        {},
    ),
    (
        "\n--- Average Order Value by Cuisine Type (30-day windows) ---",
        """
            SELECT
                window,
                cuisine_type,
                avg_order_value,
                total_revenue,
                order_count
            FROM cuisine_order_value
            ORDER BY window DESC, total_revenue DESC
            LIMIT 15
        """,
        {"truncate": False},
    ),
    (
        "\n--- Payment Method Distribution (30-day windows) ---",
        # Round AFTER scaling to a percentage: rounding the ratio first and
        # then multiplying by 100 reintroduces floating-point noise
        # (e.g. 56.99999999999999 instead of 57.0).
        """
            SELECT
                window,
                payment_method,
                order_count,
                total_value,
                ROUND(100 * total_value / SUM(total_value) OVER (PARTITION BY window), 2) AS percentage_of_total
            FROM payment_method_distribution
            ORDER BY window DESC, order_count DESC
            LIMIT 15
        """,
        {"truncate": False},
    ),
    (
        "\n--- Total Revenue by City (30-day windows) ---",
        """
            SELECT
                window,
                city,
                total_revenue,
                avg_order_value,
                order_count
            FROM city_revenue
            ORDER BY window DESC, total_revenue DESC
            LIMIT 15
        """,
        {"truncate": False},
    ),
)


def _stop_monitoring():
    """Stop the four streaming queries, then the Spark session.

    Each query is stopped independently so that a failure while stopping one
    of them does not leave the remaining queries (or the session) running.
    """
    running_queries = (
        ("city_order_counts", city_query),
        ("cuisine_order_value", cuisine_query),
        ("payment_method_distribution", payment_query),
        ("city_revenue", revenue_query),
    )
    for label, running_query in running_queries:
        try:
            running_query.stop()
        except Exception as stop_error:
            print(f"Error stopping {label}: {stop_error}")
    spark.stop()


# Poll the in-memory result tables 20 times, 10 seconds apart. Interrupting
# the cell, or any error while querying, shuts everything down; a normal
# completion leaves the queries running so the cell can be re-run.
try:
    for iteration in range(1, 21):
        print(f"\n=== Iteration {iteration} ===")

        for heading, report_sql, show_options in _REPORTS:
            print(heading)
            spark.sql(report_sql).show(**show_options)

        sleep(10)

except KeyboardInterrupt:
    _stop_monitoring()
except Exception as e:
    _stop_monitoring()
    print(f"Error: {e}")




In [None]:
# Final cleanup: stop the SparkSession created at the top of the notebook.
spark.stop()