In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType
from time import sleep

# Spark configuration: standalone master URL, application name, and the
# driver/executor resource settings, assembled as one fluent chain.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("FastFood_Stream_Monitoring")
    .setAll([
        ("spark.driver.memory", "2g"),
        ("spark.executor.cores", "1"),
        ("spark.driver.cores", "1"),
    ])
)

# The Spark session is the entry point to the Spark SQL engine.
# getOrCreate() returns an already-running session if one exists.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Schema of the streaming CSV input, written as (column name, Spark type)
# pairs. Every column is declared nullable (third StructField argument).
dataSchema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("order_id", StringType()),
        ("order_time", TimestampType()),
        ("city", StringType()),
        ("cuisine_type", StringType()),
        ("order_value", DoubleType()),
        ("delivery_time_minutes", IntegerType()),
        ("payment_method", StringType()),
        ("items_count", IntegerType()),
    )
])


# Streaming source: CSV files with a header row, read from the stream
# directory using the explicit schema above. maxFilesPerTrigger=1 limits
# each trigger to a single new file.
sdf = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .option("header", "true")
    .csv("/home/jovyan/data/fast_food_ordering_dataset_stream")
)

print("Streaming DataFrame Schema:")
sdf.printSchema()


### Streaming Transformation 1: Real-time Orders by City

Monitor the running count of orders per city in real time.


In [None]:
# Running order count per city.
city_counts = sdf.groupBy("city").count()

# Publish the full aggregate to the in-memory table "city_order_counts"
# so it can be queried with spark.sql().
city_query = (
    city_counts.writeStream
    .format("memory")
    .outputMode("complete")
    .queryName("city_order_counts")
    .start()
)


### Streaming Transformation 2: Windowed Revenue by Cuisine Type

Calculate total revenue, order count, and average order value per cuisine type in 1-minute time windows based on each order's `order_time`.


In [None]:
# Expose order_time under the name event_time; all original columns are kept
# and the new column is appended last.
with_event_time = sdf.withColumn("event_time", col("order_time"))

# Revenue metrics per (1-minute event-time window, cuisine type).
# NOTE: sum / count / avg are the pyspark.sql.functions versions brought in
# by the star import, not the Python builtins.
cuisine_revenue_window = (
    with_event_time
    .groupBy(window("event_time", "1 minute"), "cuisine_type")
    .agg(
        sum("order_value").alias("window_revenue"),
        count("*").alias("window_orders"),
        avg("order_value").alias("avg_order_value"),
    )
)

# Publish the full aggregate to the in-memory table "cuisine_revenue_window".
cuisine_revenue_query = (
    cuisine_revenue_window.writeStream
    .format("memory")
    .outputMode("complete")
    .queryName("cuisine_revenue_window")
    .start()
)


### Streaming Transformation 3: Real-time Payment Method Distribution

Monitor the distribution of payment methods as orders stream in.


In [None]:
# Running order count per payment method.
payment_counts = sdf.groupBy("payment_method").count()

# Publish the full aggregate to the in-memory table "payment_method_counts".
payment_query = (
    payment_counts.writeStream
    .format("memory")
    .outputMode("complete")
    .queryName("payment_method_counts")
    .start()
)


### Streaming Transformation 4: Windowed Average Delivery Time by City

Calculate the average delivery time and the number of orders per city in 2-minute time windows based on each order's `order_time`.


In [None]:
# Average delivery time and order count per (2-minute event-time window, city).
# Relies on with_event_time, which is defined in an earlier cell.
# NOTE: avg / count are the pyspark.sql.functions versions brought in by the
# star import.
delivery_window = (
    with_event_time
    .groupBy(window("event_time", "2 minutes"), "city")
    .agg(
        avg("delivery_time_minutes").alias("avg_delivery_time"),
        count("*").alias("orders_in_window"),
    )
)

# Publish the full aggregate to the in-memory table "delivery_time_window".
delivery_query = (
    delivery_window.writeStream
    .format("memory")
    .outputMode("complete")
    .queryName("delivery_time_window")
    .start()
)


### Monitor Streaming Queries

Display results from all streaming queries. The cell below polls the in-memory result tables 20 times at 5-second intervals, so the output updates as new data arrives.


In [None]:
# Monitor the streaming queries.
#
# Polls the four in-memory result tables 20 times, 5 seconds apart, printing a
# snapshot of each. Cleanup lives in a single `finally` block so that the
# streaming queries and the Spark session are stopped exactly once on every
# exit path: normal completion, manual interrupt, or error.
try:
    for x in range(20):
        print(f"\n=== Iteration {x+1} ===")

        print("\n--- City Order Counts ---")
        spark.sql("SELECT * FROM city_order_counts ORDER BY count DESC").show()

        print("\n--- Cuisine Revenue (Last Window) ---")
        spark.sql("SELECT * FROM cuisine_revenue_window ORDER BY window DESC LIMIT 10").show(truncate=False)

        print("\n--- Payment Method Distribution ---")
        spark.sql("SELECT * FROM payment_method_counts ORDER BY count DESC").show()

        print("\n--- Delivery Time by City (Last Window) ---")
        spark.sql("SELECT * FROM delivery_time_window ORDER BY window DESC LIMIT 10").show(truncate=False)

        sleep(5)

except KeyboardInterrupt:
    # Interrupting the cell is the expected way to end monitoring early;
    # fall through to the shared cleanup below.
    pass
except Exception as e:
    # Best-effort notebook cell: report the failure, then still clean up.
    print(f"Error: {e}")
finally:
    # Stop each query independently so that one failing stop() does not leave
    # the remaining queries running.
    for query in (city_query, cuisine_revenue_query, payment_query, delivery_query):
        try:
            query.stop()
        except Exception as stop_error:
            print(f"Error: {stop_error}")
    spark.stop()
    print("Stopped all streaming queries and Spark context")
