In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import window, col, avg, concat, lit, from_json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, TimestampType
from pyspark.sql.window import Window
from time import sleep


# Assemble the Spark configuration in one fluent expression: master URL,
# application name, then the individual resource settings.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Lab9_Ex3")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; build it from
# the configuration above (or pick up a session that already exists).
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()


# PySpark schema for the streaming data.
# All fields are nullable. A field is a string unless it appears in
# `_non_string_types`; `_field_names` fixes the field order of the schema.
_non_string_types = {
    "Price": IntegerType(),
    "Availability": BooleanType(),
    "event_time": TimestampType(),
}
_field_names = (
    "Address",
    "City",
    "Price",
    "Lot_size",
    "Living_space_size",
    "Build_year",
    "Build_type",
    "House_type",
    "Roof",
    "Rooms",
    "Toilet",
    "Floors",
    "Energy_label",
    "Position",
    "Garden",
    "Estimated_neighbourhood_price_per",
    "Availability",
    "event_time",
)
data_schema = StructType([
    StructField(name, _non_string_types.get(name, StringType()), True)
    for name in _field_names
])

# Subscribe to the "mock" Kafka topic as a stream (readStream, not a batch
# read). With startingOffsets="latest" only messages that arrive after the
# query starts are consumed.
kafkaStream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9093") \
    .option("failOnDataLoss", "false") \
    .option("subscribe", "mock") \
    .option("startingOffsets", "latest") \
    .load()

# Keep only the message payload, cast to a string so it can be parsed as JSON.
df = kafkaStream.selectExpr("CAST(value AS STRING)")

# Parse the JSON payload into a single struct column. The struct is aliased
# explicitly so the flattening step below does not depend on the column name
# Spark generates for from_json(), which is an implementation detail that has
# changed between Spark releases.
df1 = df.select(from_json(col("value"), data_schema).alias("record"))

df1.printSchema()

# Flatten the struct: one top-level column per field of data_schema.
sdf = df1.select("record.*")

sdf.printSchema()

# Rank listings by price within a price cap.
# Rows are grouped per 10-second event-time window, Price and Availability;
# each group keeps its most recent event_time. Groups above the cap or not
# available are dropped, and the 10 highest remaining prices are kept.
price_threshold = 500000  # Set your price threshold

event_window = window(col("event_time"), "10 seconds")
within_budget = col("Price") <= price_threshold
still_available = col("Availability") == True

top_10_prices_df = (
    sdf.groupBy(event_window, "Price", "Availability")
    .agg(F.max("event_time").alias("latest_event_time"))
    .filter(within_budget & still_available)
    .sort(col("Price").desc())
    .limit(10)
)

# Update the Availability column based on specific conditions.
# NOTE(review): this draft is wrapped in a triple-quoted string, so it is a
# no-op expression and never runs. It is also incomplete as written:
# withColumn("Price") has no column expression, the join has no condition,
# and "top_event_time" is not produced by the visible code (the aggregation
# above names its column "latest_event_time").
'''updated_data_df = sdf \
    .join(top_10_prices_df.withColumn("Price")) \
    .filter("event_time = top_event_time or top_event_time is null") \
    .withColumn("Availability", F.when(col("Availability") == True, False).otherwise(col("Availability")))'''

# Disabled draft of a Kafka sink writing back to the "mock" topic in "update"
# mode. Also inert (string literal).
# NOTE(review): `resultdf` is not defined in the visible code — confirm where
# it was meant to come from before re-enabling.
'''query = resultdf \
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9093") \
    .option("checkpointLocation", "/home/jovyan/checkpoint/priceavg") \
    .option("topic", "mock") \
    .option("price", "price_avg_query")\
    .outputMode("update") \
    .start()'''

# Stream the ranking into an in-memory table registered under the query name
# "top10_price_window", using "complete" output mode, so it can be read back
# with spark.sql().
memory_writer = (
    top_10_prices_df.writeStream
    .queryName("top10_price_window")
    .format("memory")
    .outputMode("complete")
)
query = memory_writer.start()

# Poll the in-memory result table every 10 seconds, for at most 100 rounds.
try:
    for _ in range(100):
        spark.sql("SELECT * FROM top10_price_window").show()
        sleep(10)
except KeyboardInterrupt:
    # Interrupting the cell is the expected way to end polling early;
    # cleanup happens in the finally clause below.
    pass
finally:
    # Run cleanup on every exit path (loop finished, interrupt, or an error
    # from spark.sql) so the streaming query is never left running.
    query.stop()
    # Stop the spark context
    spark.stop()
    print("Stoped the streaming query and the spark context")

# Disabled alternative to the polling loop: wait on the streaming query
# itself. Kept inside a string literal, so it does not execute.
'''# Await termination
query.awaitTermination()'''


In [None]:
# Shut down the Spark session once the notebook is done with it.
spark.stop()