In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col, avg, concat, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, FloatType, BooleanType
from time import sleep

# Cluster connection and resource settings for this streaming application.
# SparkConf setters return the conf object itself, so they can be chained.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Stream_test2_Ex1_group11")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine.
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .getOrCreate()
)
# Schema applied to the incoming JSON records. Every column is nullable;
# only Price (integer) and Availability (boolean) are non-string columns.
dataSchema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Address", StringType()),
        ("City", StringType()),
        ("Price", IntegerType()),
        ("Lot_size", StringType()),
        ("Living_space_size", StringType()),
        ("Build_year", StringType()),
        ("Build_type", StringType()),
        ("House_type", StringType()),
        ("Roof", StringType()),
        ("Rooms", StringType()),
        ("Toilet", StringType()),
        ("Floors", StringType()),
        ("Energy_label", StringType()),
        ("Position", StringType()),
        ("Garden", StringType()),
        ("Estimated_neighbourhood_price_per", StringType()),
        ("Availability", BooleanType()),
    )
])

# Streaming source: JSON files under the mock-data directory, parsed with
# dataSchema and picked up at most one file per trigger.
sdf = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .json("/home/jovyan/data/mock_data")
)

# Average Price for each (Address, City) pair, exposed as column "value".
avgpricedf = (
    sdf
    .groupBy("Address", "City")
    .agg(avg("Price").alias("value"))
)

# Merge the two grouping columns into one space-separated "key" column.
resultdf = avgpricedf.select(
    concat(col("Address"), lit(" "), col("City")).alias("key"),
    "value",
)

# Start the streaming query. It writes the complete aggregation result to the
# memory sink under the name "avg_price", which is the table queried below.
query = (
    resultdf.writeStream
    .format("memory")
    .queryName("avg_price")
    .outputMode("complete")
    .start()
)

# Poll the in-memory "avg_price" table: print its current contents every
# 10 seconds, for at most 100 rounds.
try:
    try:
        for _round in range(100):
            spark.sql("SELECT * FROM avg_price").show()
            sleep(10)
    finally:
        # Always stop the streaming query -- after the last round, on Ctrl-C,
        # and on any other error -- so it is never left running in the
        # background. Previously it was stopped only on KeyboardInterrupt.
        query.stop()
except KeyboardInterrupt:
    # Interrupted by the user: the query has already been stopped above, so
    # only the Spark context is left to shut down.
    spark.stop()
    print("Stopped the streaming query and the spark context")
    
# Do a calculation
#priceCounts = sdf.groupBy("price").count()

# Write to a sink - here, the output is an in-memory table named "price_counts".
#priceQuery = priceCounts.writeStream.queryName("price_counts") \
                    #.format("memory").outputMode("complete") \
                    #.start()
# Testing 
#for x in range(10):
    #spark.sql("SELECT * FROM price_counts").show()
    #sleep(5)

In [None]:
# Tear down the Spark session created in the first cell.
spark.stop()