In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col, avg, concat, lit
from pyspark.sql.types import ArrayType, StructType, StructField, LongType, StringType, DoubleType, IntegerType
from time import sleep

# Assemble the Spark configuration in a single fluent chain; every setter
# returns the same SparkConf instance, so the calls can be strung together.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Lab9_Ex1")
    .setAll([
        ("spark.driver.memory", "2g"),
        ("spark.executor.cores", "1"),
        ("spark.driver.cores", "1"),
    ])
)

# The SparkSession is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Explicit schema for the incoming game-event JSON records; every field is
# declared nullable.
dataSchema = (
    StructType()
    .add("username", StringType(), True)
    .add("teamname", StringType(), True)
    .add("score", IntegerType(), True)
    .add("timestamp_in_ms", LongType(), True)
    .add("readable_time", StringType(), True)
)

# Streaming source: JSON files under the gamestream directory, parsed with the
# explicit schema above. maxFilesPerTrigger=1 limits each trigger to one file.
json_stream_reader = (
    spark.readStream
    .schema(dataSchema)
    .option("multiline", "true")
    .option("maxFilesPerTrigger", 1)
)
sdf = json_stream_reader.json("/home/jovyan/data/gamestream")

# Keep every source column and append event_time, computed by dividing
# timestamp_in_ms by 1000.0 and casting the result to a timestamp.
withEventTimedf = sdf.selectExpr("*", "cast(timestamp_in_ms/1000.0 as timestamp) as event_time")

# Show the resulting schema (source columns plus event_time).
withEventTimedf.printSchema()

# Average score per (10-second event-time window, username, teamname).
event_window = window(col("event_time"), "10 seconds")
avgscoredf = (
    withEventTimedf
    .groupBy(event_window, "username", "teamname")
    .agg(avg("score").alias("value"))
)

# Reshape to key/value rows: key is "<username> <teamname>", value is the average.
# NOTE(review): the window column is not selected, so rows from different
# windows share the same key and cannot be told apart downstream — confirm
# this is intended.
player_key = concat(col("username"), lit(" "), col("teamname")).alias("key")
resultdf = avgscoredf.select(player_key, "value")

# Start the streaming query: the complete aggregate result is written to an
# in-memory table that can be queried by name ("avg_score_window") via Spark SQL.
query = (
    resultdf.writeStream
    .format("memory")
    .outputMode("complete")
    .queryName("avg_score_window")
    .start()
)

# Poll the in-memory result table every 10 seconds, up to 100 times.
# Cleanup lives in `finally` so the streaming query and the Spark session are
# released however the loop ends: normal completion, a manual kernel
# interrupt, or an error raised while querying the table. Previously they were
# only stopped on KeyboardInterrupt, which left the query running otherwise.
try:
    for _ in range(100):
        spark.sql("SELECT * FROM avg_score_window").show()
        sleep(10)
except KeyboardInterrupt:
    # Interrupting the kernel is the expected way to end the demo early;
    # swallow it here and fall through to the shared cleanup below.
    pass
finally:
    query.stop()
    # Stop the spark context
    spark.stop()
    print("Stoped the streaming query and the spark context")

root
 |-- username: string (nullable = true)
 |-- teamname: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- timestamp_in_ms: long (nullable = true)
 |-- readable_time: string (nullable = true)
 |-- event_time: timestamp (nullable = true)

+---+-----+
|key|value|
+---+-----+
+---+-----+

+---+-----+
|key|value|
+---+-----+
+---+-----+

+---+-----+
|key|value|
+---+-----+
+---+-----+

+--------------------+------------------+
|                 key|             value|
+--------------------+------------------+
|user5_AuburnCocka...|           9.34375|
|user3_ArmyGreenCa...| 9.631578947368421|
|user5_AmberCaneTo...|10.793650793650794|
|Robot-3 ApricotDingo| 9.044642857142858|
|user14_BananaWall...|10.295774647887324|
|user4_AmberCaneTo...| 9.818181818181818|
|user10_BananaWall...| 9.706896551724139|
|user19_AuburnDing...| 9.692307692307692|
|user7_AmaranthMar...|          8.953125|
|user6_AndroidGree...|10.421052631578947|
|user7_ApricotDing...|              9.26|
|user4

### See https://sparkbyexamples.com/pyspark/pyspark-read-json-file-into-dataframe/ for details on reading JSON files into a DataFrame

In [2]:
# Stop the Spark session created in the first cell (this also shuts down its
# underlying Spark context). Run this once the polling loop has finished.
spark.stop()