In [1]:
import os
# Ask spark-submit to resolve the two Kafka connector artifacts via --packages.
# NOTE(review): presumably this has to be set before the SparkSession is
# created for the packages to be picked up — confirm.
os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join(
    [
        "--packages",
        ",".join(
            [
                "org.apache.spark:spark-streaming-kafka-0-10_2.12:3.0.0",
                "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0",
            ]
        ),
        "pyspark-shell",
    ]
)

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Obtain the notebook's Spark session via getOrCreate(), registered under the
# application name "Weather Analysis".
spark = (
    SparkSession.builder
    .appName("Weather Analysis")
    .getOrCreate()
)

In [42]:
# Kafka topic to consume, and a streaming reader subscribed to it on the
# local broker.
topic = "Perth"
df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "127.0.0.1:9092",
        "subscribe": topic,
    })
    .load()
)

In [43]:
# Show the raw Kafka source schema: binary key/value plus the topic,
# partition, offset and timestamp metadata columns.
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [44]:
# Keep only the message key and payload, both cast to strings.
df = df.select(
    F.col("key").cast("string"),
    F.col("value").cast("string"),
)

In [45]:
# Confirm that only key and value remain and that both are string columns.
df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [46]:
# Earlier single-field schema draft, kept for reference only (not executed):
# schema = StructType([
#     StructField('WeatherIcon', IntegerType(), True)
# ])

In [53]:
# Layout applied to each JSON message: two nullable text fields plus a
# two-level, string-keyed map under "Temperature" whose leaf values are typed
# as strings.
schema = (
    StructType()
    .add("city", StringType(), True)
    .add("WeatherText", StringType(), True)
    .add(
        "Temperature",
        MapType(StringType(), MapType(StringType(), StringType())),
        True,
    )
)

In [54]:
# Parse the JSON text held in "value" against `schema`; the result is a single
# struct column named parsed_value.
df1 = df.select(
    F.from_json(df["value"].cast("string"), schema).alias("parsed_value")
)

In [55]:
# Inspect the parsed struct: city, WeatherText and the nested Temperature map.
df1.printSchema()

root
 |-- parsed_value: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- WeatherText: string (nullable = true)
 |    |-- Temperature: map (nullable = true)
 |    |    |-- key: string
 |    |    |-- value: map (valueContainsNull = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)



In [71]:
# Pull city and WeatherText out of the parsed struct and explode the
# Temperature map; no alias is given to the explode, so its output uses the
# default `key` / `value` column names.
df2 = df1.select(
    df1["parsed_value"]["city"].alias("City"),
    df1["parsed_value"]["WeatherText"].alias("WeatherText"),
    F.explode(df1["parsed_value"]["Temperature"]),
)

In [72]:
# After the explode: City, WeatherText, a string key and the inner map as value.
df2.printSchema()

root
 |-- City: string (nullable = true)
 |-- WeatherText: string (nullable = true)
 |-- key: string (nullable = false)
 |-- value: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [81]:
# Keep only the rows whose exploded map key is "Metric".
df3 = df2.where(F.col("key") == "Metric")

In [85]:
# Explode the inner map as well, so each of its entries becomes a row with
# string `key` / `value` columns alongside City and WeatherText.
df4 = df3.select(
    df3["City"],
    df3["WeatherText"],
    F.explode(df3["value"]),
)

In [89]:
# After the second explode: City, WeatherText and string key/value columns.
df4.printSchema()

root
 |-- City: string (nullable = true)
 |-- WeatherText: string (nullable = true)
 |-- key: string (nullable = false)
 |-- value: string (nullable = true)



In [90]:
# Keep only the rows whose inner-map key is "Value".
df5 = df4.where(F.col("key") == "Value")

In [104]:
# Drop the helper key column and expose the remaining value as TemperatureC.
df6 = df5.select(
    df5["City"],
    df5["WeatherText"],
    df5["value"].alias("TemperatureC"),
)

In [105]:
# Convert TemperatureC from string to double.
df_formatted = df6.withColumn(
    "TemperatureC",
    F.col("TemperatureC").cast("double"),
)

In [106]:
# Verify the flattened frame: City, WeatherText and a double TemperatureC.
df_formatted.printSchema()

root
 |-- City: string (nullable = true)
 |-- WeatherText: string (nullable = true)
 |-- TemperatureC: double (nullable = true)



In [132]:
# Normalise the text columns in place: City is capitalised per word and
# trimmed, WeatherText is lower-cased and trimmed. TemperatureC passes through
# unchanged and the column order is kept.
df_formatted2 = (
    df_formatted
    .withColumn("City", F.trim(F.initcap(F.col("City"))))
    .withColumn("WeatherText", F.trim(F.lower(F.col("WeatherText"))))
)

In [183]:
# Average TemperatureC per City, computed over at most 24 rows.
# NOTE(review): on a streaming frame the limit presumably caps the query at
# the first 24 records it ever sees — confirm this is the intent.
Avg_temp = (
    df_formatted2
    .limit(24)
    .groupBy("City")
    .agg(F.avg(F.col("TemperatureC")).alias("AvgTempC"))
)

In [155]:
# Count how often each WeatherText occurs per City, over at most 24 rows.
Weather_text = (
    df_formatted2
    .limit(24)
    .groupBy("City", "WeatherText")
    .agg(F.count(F.col("WeatherText")).alias("Count"))
)

In [158]:
# Keep the three most frequent (City, WeatherText) combinations.
# The name is rebound to the narrowed result, so re-running this cell stacks
# another sort/limit on top of the previous one.
Weather_text = Weather_text.orderBy(F.desc("Count")).limit(3)

In [133]:
# Debug sink (disabled): writes df_formatted2 to the console in append mode.
# query = df_formatted2 \
#     .writeStream \
#     .outputMode("append") \
#     .format("console") \
#     .trigger(processingTime='24 seconds') \
#     .start()

In [189]:
# Start streaming the per-city averages into the in-memory table "temp_query",
# rewriting the complete result on a 24-second processing-time trigger.
query1 = (
    Avg_temp.writeStream
    .queryName("temp_query")
    .format("memory")
    .outputMode("complete")
    .trigger(processingTime="24 seconds")
    .start()
)

In [192]:
# Snapshot the in-memory "temp_query" table into pandas and save it as CSV.
# NOTE(review): the table may still be empty if this runs before the first
# trigger has completed — confirm the query has produced output first.
avg_temp_pdf = spark.sql("select * from temp_query").toPandas()
avg_temp_pdf.to_csv("avg_temp.csv")

In [196]:
# Stop the streaming query that feeds the "temp_query" table.
query1.stop()

In [201]:
# Start streaming the top weather-text counts into the in-memory table
# "weather_query", rewriting the complete result on a 24-second
# processing-time trigger.
query2 = (
    Weather_text.writeStream
    .queryName("weather_query")
    .format("memory")
    .outputMode("complete")
    .trigger(processingTime="24 seconds")
    .start()
)

In [202]:
# Snapshot the in-memory "weather_query" table into pandas and save it as CSV.
# NOTE(review): the table may still be empty if this runs before the first
# trigger has completed — confirm the query has produced output first.
weather_text_pdf = spark.sql("select * from weather_query").toPandas()
weather_text_pdf.to_csv("weather_text.csv")

In [203]:
# Stop the streaming query that feeds the "weather_query" table.
query2.stop()