In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.0.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0 pyspark-shell'

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window
from time import sleep

spark = SparkSession \
    .builder \
    .appName("Weather Analysis") \
    .getOrCreate()

In [59]:
time_window = "12 seconds"
watermark = "6 seconds"
output_location = os.path.join(os.getcwd(), "result")

if not os.path.exists(output_location):
    os.makedirs(output_location)

In [3]:
topic = "Perth"
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "127.0.0.1:9092") \
    .option("subscribe", topic) \
    .load()

In [4]:
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [5]:
# Converting the key/value from the kafka data stream to string
df = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

In [6]:
df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [7]:
schema = StructType([
    StructField("Time", TimestampType(), True), 
    StructField("city", StringType(), True), 
    StructField("WeatherText", StringType(), True),
    StructField("Temperature", MapType(StringType(), MapType(StringType(), StringType())), True)        
])

In [8]:
df1=df.select(F.from_json(F.col("value").cast("string"), schema).alias('parsed_value'))

In [9]:
df1.printSchema()

root
 |-- parsed_value: struct (nullable = true)
 |    |-- Time: timestamp (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- WeatherText: string (nullable = true)
 |    |-- Temperature: map (nullable = true)
 |    |    |-- key: string
 |    |    |-- value: map (valueContainsNull = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)



In [10]:
df2 = df1.select(
    F.col("parsed_value.Time").alias("Time"),
    F.col("parsed_value.city").alias("City"),
    F.col("parsed_value.WeatherText").alias("WeatherText"),
    explode("parsed_value.Temperature")
)

In [11]:
df2.printSchema()

root
 |-- Time: timestamp (nullable = true)
 |-- City: string (nullable = true)
 |-- WeatherText: string (nullable = true)
 |-- key: string (nullable = false)
 |-- value: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [12]:
# We are only looking at Celsius here
df3 = df2.filter(df2.key == "Metric")

In [13]:
df4 = df3.select("Time", 
                 "City",
                 "WeatherText",
                 explode("value")
)

In [14]:
df4.printSchema()

root
 |-- Time: timestamp (nullable = true)
 |-- City: string (nullable = true)
 |-- WeatherText: string (nullable = true)
 |-- key: string (nullable = false)
 |-- value: string (nullable = true)



In [15]:
df5 = df4.filter(df4.key == "Value")

In [16]:
df6 = df5.select("Time", 
                 "City", 
                 "WeatherText", 
                 F.col("value").alias("TemperatureC"))

In [17]:
df_formatted = df6.withColumn("TemperatureC", df6.TemperatureC.cast(DoubleType()))

In [18]:
df_formatted.printSchema()

root
 |-- Time: timestamp (nullable = true)
 |-- City: string (nullable = true)
 |-- WeatherText: string (nullable = true)
 |-- TemperatureC: double (nullable = true)



In [19]:
df_formatted2 = df_formatted.select("Time",
                                    F.trim(F.initcap(F.col("City"))).alias("City"), 
                                    F.trim(F.lower(F.col("WeatherText"))).alias("WeatherText"),
                                    F.col("TemperatureC"))

In [20]:
df_formatted2.printSchema()

root
 |-- Time: timestamp (nullable = true)
 |-- City: string (nullable = true)
 |-- WeatherText: string (nullable = true)
 |-- TemperatureC: double (nullable = true)



In [21]:
Avg_temp = df_formatted2\
    .withWatermark("Time", watermark)\
    .groupBy("City", 
             F.window(df_formatted2.Time, time_window, "1 second"))\
    .agg(F.avg("TemperatureC").alias("AvgTempC"))\
    .sort(F.col("window").desc())

In [22]:
Avg_temp.printSchema()

root
 |-- City: string (nullable = true)
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- AvgTempC: double (nullable = true)



In [23]:
Avg_temp = Avg_temp.withColumn("AvgTempC", F.round(Avg_temp["AvgTempC"], 2))

In [24]:
# query = Avg_temp \
#     .writeStream \
#     .outputMode("complete") \
#     .format("console") \
#     .trigger(processingTime='12 seconds') \
#     .option("truncate", False) \
#     .start()

In [25]:
# query.stop()

In [26]:
Weather_text = df_formatted2\
    .withWatermark("Time", watermark)\
    .groupBy("City", 
             "WeatherText", 
             F.window(df_formatted2.Time, time_window, "1 second")).agg(
    F.count("WeatherText").alias("Count"))

In [27]:
Weather_text = Weather_text.sort(
    F.col("window").desc(), 
    F.col("Count").desc())

In [28]:
# query = Weather_text \
#     .writeStream \
#     .outputMode("complete") \
#     .format("console") \
#     .trigger(processingTime='12 seconds') \
#     .option("truncate", False) \
#     .start()

In [29]:
# query.stop()

In [30]:
Weather_text.printSchema()

root
 |-- City: string (nullable = true)
 |-- WeatherText: string (nullable = true)
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- Count: long (nullable = false)



## Output

In [54]:
query1 = Avg_temp \
    .writeStream \
    .outputMode("complete") \
    .format("memory") \
    .queryName("temp_query") \
    .trigger(processingTime='12 seconds') \
    .start()

print("Sleeping...")
sleep(60)
print("Waking Up...")

query1.stop()

Sleeping...
Waking Up...


In [55]:
spark.sql("SELECT * FROM temp_query limit 10").toPandas().head(10)

Unnamed: 0,City,window,AvgTempC
0,Perth,"(2022-09-11 14:39:11, 2022-09-11 14:39:23)",11.1
1,Perth,"(2022-09-11 14:39:10, 2022-09-11 14:39:22)",11.1
2,Perth,"(2022-09-11 14:39:09, 2022-09-11 14:39:21)",11.47
3,Perth,"(2022-09-11 14:39:08, 2022-09-11 14:39:20)",11.65
4,Perth,"(2022-09-11 14:39:07, 2022-09-11 14:39:19)",11.54
5,Perth,"(2022-09-11 14:39:06, 2022-09-11 14:39:18)",11.47
6,Perth,"(2022-09-11 14:39:05, 2022-09-11 14:39:17)",11.66
7,Perth,"(2022-09-11 14:39:04, 2022-09-11 14:39:16)",12.08
8,Perth,"(2022-09-11 14:39:03, 2022-09-11 14:39:15)",12.52
9,Perth,"(2022-09-11 14:39:02, 2022-09-11 14:39:14)",12.77


In [61]:
spark.sql("SELECT * FROM temp_query").toPandas().to_csv(os.path.join(output_location, "avg_temp.csv"), index = False)

In [50]:
query2 = Weather_text \
    .writeStream \
    .outputMode("complete") \
    .format("memory") \
    .queryName("weather_query") \
    .trigger(processingTime='12 seconds') \
    .start()

print("Sleeping...")
sleep(60)
print("Waking Up...")
    
query2.stop()

Sleeping...
Waking Up...


In [53]:
spark.sql('''
    SELECT City, 
        WeatherText, 
        window, 
        Count
    FROM (
        SELECT *, 
            row_number() OVER (PARTITION BY City, window order BY Count desc) AS rn
    FROM weather_query) temp
    WHERE rn <= 3
    ORDER BY window DESC, rn
    LIMIT 30
    ''').toPandas()

Unnamed: 0,City,WeatherText,window,Count
0,Perth,light rain shower,"(2022-09-11 14:36:22, 2022-09-11 14:36:34)",1
1,Perth,partly sunny,"(2022-09-11 14:36:21, 2022-09-11 14:36:33)",1
2,Perth,light rain shower,"(2022-09-11 14:36:21, 2022-09-11 14:36:33)",1
3,Perth,cloudy,"(2022-09-11 14:36:20, 2022-09-11 14:36:32)",1
4,Perth,light rain shower,"(2022-09-11 14:36:20, 2022-09-11 14:36:32)",1
5,Perth,partly sunny,"(2022-09-11 14:36:20, 2022-09-11 14:36:32)",1
6,Perth,light rain shower,"(2022-09-11 14:36:19, 2022-09-11 14:36:31)",1
7,Perth,mostly cloudy,"(2022-09-11 14:36:19, 2022-09-11 14:36:31)",1
8,Perth,cloudy,"(2022-09-11 14:36:19, 2022-09-11 14:36:31)",1
9,Perth,partly sunny,"(2022-09-11 14:36:18, 2022-09-11 14:36:30)",2


In [60]:
spark.sql('''
    SELECT City, 
        WeatherText, 
        window, 
        Count
    FROM (
        SELECT *, 
            row_number() OVER (PARTITION BY City, window order BY Count desc) AS rn
    FROM weather_query) temp
    WHERE rn <= 3
    ORDER BY window DESC, rn
    ''').toPandas().to_csv(os.path.join(output_location, "weather_text.csv"), index = False)