Preprocess the raw New York weather data into hourly and daily tables of temperature (°F) and wind speed (mph).

In [50]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F

# Build (or reuse) the Spark session that runs every job in this notebook.
# Settings are applied in the order listed below.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,   # eager evaluation in the REPL/notebook
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",    # keep timestamps in UTC, not the local Australian zone
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
}

session_builder = SparkSession.builder.appName("MAST30034 Project 1")
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

spark = session_builder.getOrCreate()

In [51]:
# Read the raw New York weather CSV data, using the first row as column names.
# No schema is supplied, so every column is loaded as a string.
weather = spark.read.csv(
    "../../mast30034-project-1-janggani/data/raw/nyc",
    header=True,
)

# Keep only the columns used downstream: timestamp, temperature and wind speed.
relevant_columns = ["datetime", "temp", "windspeed"]
weather_r = weather.select(*relevant_columns)
weather_r

datetime,temp,windspeed
2019-02-11T00:00:00,1.0,5.4
2019-02-11T01:00:00,1.1,8.6
2019-02-11T02:00:00,-0.2,0.0
2019-02-11T03:00:00,-0.1,9.8
2019-02-11T04:00:00,-0.8,12.5
2019-02-11T05:00:00,-0.8,8.3
2019-02-11T06:00:00,-0.9,8.6
2019-02-11T07:00:00,-0.5,6.7
2019-02-11T08:00:00,0.1,5.4
2019-02-11T09:00:00,0.6,9.4


In [52]:
# Unit conversion, both rounded to 2 decimal places:
#   - wind speed: km/h -> miles per hour (1 mile = 1.609344 km)
#   - temperature: Celsius -> Fahrenheit (F = C * 1.8 + 32)
wind_mph = F.round(F.col("windspeed") / 1.609344, 2)
temp_fahrenheit = F.round(F.col("temp") * 1.8 + 32, 2)

# Keep the timestamp plus the converted values; the original metric columns are dropped.
weather_new = weather_r.select(
    "datetime",
    temp_fahrenheit.alias("temp_f"),
    wind_mph.alias("wind speed"),
)
weather_new

datetime,temp_f,wind speed
2019-02-11T00:00:00,33.8,3.36
2019-02-11T01:00:00,33.98,5.34
2019-02-11T02:00:00,31.64,0.0
2019-02-11T03:00:00,31.82,6.09
2019-02-11T04:00:00,30.56,7.77
2019-02-11T05:00:00,30.56,5.16
2019-02-11T06:00:00,30.38,5.34
2019-02-11T07:00:00,31.1,4.16
2019-02-11T08:00:00,32.18,3.36
2019-02-11T09:00:00,33.08,5.84


In [53]:
# Append an "hour" column holding the hour-of-day extracted from each timestamp.
hour_of_day = F.hour("datetime").alias("hour")
weather_news = weather_new.select("*", hour_of_day)

In [54]:
# Add a "date" column: the first 10 characters of the timestamp string (yyyy-MM-dd).
calendar_date = F.substring("datetime", 1, 10)
weather_news = weather_news.withColumn("date", calendar_date)

In [55]:
# Replace the "T" date/time separator in each timestamp with a space.
datetime_spaced = F.regexp_replace(F.col("datetime"), "T", " ")
weather_curated = weather_news.withColumn("datetime", datetime_spaced)
weather_curated

datetime,temp_f,wind speed,hour,date
2019-02-11 00:00:00,33.8,3.36,0,2019-02-11
2019-02-11 01:00:00,33.98,5.34,1,2019-02-11
2019-02-11 02:00:00,31.64,0.0,2,2019-02-11
2019-02-11 03:00:00,31.82,6.09,3,2019-02-11
2019-02-11 04:00:00,30.56,7.77,4,2019-02-11
2019-02-11 05:00:00,30.56,5.16,5,2019-02-11
2019-02-11 06:00:00,30.38,5.34,6,2019-02-11
2019-02-11 07:00:00,31.1,4.16,7,2019-02-11
2019-02-11 08:00:00,32.18,3.36,8,2019-02-11
2019-02-11 09:00:00,33.08,5.84,9,2019-02-11


In [56]:
# Keep only the curated columns, in output order (the full timestamp is no longer needed).
curated_columns = ["date", "hour", "temp_f", "wind speed"]
weather_curated = weather_curated.select(*curated_columns)

In [57]:
# Hourly table: join the date and hour into a single space-separated "date hour" key
# and keep the hourly temperature and wind speed alongside it.
# NOTE(review): the hour is appended as-is, so keys presumably look like
# "2019-02-11 0" (not zero-padded) -- confirm this matches whatever joins on it.
date_hour_key = F.concat_ws(" ", F.col("date"), F.col("hour")).alias("date hour")
weather_dh = weather_curated.select(date_hour_key, "temp_f", "wind speed")

# Daily table: mean temperature and mean wind speed per date, rounded to 2 d.p.
# Functions are referenced through the explicit `F.` namespace (as in the earlier
# cells) instead of relying on the wildcard import, under which `round` shadows
# Python's builtin of the same name.
weather_daily = (
    weather_curated
    .groupBy("date")
    .agg(
        F.round(F.mean("temp_f"), 2).alias("temp"),
        F.round(F.mean("wind speed"), 2).alias("wind speed"),
    )
    .orderBy("date")
)


In [58]:
# Persist both curated tables as parquet, replacing any previous output.
curated_outputs = [
    (weather_daily, "../../mast30034-project-1-janggani/data/curated/weather/weather_daily.parquet"),
    (weather_dh, "../../mast30034-project-1-janggani/data/curated/weather/weather_dh.parquet"),
]
for curated_table, destination in curated_outputs:
    curated_table.write.parquet(destination, mode="overwrite")

22/08/24 21:47:34 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 924288 ms exceeds timeout 120000 ms
22/08/24 21:47:34 WARN SparkContext: Killing executors is not supported by current scheduler.
