# PySpark setup

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Session-level settings, applied in the order listed.
session_options = {
    # Render DataFrames eagerly when they are the last expression of a cell.
    "spark.sql.repl.eagerEval.enabled": True,
    # Cache Parquet schema metadata.
    "spark.sql.parquet.cacheMetadata": "true",
    # Interpret and display timestamps in UTC.
    "spark.sql.session.timeZone": "Etc/UTC",
    # Memory requested for the driver process.
    "spark.driver.memory": "15g",
}

# Start from a named builder, attach every option, then create the session
# (or reuse one that is already running).
builder = SparkSession.builder.appName("preprocessing of taxi data")
for option_name, option_value in session_options.items():
    builder = builder.config(option_name, option_value)
spark = builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/16 16:22:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/16 16:22:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load the New York weather export; the first row of the file holds the column names.
# No schema is supplied or inferred, so every column is read as a string.
weather_sdf = spark.read.csv(
    "../data/new york 2016-01-01 to 2017.csv",
    header=True,
)

# Print the column names and types that were loaded.
weather_sdf.printSchema()

                                                                                

root
 |-- name: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- tempmax: string (nullable = true)
 |-- tempmin: string (nullable = true)
 |-- temp: string (nullable = true)
 |-- feelslikemax: string (nullable = true)
 |-- feelslikemin: string (nullable = true)
 |-- feelslike: string (nullable = true)
 |-- dew: string (nullable = true)
 |-- humidity: string (nullable = true)
 |-- precip: string (nullable = true)
 |-- precipprob: string (nullable = true)
 |-- precipcover: string (nullable = true)
 |-- preciptype: string (nullable = true)
 |-- snow: string (nullable = true)
 |-- snowdepth: string (nullable = true)
 |-- windgust: string (nullable = true)
 |-- windspeed: string (nullable = true)
 |-- winddir: string (nullable = true)
 |-- sealevelpressure: string (nullable = true)
 |-- cloudcover: string (nullable = true)
 |-- visibility: string (nullable = true)
 |-- solarradiation: string (nullable = true)
 |-- solarenergy: string (nullable = true)
 |-- uvindex: stri

# Select columns of interest for feature engineering 

In [4]:
# Keep only the date, the three temperature readings and the precipitation column.
weather_sdf = weather_sdf.select(
    "datetime",
    "tempmax",
    "tempmin",
    "temp",
    "precip",
)
weather_sdf

datetime,tempmax,tempmin,temp,precip
2016-01-01,5.8,1.7,4.1,0.0
2016-01-02,4.4,0.7,2.2,0.0
2016-01-03,7.2,1.6,3.9,0.0
2016-01-04,2.4,-9.2,-2.3,0.0
2016-01-05,-1.8,-11.3,-6.3,0.0
2016-01-06,4.6,-4.0,0.2,0.0
2016-01-07,8.0,-0.4,3.8,0.0
2016-01-08,7.0,1.6,4.6,0.0
2016-01-09,8.4,4.6,6.6,0.05
2016-01-10,14.9,5.1,10.6,40.9


# Check for null values to decide whether imputation is needed.

In [5]:
# Count the null values in every column with a single aggregation (one Spark job)
# instead of launching a separate filter-and-count job per column.
# F.when(...) without an otherwise() yields NULL for non-matching rows, and
# F.count skips NULLs, so each aggregate is the number of null cells in that column.
null_count_exprs = [
    F.count(F.when(weather_sdf[column_name].isNull(), 1)).alias(column_name)
    for column_name in weather_sdf.columns
]

# The aggregation returns exactly one row; convert it to {column name: null count}.
Dict_Null = weather_sdf.agg(*null_count_exprs).first().asDict()
Dict_Null

{'datetime': 0, 'tempmax': 0, 'tempmin': 0, 'temp': 0, 'precip': 0}

# The daily maximum and minimum temperatures are averaged to engineer a daily average temperature feature.

In [6]:
# Midpoint of the daily maximum and minimum temperature.
# Both inputs were loaded as string columns; the arithmetic relies on Spark
# coercing them to numbers.
temperature_midpoint = (F.col("tempmax") + F.col("tempmin")) / 2

# Store the midpoint, rounded to two decimal places, as a new column.
weather_sdf = weather_sdf.withColumn(
    "average_temperature",
    F.round(temperature_midpoint, 2),
)
weather_sdf

datetime,tempmax,tempmin,temp,precip,average_temperature
2016-01-01,5.8,1.7,4.1,0.0,3.75
2016-01-02,4.4,0.7,2.2,0.0,2.55
2016-01-03,7.2,1.6,3.9,0.0,4.4
2016-01-04,2.4,-9.2,-2.3,0.0,-3.4
2016-01-05,-1.8,-11.3,-6.3,0.0,-6.55
2016-01-06,4.6,-4.0,0.2,0.0,0.3
2016-01-07,8.0,-0.4,3.8,0.0,3.8
2016-01-08,7.0,1.6,4.6,0.0,4.3
2016-01-09,8.4,4.6,6.6,0.05,6.5
2016-01-10,14.9,5.1,10.6,40.9,10.0


# Retain attributes of interest.

In [7]:
# Drop the raw temperature readings; keep the date, the engineered average
# temperature and the precipitation.
weather_sdf = weather_sdf.select(
    "datetime",
    "average_temperature",
    "precip",
)
weather_sdf

datetime,average_temperature,precip
2016-01-01,3.75,0.0
2016-01-02,2.55,0.0
2016-01-03,4.4,0.0
2016-01-04,-3.4,0.0
2016-01-05,-6.55,0.0
2016-01-06,0.3,0.0
2016-01-07,3.8,0.0
2016-01-08,4.3,0.0
2016-01-09,6.5,0.05
2016-01-10,10.0,40.9


In [8]:
# Persist the curated weather features as Parquet, replacing any previous output
# at this location.
# NOTE(review): "datetime" and "precip" still have the string type they were
# loaded with - confirm that downstream readers expect strings.
weather_sdf.write.parquet(
    '../data/curated/preprocess_weather_result1',
    mode='overwrite',
)

                                                                                

22/08/16 19:55:52 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 937873 ms exceeds timeout 120000 ms
22/08/16 19:55:52 WARN SparkContext: Killing executors is not supported by current scheduler.
