In [None]:
# NOTE: some aggregation is deferred to the modelling stage rather than done here,
# because the un-aggregated data is still needed for visualisation.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from datetime import datetime

In [None]:


# Build (or reuse) the Spark session that runs every job in this notebook.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "2g",
    "spark.executor.memory": "4g",
}

session_builder = SparkSession.builder.appName("MAST30034 Project 1")
for option_key, option_value in spark_options.items():
    # Apply the settings one by one, in the order listed above.
    session_builder = session_builder.config(option_key, option_value)

spark = session_builder.getOrCreate()

In [None]:
# Load the raw green-taxi trip data (parquet) into a Spark DataFrame.
sdf_green = spark.read.format('parquet').load('../data/raw/green_data')

In [None]:
# Initial data amount: number of rows in the raw data, before any filtering.
sdf_green.count()

In [None]:
# Print the column names and types of the raw DataFrame.
sdf_green.printSchema()

In [None]:
# Preview the first 5 rows of the raw data.
sdf_green.limit(5)

In [None]:
# Exploratory preview: store the hour of the pickup timestamp in a `shift` column.
# The following cell rebuilds `sdf_green_pre` from `sdf_green`, so this numeric
# `shift` is replaced there.
pickup_hour = F.hour(F.col('lpep_pickup_datetime'))
sdf_green_pre = sdf_green.withColumn('shift', pickup_hour)
sdf_green_pre.limit(5)

In [None]:
# Feature engineering and cleaning of the raw green-taxi trips.
#
# Columns added:
#   shift - 'Morning' when the pickup hour is 4..17 inclusive, otherwise 'Night'
#   Day   - abbreviated day-of-week name of the pickup (date_format pattern "E",
#           e.g. Mon, Tue, ..., Sun) -- a name, not a 1-7 numeric code
#   fare  - total_amount minus tip_amount, rounded to 2 decimals
#   Date  - calendar date of the pickup
pickup_ts = F.col('lpep_pickup_datetime')
pickup_hour = F.hour(pickup_ts)

sdf_green_pre = sdf_green.withColumn(
    'shift',
    F.when(pickup_hour.between(4, 17), 'Morning').otherwise('Night')
)
# The column is named 'Day' (capitalised) because that is the spelling used when the
# relevant columns are selected later; this avoids relying on Spark's case-insensitive
# column resolution.
sdf_green_pre = sdf_green_pre.withColumn('Day', F.date_format(pickup_ts, "E"))

sdf_green_pre2 = sdf_green_pre.withColumn(
    'fare',
    F.round(F.col('total_amount') - F.col('tip_amount'), 2)
)

# Keep only rows that pass every sanity check below.
sdf_green_pre3 = sdf_green_pre2.where(
    (F.col('fare') > 2.5)
    & (F.col('passenger_count') > 0)
    & (F.col('passenger_count') < 7)
    & (F.col('trip_distance') > 0)
    & (F.col('PULocationID') <= 263)
    # Exclude location 1 (EWR): per the user guide it is not in the green-taxi zone.
    & (F.col('PULocationID') > 1)
)
sdf_green_pre3 = sdf_green_pre3.withColumn('Date', F.to_date('lpep_pickup_datetime'))

# Restrict to pickups dated from 2021-07-01 (inclusive) up to 2022-05-01 (exclusive).
sdf_green_pre4 = sdf_green_pre3.filter(F.col('Date') >= '2021-07-01')
sdf_green_pre4 = sdf_green_pre4.filter(F.col('Date') < '2022-05-01')
sdf_green_pre4.limit(5)

In [None]:
# Column selection rationale:
#  - Cash tips are not recorded, so tips are disregarded and `fare`
#    (total amount minus tip) is kept instead. The remaining components of the
#    total amount are dropped, and the payment type no longer matters.
#  - VendorID is dropped because green taxis are analysed as a whole.
rel_col = (
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
    'store_and_fwd_flag',
    'RatecodeID',
    'PULocationID',
    'DOLocationID',
    'passenger_count',
    'fare',
    'trip_type',
    'shift',
    'Date',
    'Day',
)
sdf_green_pre5 = sdf_green_pre4.select(*rel_col)
sdf_green_pre5.limit(5)


In [None]:
# Load the weather dataset that will be joined onto the trips.
# The first CSV line supplies the column names; no schema is given or inferred here.
weather = spark.read.csv("../data/raw/Weather.csv", header=True)
weather.show(5)

In [None]:
# Keep NAME, DATE and TAVG for the three stations near New York City.
rel_cols = ('NAME', 'DATE', 'TAVG')
nyc_station_names = [
    'JFK INTERNATIONAL AIRPORT, NY US',
    'NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US',
    'NY CITY CENTRAL PARK, NY US',
]
weather_NYC = weather.select(*rel_cols).filter(F.col('NAME').isin(nyc_station_names))

# Look at the Central Park rows on their own.
central_park_only = F.col('NAME') == 'NY CITY CENTRAL PARK, NY US'
weather_NYC.filter(central_park_only).limit(5)

In [None]:
# NYC Central Park has no temperature (TAVG) data, so Newark and JFK are used instead.
# Following the weatherspark weighting, Central Park's 58% share is split between
# Newark and JFK in a 2:1 ratio, so Newark contributes 2/3 (~67%) and JFK 1/3 (~33%).
rel_cols = ('NAME', 'DATE', 'TAVG')
newark_station = 'NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US'
jfk_station = 'JFK INTERNATIONAL AIRPORT, NY US'

weather_NYC = weather.select(*rel_cols).filter(
    (F.col('NAME') == jfk_station) | (F.col('NAME') == newark_station)
)

# The CSV is read without a schema, so TAVG arrives as a string; convert it explicitly
# before doing arithmetic.
tavg = F.col('TAVG').cast('double')

# WTAVG is this station's weighted share of the day's temperature. It is kept as a
# fractional value on purpose: truncating each share to an integer before the shares
# are summed biases the result low (e.g. 50 and 50 would give 33 + 16 = 49).
weather_NYC = weather_NYC.withColumn(
    'WTAVG',
    F.when(F.col('NAME') == newark_station, tavg * 2 / 3).otherwise(tavg / 3)
)
weather_NYC = weather_NYC.withColumn('Date', F.to_date('DATE'))
weather_NYC.limit(5)

In [None]:
# Combine the per-station weighted shares (WTAVG) into a single temperature per date.
cols = ('Date', 'WTAVG')
weather_NYC_cur = weather_NYC.select(*cols)
agg_NYC_weather = (
    weather_NYC_cur
    .groupby('Date')
    .agg(
        # Round the total to the nearest whole degree and store it as a long, so that
        # `temp` is an integer column whether WTAVG holds integer or fractional shares.
        F.round(F.sum('WTAVG')).cast('long').alias('temp')
    )
)
# NOTE(review): sum() ignores nulls, so a date on which one station has no TAVG would
# get only the other station's share -- confirm both stations report on every date used.
agg_NYC_weather.limit(5)

In [None]:
# Attach the daily temperature to each trip by matching on `Date`.
# A left join keeps every trip, including those with no weather row for their date.
merged_sdf = sdf_green_pre5.join(agg_NYC_weather, 'Date', 'left')
merged_sdf.sort('Date').limit(5)

In [None]:
# Save the merged DataFrame as parquet in the curated data folder,
# overwriting any output already at that path.
merged_sdf.write.parquet('../data/curated/result', mode='overwrite')