In [78]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [79]:
# Session-level settings, applied to the builder one by one below:
#   - eager evaluation, so a bare DataFrame expression renders in the notebook
#   - parquet metadata caching
#   - UTC as the SQL session time zone
#   - 15 GB of driver memory
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "15g",
}

session_builder = SparkSession.builder.appName("preprocessing of taxi data")
for conf_key, conf_value in spark_settings.items():
    session_builder = session_builder.config(conf_key, conf_value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()

In [80]:
# Read the parquet dataset that the rest of the notebook aggregates.
analysed_path = '../data/curated/analysed'
aggregate_sdf = spark.read.parquet(analysed_path)

In [81]:
# Preview the loaded DataFrame. Eager evaluation is enabled on the session, so
# the bare expression renders its leading rows as a table.
aggregate_sdf

PULocationID,fare_amount,extra,tip_amount,duration (minutes),date,time,year,month,day,average_temperature,precip,time_float,cos_time,sin_time,date_float,cos_date,sin_date,month_float,cos_month,sin_month,trip_value,holiday,high_value
238,6.0,0.5,2.0,4.367,2016-04-01,00:41:18,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.5882757,False,True
166,17.0,0.5,3.65,19.2,2016-04-01,00:24:47,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.898875,False,False
164,5.5,0.5,1.35,4.133,2016-04-01,00:45:45,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.4511493,False,True
170,5.5,0.5,1.7,3.517,2016-04-01,00:19:40,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.7865226,False,True
158,10.0,0.5,2.8,12.733,2016-04-01,00:04:05,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.85233647,False,False
170,21.0,0.5,4.45,24.783,2016-04-01,00:30:36,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.8544244,False,False
246,5.5,0.5,1.35,5.817,2016-04-01,00:42:39,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.031047,False,False
164,7.0,0.5,1.65,6.433,2016-04-01,00:53:36,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.1606405,False,True
161,4.5,0.5,1.2,3.7,2016-04-01,00:50:50,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.3673513,False,True
48,22.5,0.5,4.75,33.267,2016-04-01,00:13:26,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.68067455,False,False


In [82]:
def cnt_cond(cond):
    """Return an aggregate expression that counts the rows where `cond` holds.

    Rows where `cond` is false or null contribute 0, so the result is never
    null for a non-empty group.
    """
    return F.sum(F.when(cond, 1).otherwise(0))


# Columns for which one value per group is kept, in output order.
# NOTE(review): `first` returns an arbitrary row's value after a shuffle. That
# is harmless for columns that are constant within a group, but `time` and
# `trip_value` differ between trips of the same
# (PULocationID, date, time_float) group, so the value kept for them is
# arbitrary -- confirm this is intended (e.g. versus an average).
first_value_columns = [
    "average_temperature",
    "time",
    "precip",
    "cos_time",
    "sin_time",
    "date_float",
    "cos_date",
    "sin_date",
    "month_float",
    "cos_month",
    "sin_month",
    "trip_value",
    "holiday",
]

# Collapse the rows to one per pickup zone, date and time bucket.
# Every aggregate is aliased explicitly so the output column names do not
# depend on the names Spark generates for unnamed expressions
# (e.g. "first(precip)" or "sum(CASE WHEN ...)").
# NOTE: this rebinds `aggregate_sdf`, so the cell is not idempotent -- reload
# the data before running it again, because the aggregated frame no longer
# has a `high_value` column.
aggregate_sdf = aggregate_sdf.groupBy("PULocationID", "date", "time_float").agg(
    *[F.first(column_name).alias(column_name) for column_name in first_value_columns],
    cnt_cond(F.col("high_value") == True).alias("high_value_trips"),
)

In [83]:

# Map the column names Spark generates for unnamed aggregates to the final
# names. Each source name appears once (the original chain renamed
# "first(trip_value)" twice; the second call was a no-op).
generated_to_final = {
    "first(average_temperature)": "average_temperature",
    "first(time)": "time",
    "first(precip)": "precip",
    "first(cos_time)": "cos_time",
    "first(sin_time)": "sin_time",
    "first(date_float)": "date_float",
    "first(cos_date)": "cos_date",
    "first(sin_date)": "sin_date",
    "first(month_float)": "month_float",
    "first(cos_month)": "cos_month",
    "first(sin_month)": "sin_month",
    "first(trip_value)": "trip_value",
    "first(holiday)": "holiday",
    "sum(CASE WHEN (high_value = true) THEN 1 ELSE 0 END)": "high_value_trips",
}

# Renaming keeps each column's position, so the order of the calls is irrelevant.
for generated_name, final_name in generated_to_final.items():
    aggregate_sdf = aggregate_sdf.withColumnRenamed(generated_name, final_name)

# withColumnRenamed does nothing when the source column is absent, so a
# generated name that differs from the keys above would otherwise go unnoticed.
# Checking the final names (rather than the source names) also keeps this cell
# valid when the columns already carry their final names.
missing_columns = sorted(set(generated_to_final.values()) - set(aggregate_sdf.columns))
if missing_columns:
    raise ValueError(
        f"Expected columns missing after renaming: {missing_columns}; "
        f"available columns: {aggregate_sdf.columns}"
    )
                            

In [84]:
# Persist the aggregated DataFrame as parquet, replacing any previous output.
aggregated_path = '../data/curated/aggregated'
aggregated_writer = aggregate_sdf.write.mode('overwrite')
aggregated_writer.parquet(aggregated_path)

                                                                                