In [20]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [21]:
spark = (
    SparkSession.builder.appName("preprocessing of taxi data")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

In [22]:
analysis_sdf = spark.read.parquet('../data/curated/feature_engineered')

In [23]:
analysis_sdf.count()

116591525

In [24]:
analysis_sdf

PULocationID,fare_amount,extra,tip_amount,duration (minutes),date,time,year,month,day,average_temperature,precip,time_float,cos_time,sin_time,date_float,cos_date,sin_date,month_float,cos_month,sin_month,trip_value,holiday
238,6.0,0.5,2.0,4.367,2016-04-01,00:41:18,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.5882757,False
166,17.0,0.5,3.65,19.2,2016-04-01,00:24:47,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.898875,False
164,5.5,0.5,1.35,4.133,2016-04-01,00:45:45,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.4511493,False
170,5.5,0.5,1.7,3.517,2016-04-01,00:19:40,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.7865226,False
158,10.0,0.5,2.8,12.733,2016-04-01,00:04:05,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.85233647,False
170,21.0,0.5,4.45,24.783,2016-04-01,00:30:36,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.8544244,False
246,5.5,0.5,1.35,5.817,2016-04-01,00:42:39,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.031047,False
164,7.0,0.5,1.65,6.433,2016-04-01,00:53:36,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.1606405,False
161,4.5,0.5,1.2,3.7,2016-04-01,00:50:50,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.3673513,False
48,22.5,0.5,4.75,33.267,2016-04-01,00:13:26,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.68067455,False


# Outlier analysis on trip value
I wanted to capture high value trips, indicating that more weight will be given to tip amounts as high as 999.99 a trip. 
However, since such occasions are too limited, an outlier analysis of Q1− IQR * 5< Non−Outlier < Q3+ IQR * 5 where IQR= Q3−Q1
is conducted where 5*IQR is feasible as the value between the first quantile and third is around 0.3.

In [25]:
analysis_sdf.approxQuantile("trip_value", [0.25,0.5,0.75,0.99], 0.1)

                                                                                

[0.7377777695655823, 0.900767982006073, 1.101974368095398, 1579.8486328125]

In [26]:
analysis_sdf.describe("trip_value")

summary,trip_value
count,116591525.0
mean,1.0005391948148343
stddev,0.8746850366692198
min,0.0036487211
max,1579.8486


In [27]:
quantiles = [0.7377777695655823, 0.900767982006073, 1.0946341753005981, 1579.8486328125]
Q1 = quantiles[0]
Q3 = quantiles[2]
IQR = Q3 - Q1
lower_bound = Q1 - (IQR * 5)
upper_bound = Q3 + (IQR * 5)
raw_count = analysis_sdf.count()
raw_count

116591525

In [28]:
analysis_sdf = analysis_sdf.filter((F.col("trip_value") >= lower_bound) & (F.col("trip_value") < upper_bound))

In [29]:
print(f"{raw_count - analysis_sdf.count()} has been removed")
analysis_sdf.count()

310678 has been removed


116280847

In [30]:
analysis_sdf.approxQuantile("trip_value", [0.25,0.5,0.75,0.99], 0.1)

                                                                                

[0.7389476299285889, 0.9016888737678528, 1.100083351135254, 2.8788321018218994]

In [33]:
analysis_sdf.describe("trip_value")

                                                                                

summary,trip_value
count,116280847.0
mean,0.9853645865751468
stddev,0.3091481521432794
min,0.0036487211
max,2.878832


In [31]:
# decided to take trips greater than third quantile as high value trips
Q3 = 1.1
analysis_sdf = analysis_sdf.withColumn("high_value",    
                                       F.when(F.col("trip_value") > Q3, True)
                                       .otherwise(False)
                                      )
analysis_sdf

PULocationID,fare_amount,extra,tip_amount,duration (minutes),date,time,year,month,day,average_temperature,precip,time_float,cos_time,sin_time,date_float,cos_date,sin_date,month_float,cos_month,sin_month,trip_value,holiday,high_value
238,6.0,0.5,2.0,4.367,2016-04-01,00:41:18,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.5882757,False,True
166,17.0,0.5,3.65,19.2,2016-04-01,00:24:47,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.898875,False,False
164,5.5,0.5,1.35,4.133,2016-04-01,00:45:45,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.4511493,False,True
170,5.5,0.5,1.7,3.517,2016-04-01,00:19:40,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.7865226,False,True
158,10.0,0.5,2.8,12.733,2016-04-01,00:04:05,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.85233647,False,False
170,21.0,0.5,4.45,24.783,2016-04-01,00:30:36,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.8544244,False,False
246,5.5,0.5,1.35,5.817,2016-04-01,00:42:39,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.031047,False,False
164,7.0,0.5,1.65,6.433,2016-04-01,00:53:36,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.1606405,False,True
161,4.5,0.5,1.2,3.7,2016-04-01,00:50:50,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.3673513,False,True
48,22.5,0.5,4.75,33.267,2016-04-01,00:13:26,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.68067455,False,False


In [32]:
analysis_sdf.write.mode('overwrite').parquet('../data/curated/analysed')

                                                                                

22/08/23 03:31:19 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 883789 ms exceeds timeout 120000 ms
22/08/23 03:31:19 WARN SparkContext: Killing executors is not supported by current scheduler.
