# Preprocess the TLC Data

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Start (or reuse) the Spark session for this notebook. Each setting is applied
# to the builder in turn before the session is requested.
_builder = SparkSession.builder.appName("preprocess")
# Render DataFrames eagerly when they are the last expression of a cell.
_builder = _builder.config("spark.sql.repl.eagerEval.enabled", True)
_builder = _builder.config("spark.sql.parquet.cacheMetadata", "true")
_builder = _builder.config("spark.executor.memory", "4g")
spark = _builder.getOrCreate()

In [3]:
# Load every parquet file found under the raw 2019 yellow-taxi directory.
sdf = spark.read.format('parquet').load('../data/raw/yellow_taxi_data_2019/')

# Preliminary data analysis

In [4]:
# print(f'total of {sdf.count():,} rows')

In [5]:
# sdf.printSchema()

In [6]:
# Attributes that play no part in this analysis; they are removed up front.
UNUSED_COLUMNS = [
    'VendorID',
    'store_and_fwd_flag',
    'fare_amount',
    'extra',
    'mta_tax',
    'tolls_amount',
    'improvement_surcharge',
    'total_amount',
    'congestion_surcharge',
    'airport_fee',
]

new_sdf = sdf.drop(*UNUSED_COLUMNS)

In [7]:
# Check summary statistics for the attributes of interest
# new_sdf.describe(['tip_amount', 'trip_distance', 'passenger_count', 'RatecodeID', 'payment_type']).show()

In [8]:
# Count the null values in each column
#for column in new_sdf.columns:
    #print(f'{column} has {new_sdf.where(F.col(column).isNull()).count()} null value(s)')

In [9]:
# Clean the trip records and derive the features used for analysis.
#
# Row filters, applied in order:
#   1. drop rows with a null in ANY remaining column
#   2. keep trips whose pick-up and drop-off both fall in 2019
#   3. keep tip amounts in [0, 1000)
#   4. keep trip distances in [0, 1000)
#   5. keep RatecodeID values 1-6
#   6. keep payment_type 1 or 2 (credit card / cash)
#   7. drop trips with a passenger count of zero
#   8. drop trips starting or ending in zone 264 or 265 (unknown zones)
#
# Derived columns:
#   PUMonth, PUDayofweek, PUDay, PUHour  - parts of the pick-up timestamp
#   trip_distance(km)                    - trip_distance * 1.60934
#   time_difference(s)                   - trip duration in seconds
#   is_weekend                           - pick-up day is Sunday or Saturday
#   tips_given                           - tip_amount > 0

new_sdf_mdf = (
    new_sdf
    # how='any' removes a row when any column is null, not just
    # passenger_count / RatecodeID.
    .dropna(how='any')
    .filter(
        (F.year('tpep_pickup_datetime') == 2019)
        & (F.year('tpep_dropoff_datetime') == 2019)
    )
    .filter((F.col('tip_amount') >= 0) & (F.col('tip_amount') < 1000))
    .filter((F.col('trip_distance') >= 0) & (F.col('trip_distance') < 1000))
    .filter(F.col('RatecodeID').isin(list(range(1, 7))))
    .filter(F.col('payment_type').isin([1, 2]))
    .filter(F.col('passenger_count') != 0)
    # `~` is the Column negation operator; a null still evaluates to null
    # and is filtered out, exactly as with `== False`.
    .filter(
        (~F.col('PULocationID').isin([264, 265]))
        & (~F.col('DOLocationID').isin([264, 265]))
    )
    .withColumn('PUMonth', F.month(F.col('tpep_pickup_datetime')))
    .withColumn('PUDayofweek', F.dayofweek(F.col('tpep_pickup_datetime')))
    .withColumn('PUDay', F.dayofmonth(F.col('tpep_pickup_datetime')))
    .withColumn('PUHour', F.hour(F.col('tpep_pickup_datetime')))
    # 1.60934 is the mile-to-kilometre factor (presumably trip_distance is
    # recorded in miles - confirm against the data dictionary).
    .withColumn('trip_distance(km)', F.col('trip_distance') * 1.60934)
    # Casting a timestamp to long yields epoch seconds, so the difference
    # is the trip duration in seconds.
    .withColumn(
        'time_difference(s)',
        F.col('tpep_dropoff_datetime').cast('long')
        - F.col('tpep_pickup_datetime').cast('long'),
    )
    # dayofweek() numbers Sunday as 1 and Saturday as 7.
    .withColumn('is_weekend', F.dayofweek('tpep_pickup_datetime').isin([1, 7]))
    .withColumn('tips_given', F.col('tip_amount') > 0)
)
                     

In [10]:
# Tip statistics (maximum, mean, trip count) for every
# pick-up zone / drop-off zone pair.
aggregated_result1 = (
    new_sdf_mdf
    .groupBy("PULocationID", "DOLocationID")
    .agg(
        F.max("tip_amount").alias("max_tip_amount_usd"),
        F.mean("tip_amount").alias("avg_tip_amount_usd"),
        F.count("PULocationID").alias("total_trip_count"),
    )
)

aggregated_result1.show()

+------------+------------+------------------+------------------+----------------+
|PULocationID|DOLocationID|max_tip_amount_usd|avg_tip_amount_usd|total_trip_count|
+------------+------------+------------------+------------------+----------------+
|         231|         261|              90.0|1.3922246071012807|           27488|
|         170|         179|              18.0| 3.136674528301886|            2544|
|          90|         142|             66.69|2.4137433862433864|           16632|
|         114|         100|             26.29|1.7605119236883933|           12580|
|         234|         144|             57.86|1.8175478515625012|           40960|
|          87|          33|              13.8| 2.248647824318829|            4918|
|         249|         225|              35.0|  3.45905325443787|            1859|
|         116|         127|              12.0| 1.264627507163324|             698|
|         246|         249|            211.69| 1.888683311934877|           36607|
|   

In [11]:
# Tip statistics (maximum, mean, trip count) broken down by the pick-up
# month, hour, day of week and the weekend flag.
aggregated_result2 = (
    new_sdf_mdf
    .groupBy("PUMonth", "PUHour", "PUDayofweek", "is_weekend")
    .agg(
        F.max("tip_amount").alias("max_tip_amount_usd"),
        F.mean("tip_amount").alias("avg_tip_amount_usd"),
        F.count("PULocationID").alias("total_trip_count"),
    )
)

aggregated_result2.show()

+-------+------+-----------+----------+------------------+------------------+----------------+
|PUMonth|PUHour|PUDayofweek|is_weekend|max_tip_amount_usd|avg_tip_amount_usd|total_trip_count|
+-------+------+-----------+----------+------------------+------------------+----------------+
|      3|    13|          2|     false|              50.0|1.8932726883561646|            4672|
|      8|     9|          4|     false|              90.0| 2.388770749856935|           27952|
|      4|    14|          2|     false|              39.3| 2.203985611510791|            4865|
|      4|    13|          2|     false|              25.0|1.7573219306466719|            4268|
|      3|     4|          6|     false|              75.0|2.3637684704093482|           78910|
|      3|    10|          3|     false|              45.0|2.5065788161550273|           28078|
|      3|     7|          4|     false|             106.0|2.3678223647294816|           62375|
|      3|    19|          4|     false|           

In [12]:
# Tip statistics (maximum, mean, trip count) per passenger count,
# sorted by ascending passenger count.
aggregated_result3 = (
    new_sdf_mdf
    .groupBy("passenger_count")
    .agg(
        F.max("tip_amount").alias("max_tip_amount_usd"),
        F.mean("tip_amount").alias("avg_tip_amount_usd"),
        F.count("PULocationID").alias("total_trip_count"),
    )
    .orderBy("passenger_count")
)

aggregated_result3.show()

+---------------+------------------+------------------+----------------+
|passenger_count|max_tip_amount_usd|avg_tip_amount_usd|total_trip_count|
+---------------+------------------+------------------+----------------+
|            1.0|            787.25|2.1998592189411283|        57747469|
|            2.0|             600.0| 2.204927920239129|        12504689|
|            3.0|             338.0| 2.128770535330322|         3506396|
|            4.0|             333.0|2.0072217370516823|         1664936|
|            5.0|             444.8|2.2290614052397255|         3349763|
|            6.0|             495.0|2.2148070469215657|         2013702|
|            7.0|             23.82| 5.241421800947866|             211|
|            8.0|              25.2| 6.363382352941177|             136|
|            9.0|              30.4| 5.760357142857144|             112|
+---------------+------------------+------------------+----------------+

