# Preprocess the TLC Data

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Build (or reuse) the Spark session used throughout this notebook.
session_builder = SparkSession.builder.appName("preprocess")
# Turn on REPL eager evaluation and parquet metadata caching.
session_builder = session_builder.config("spark.sql.repl.eagerEval.enabled", True)
session_builder = session_builder.config("spark.sql.parquet.cacheMetadata", "true")
spark = session_builder.getOrCreate()
# Optional setting, currently disabled: .config("spark.executor.memory", "4g")

In [3]:
# Load every parquet file under the raw 2019 yellow-taxi directory into one DataFrame.
sdf = spark.read.parquet('../data/raw/yellow_taxi_data_2019/')

# Preliminary data analysis

In [12]:
# Count the raw rows once, then report the total with thousands separators.
raw_row_total = sdf.count()
print(f'total of {raw_row_total:,} rows')

total of 84,598,444 rows


In [13]:
# Show the column names, types and nullability of the raw data.
sdf.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: integer (nullable = true)



In [14]:
# Attributes that are of no interest to this analysis; drop them up front.
unused_columns = [
    'VendorID',
    'store_and_fwd_flag',
    'extra',
    'mta_tax',
    'tolls_amount',
    'improvement_surcharge',
    'total_amount',
    'congestion_surcharge',
    'airport_fee',
]

new_sdf = sdf.drop(*unused_columns)

In [15]:
# Summary statistics (count, mean, stddev, min, max) for the attributes of interest.
stats_columns = ['tip_amount', 'trip_distance', 'passenger_count', 'RatecodeID', 'payment_type']
new_sdf.describe(stats_columns).show()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|        tip_amount|     trip_distance|   passenger_count|        RatecodeID|       payment_type|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|          84598444|          84598444|          84154061|          84154061|           84598444|
|   mean| 2.190078737505638|3.0183506184817515|1.5626654190817957| 1.061297933084893| 1.2825886963121922|
| stddev|15.638996154306168| 8.093902044464816|1.2079081585219809|0.7596134346977569|0.48687669531746225|
|    min|            -221.0|         -37264.53|               0.0|               1.0|                  0|
|    max|         141492.02|          45977.22|               9.0|              99.0|                  5|
+-------+------------------+------------------+------------------+------------------+-------------------+



In [16]:
# Check null values.
# All per-column null counts are computed in ONE aggregation pass instead of
# launching a separate full-table Spark job for every column.
# F.when(...) without otherwise() yields NULL when the condition is false, and
# F.count skips NULLs, so each aggregate counts exactly the null rows of its column.
null_counts = new_sdf.select(
    [
        F.count(F.when(F.col(column).isNull(), 1)).alias(column)
        for column in new_sdf.columns
    ]
).first()

for column in new_sdf.columns:
    print(f'{column} has {null_counts[column]} null value(s)')

tpep_pickup_datetime has 0 null value(s)
tpep_dropoff_datetime has 0 null value(s)
passenger_count has 444383 null value(s)
trip_distance has 0 null value(s)
RatecodeID has 444383 null value(s)
PULocationID has 0 null value(s)
DOLocationID has 0 null value(s)
payment_type has 0 null value(s)
fare_amount has 0 null value(s)
tip_amount has 0 null value(s)


In [9]:
# Clean the trips and derive the pick-up features.
#
# Cleaning rules, in the order they are applied:
#   - Drop rows containing nulls (the null check above found them only in
#     passenger_count and RatecodeID).
#   - Keep only trips whose pick-up AND drop-off both fall in 2019.
#   - Tip amount in [0, 250); 250 is 50% of the upper fare bound.
#   - Trip distance in (0, 100).
#   - Fare amount in [2.5, 500); 2.5 is the initial charge.
#   - RatecodeID must be one of the integers 1-6.
#   - Only trips paid by credit card (payment_type == 1) are of interest.
#   - Passenger count must not be zero.
#   - Zones 264 and 265 are unknown zones; exclude them for pick-up and drop-off.
#
# Derived columns:
#   - PUMonth, PUDay, PUDayofweek, PUHour: extracted from the pick-up timestamp.
#   - trip_distance(km): trip distance converted to kilometres.
#   - time_difference(s): duration of the trip in seconds.
#   - is_weekend: whether the pick-up falls on a Saturday or Sunday.
#   - tips_given: whether the tip is greater than zero.

UNKNOWN_ZONES = [264, 265]
MILES_TO_KM = 1.60934

pickup_ts = F.col('tpep_pickup_datetime')
dropoff_ts = F.col('tpep_dropoff_datetime')

new_sdf_mdf = (
    new_sdf
    .dropna(how='any')
    .filter((F.year(pickup_ts) == 2019) & (F.year(dropoff_ts) == 2019))
    .filter((F.col('tip_amount') >= 0) & (F.col('tip_amount') < 250))
    .filter((F.col('trip_distance') > 0) & (F.col('trip_distance') < 100))
    .filter((F.col('fare_amount') >= 2.5) & (F.col('fare_amount') < 500))
    .filter(F.col('RatecodeID').isin(list(range(1, 7))))
    .filter(F.col('payment_type') == 1)
    .filter(F.col('passenger_count') != 0)
    .filter(
        (~F.col('PULocationID').isin(UNKNOWN_ZONES))
        & (~F.col('DOLocationID').isin(UNKNOWN_ZONES))
    )
    .withColumn('PUMonth', F.month(pickup_ts))
    .withColumn('PUDay', F.dayofmonth(pickup_ts))
    # F.dayofweek numbers days 1 = Sunday ... 7 = Saturday; this arithmetic
    # remaps them to 1 = Monday ... 7 = Sunday.
    .withColumn('PUDayofweek', (F.dayofweek(pickup_ts) + 5) % 7 + 1)
    .withColumn('PUHour', F.hour(pickup_ts))
    .withColumn('trip_distance(km)', F.col('trip_distance') * MILES_TO_KM)
    # Casting a timestamp to long gives epoch seconds, so the difference is in seconds.
    .withColumn('time_difference(s)', dropoff_ts.cast('long') - pickup_ts.cast('long'))
    # 1 and 7 are Sunday and Saturday in F.dayofweek's numbering.
    .withColumn('is_weekend', F.dayofweek(pickup_ts).isin([1, 7]))
    .withColumn('tips_given', F.col('tip_amount') > 0)
)
                     

In [10]:
# Per pick-up zone and calendar day: largest tip, average tip and number of trips.
group_keys = ["PULocationID", "PUMonth", "PUDay"]
tip_metrics = [
    F.max("tip_amount").alias("max_tip_amount_usd"),
    F.mean("tip_amount").alias("avg_tip_amount_usd"),
    F.count("PULocationID").alias("total_trip_count"),
]

aggregated_result1 = new_sdf_mdf.groupBy(*group_keys).agg(*tip_metrics)

aggregated_result1.show()

+------------+-------+-----+------------------+-------------------+----------------+
|PULocationID|PUMonth|PUDay|max_tip_amount_usd| avg_tip_amount_usd|total_trip_count|
+------------+-------+-----+------------------+-------------------+----------------+
|         191|      3|    2|              1.03|0.06058823529411765|              17|
|         106|      3|    2|              6.76| 2.0241935483870974|              31|
|         224|      3|    3|             17.15|   2.58779126213592|             412|
|          12|      3|    3|             11.06|  3.589666666666667|              30|
|         223|      3|    3|             14.64|  4.161764705882353|              51|
|          54|      3|    4|             13.76|              4.996|               5|
|         126|      3|    5|               0.0|                0.0|               6|
|         200|      3|    5|               0.0|                0.0|               5|
|         125|      3|    6|              23.0| 2.829563838223631

In [11]:
# Write out the aggregated results (grouped by pick-up location, month and day)
# as parquet; 'overwrite' mode replaces any existing output at this path.
aggregated_result1.write.mode('overwrite').parquet('../data/curated/aggregated_results1_2019')