This notebook preprocesses the following raw data for the period Dec 2019 to Feb 2020 and Dec 2021 to Feb 2022:
- NYC Yellow Taxi Data 
- NYC For Hire Vehicles (FHV) Data

In [22]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import seaborn as sns

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .getOrCreate()
)

<h2><u>Preprocess TLC Data</u></h2>

In [32]:
# load yellow taxi data
yellow_2019_12_sdf = spark.read.parquet('../data/raw/NYC TLC Data/yellow_tripdata_2019-12.parquet', header=True)
yellow_2020_01_sdf = spark.read.parquet('../data/raw/NYC TLC Data/yellow_tripdata_2020-01.parquet', header=True)
yellow_2020_02_sdf = spark.read.parquet('../data/raw/NYC TLC Data/yellow_tripdata_2020-02.parquet', header=True)

yellow_2021_12_sdf = spark.read.parquet('../data/raw/NYC TLC Data/yellow_tripdata_2021-12.parquet', header=True)
yellow_2022_01_sdf = spark.read.parquet('../data/raw/NYC TLC Data/yellow_tripdata_2022-01.parquet', header=True)
yellow_2022_02_sdf = spark.read.parquet('../data/raw/NYC TLC Data/yellow_tripdata_2022-02.parquet', header=True)

# load FHV data
fhvhv_2019_12_sdf = spark.read.parquet('../data/raw/NYC TLC Data/fhvhv_tripdata_2019-12.parquet', header=True)
fhvhv_2020_01_sdf = spark.read.parquet('../data/raw/NYC TLC Data/fhvhv_tripdata_2020-01.parquet', header=True)
fhvhv_2020_02_sdf = spark.read.parquet('../data/raw/NYC TLC Data/fhvhv_tripdata_2020-02.parquet', header=True)

fhvhv_2021_12_sdf = spark.read.parquet('../data/raw/NYC TLC Data/fhvhv_tripdata_2021-12.parquet', header=True)
fhvhv_2022_01_sdf = spark.read.parquet('../data/raw/NYC TLC Data/fhvhv_tripdata_2022-01.parquet', header=True)
fhvhv_2022_02_sdf = spark.read.parquet('../data/raw/NYC TLC Data/fhvhv_tripdata_2022-02.parquet', header=True)

# merge datasets categorised by type and period
yellow_19_20_sdf = yellow_2019_12_sdf.unionAll(yellow_2020_01_sdf).unionAll(yellow_2020_02_sdf)
yellow_21_22_sdf = yellow_2021_12_sdf.unionAll(yellow_2022_01_sdf).unionAll(yellow_2022_02_sdf)

fhvhv_19_20_sdf = fhvhv_2019_12_sdf.unionAll(fhvhv_2020_01_sdf).unionAll(fhvhv_2020_02_sdf)
fhvhv_21_22_sdf = fhvhv_2021_12_sdf.unionAll(fhvhv_2022_01_sdf).unionAll(fhvhv_2022_02_sdf)

<h3>Feature engineering</h3>

**Yellow Taxis**

In [33]:
# trip duration in minutes
yellow_19_20_sdf = yellow_19_20_sdf.withColumn(
    'trip_time',
    (F.col('tpep_dropoff_datetime').cast('long') - F.col('tpep_pickup_datetime').cast('long'))/60
)

# pickup day is weekend or not
yellow_19_20_sdf = yellow_19_20_sdf.withColumn(
    'is_weekend',
    F.dayofweek(F.col('tpep_pickup_datetime')).isin([1, 7])
)

# tip percentage over total amount paid
yellow_19_20_sdf = yellow_19_20_sdf.withColumn(
    'tip_percent',
    (F.col('tip_amount') / F.col('total_amount')) * 100
)

**FHV**

In [48]:
# trip duration in mniutes
fhvhv_19_20_sdf = fhvhv_19_20_sdf.withColumn(
    'trip_time',
    F.col('trip_time')/60
)

# fill null values in airport_fee with 0
fhvhv_19_20_sdf = fhvhv_19_20_sdf.na.fill(value=0, subset=['airport_fee'])

# total amount paid by passenger
fhvhv_19_20_sdf = fhvhv_19_20_sdf.withColumn(
    'total_amount',
    F.col('base_passenger_fare') + F.col('tolls') + F.col('sales_tax') + F.col('tips') + F.col('airport_fee')
)

# pickup day is weekend or not
fhvhv_19_20_sdf = fhvhv_19_20_sdf.withColumn(
    'is_weekend',
    F.dayofweek(F.col('pickup_datetime')).isin([1, 7])
)

# tip percentage over total amount paid
fhvhv_19_20_sdf = fhvhv_19_20_sdf.withColumn(
    'tip_percent',
    (F.col('tips') / F.col('total_amount')) * 100
)

<h3>Outlier Detection</h3>

**Yellow Taxis**

In [44]:
print("Original data size: ", yellow_19_20_sdf.count())
print("Trip time cleaned size: ", yellow_19_20_sdf.where((F.col('trip_time') > 0) & (F.col('trip_time') < 300)).count())
print("Location ID cleaned size: ", yellow_19_20_sdf.where(((F.col('PULocationID') >= 1) & (F.col('PULocationID') <= 263))
                                                            | ((F.col('DOLocationID') >= 1) & (F.col('DOLocationID') <= 263))).count())

Original data size:  19600692


                                                                                

Trip time cleaned size:  19543652
Location ID cleaned size:  19506018


In [51]:
yellow_19_20_sdf_cleaned = yellow_19_20_sdf.where((F.col('trip_time') > 0) & (F.col('trip_time') < 300))
yellow_19_20_sdf_cleaned = yellow_19_20_sdf_cleaned.where(((F.col('PULocationID') >= 1) & (F.col('PULocationID') <= 263))
                                                            | ((F.col('DOLocationID') >= 1) & (F.col('DOLocationID') <= 263)))
print("Cleaned data size:", yellow_19_20_sdf_cleaned.count())     

                                                                                

19450894

**FHV**

In [50]:
print("Original data size: ", fhvhv_19_20_sdf.count())
print("Trip time cleaned size: ", fhvhv_19_20_sdf.where((F.col('trip_time') > 0) & (F.col('trip_time') < 300)).count())
print("Location ID cleaned size: ", fhvhv_19_20_sdf.where(((F.col('PULocationID') >= 1) & (F.col('PULocationID') <= 263))
                                                            | ((F.col('DOLocationID') >= 1) & (F.col('DOLocationID') <= 263))).count())

Original data size:  64538369


                                                                                

Trip time cleaned size:  64536919




Location ID cleaned size:  64536983


                                                                                

In [54]:
fhvhv_19_20_sdf_cleaned = fhvhv_19_20_sdf.where((F.col('trip_time') > 0) & (F.col('trip_time') < 300))
fhvhv_19_20_sdf_cleaned = fhvhv_19_20_sdf_cleaned.where(((F.col('PULocationID') >= 1) & (F.col('PULocationID') <= 263))
                                                            | ((F.col('DOLocationID') >= 1) & (F.col('DOLocationID') <= 263)))
print("Cleaned data size:", fhvhv_19_20_sdf_cleaned.count())



Cleaned data size: 64535536


                                                                                

<h3>Feature selection</h3>

In [62]:
yellow_19_20_sdf_final = yellow_19_20_sdf_cleaned.drop(*['VendorID', 'passenger_count', 'trip_distance', 'RatecodeID', 
                                                        'store_and_fwd_flag', 'payment_type', 'fare_amount', 'extra',
                                                        'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
                                                        'congestion_surcharge', 'airport_fee', 'trip_time'])
yellow_19_20_sdf_final

tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,DOLocationID,total_amount,is_weekend,tip_percent
2019-12-01 11:26:58,2019-12-01 11:41:45,142,116,18.3,True,0.0
2019-12-01 11:12:08,2019-12-01 11:12:14,145,145,3.8,True,0.0
2019-12-01 11:25:53,2019-12-01 11:26:04,145,145,3.8,True,0.0
2019-12-01 11:12:03,2019-12-01 11:33:19,138,25,39.8,True,25.12562814070352
2019-12-01 11:05:27,2019-12-01 11:16:32,161,237,12.8,True,0.0
2019-12-01 11:58:51,2019-12-01 12:08:37,161,230,10.3,True,0.0
2019-12-01 11:14:19,2019-12-01 11:27:06,164,163,13.8,True,0.0
2019-12-01 11:29:35,2019-12-01 11:32:29,79,224,9.35,True,16.577540106951876
2019-12-01 11:42:19,2019-12-01 11:50:34,79,107,11.3,True,0.0
2019-12-01 11:19:48,2019-12-01 11:24:18,148,4,11.15,True,16.591928251121075


In [61]:
fhvhv_19_20_sdf_final = fhvhv_19_20_sdf_cleaned.drop(*['hvfhs_license_num', 'dispatching_base_num', 'originating_base_num',
                                                        'request_datetime', 'on_scene_datetime', 'trip_miles', 'trip_time',
                                                        'base_passenger_fare', 'tolls', 'bcf', 'sales_tax', 'congestion_surcharge',
                                                        'airport_fee', 'tips', 'driver_pay', 'shared_request_flag', 'shared_match_flag',
                                                        'access_a_ride_flag', 'wav_request_flag', 'wav_match_flag'])
fhvhv_19_20_sdf_final

pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,total_amount,is_weekend,tip_percent
2019-12-01 11:16:23,2019-12-01 11:24:47,42,41,9.82,True,0.0
2019-12-01 11:36:01,2019-12-01 11:47:28,236,166,18.96,True,0.0
2019-12-01 11:54:48,2019-12-01 12:11:15,238,78,16.28,True,0.0
2019-12-01 11:48:19,2019-12-01 12:00:16,148,125,9.88,True,0.0
2019-12-01 11:08:55,2019-12-01 11:26:26,138,137,34.660000000000004,True,0.0
2019-12-01 11:32:33,2019-12-01 11:47:06,164,141,17.13,True,0.0
2019-12-01 11:18:15,2019-12-01 11:38:07,25,263,28.87,True,0.0
2019-12-01 11:40:46,2019-12-01 11:52:38,263,226,18.17,True,0.0
2019-12-01 11:12:55,2019-12-01 11:17:23,60,167,10.03,True,0.0
2019-12-01 11:02:23,2019-12-01 11:26:35,126,173,14.090000000000002,True,0.0
