This notebook preprocesses the following raw data for two periods, Dec 2019 to Feb 2020 and Dec 2021 to Feb 2022:
- NYC Yellow Taxi Data
- NYC High Volume For-Hire Vehicle (FHVHV) Data

In [29]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


# Build (or reuse) the Spark session that runs every job in this notebook
session_builder = SparkSession.builder.appName("MAST30034 Project 1")

# eager evaluation: a DataFrame left as the last expression of a cell is rendered directly
session_builder = session_builder.config("spark.sql.repl.eagerEval.enabled", True)

# keep Parquet schema metadata caching switched on
session_builder = session_builder.config("spark.sql.parquet.cacheMetadata", "true")

spark = session_builder.getOrCreate()

<h2><u>Preprocess TLC Data</u></h2>

In [77]:
# Folder that holds the raw monthly TLC parquet files
TLC_RAW_DIR = '../data/raw/NYC TLC Data'


def read_tlc_month(file_stem):
    """Read one raw monthly TLC parquet file into a Spark DataFrame.

    Parameters
    ----------
    file_stem : str
        File name without the '.parquet' extension,
        e.g. 'yellow_tripdata_2019-12'.

    Returns
    -------
    pyspark.sql.DataFrame
        The contents of '<TLC_RAW_DIR>/<file_stem>.parquet'.
    """
    # Parquet files embed their own schema, so CSV-style reader options
    # (such as header=True) are not needed here.
    return spark.read.parquet(f'{TLC_RAW_DIR}/{file_stem}.parquet')


# load yellow taxi data (one DataFrame per month)
yellow_2019_12_sdf = read_tlc_month('yellow_tripdata_2019-12')
yellow_2020_01_sdf = read_tlc_month('yellow_tripdata_2020-01')
yellow_2020_02_sdf = read_tlc_month('yellow_tripdata_2020-02')

yellow_2021_12_sdf = read_tlc_month('yellow_tripdata_2021-12')
yellow_2022_01_sdf = read_tlc_month('yellow_tripdata_2022-01')
yellow_2022_02_sdf = read_tlc_month('yellow_tripdata_2022-02')

# load high-volume FHV (rideshare) data (one DataFrame per month)
fhvhv_2019_12_sdf = read_tlc_month('fhvhv_tripdata_2019-12')
fhvhv_2020_01_sdf = read_tlc_month('fhvhv_tripdata_2020-01')
fhvhv_2020_02_sdf = read_tlc_month('fhvhv_tripdata_2020-02')

fhvhv_2021_12_sdf = read_tlc_month('fhvhv_tripdata_2021-12')
fhvhv_2022_01_sdf = read_tlc_month('fhvhv_tripdata_2022-01')
fhvhv_2022_02_sdf = read_tlc_month('fhvhv_tripdata_2022-02')

# merge datasets categorised by type and period
# unionByName matches columns by name rather than by position, so a change in
# column order between monthly files cannot silently misalign the data.
yellow_19_20_sdf = yellow_2019_12_sdf.unionByName(yellow_2020_01_sdf).unionByName(yellow_2020_02_sdf)
yellow_21_22_sdf = yellow_2021_12_sdf.unionByName(yellow_2022_01_sdf).unionByName(yellow_2022_02_sdf)

fhvhv_19_20_sdf = fhvhv_2019_12_sdf.unionByName(fhvhv_2020_01_sdf).unionByName(fhvhv_2020_02_sdf)
fhvhv_21_22_sdf = fhvhv_2021_12_sdf.unionByName(fhvhv_2022_01_sdf).unionByName(fhvhv_2022_02_sdf)

<h3>Feature engineering</h3>

**Yellow Taxis**

In [75]:
# trip duration in minutes
# Casting a timestamp to long gives seconds, so the difference of the two casts
# is the trip length in seconds. The subtraction must be parenthesised BEFORE
# dividing by 60; otherwise only the pickup timestamp is divided and the result
# is not a duration.
yellow_19_20_sdf = yellow_19_20_sdf.withColumn(
    'trip_time',
    (F.col('tpep_dropoff_datetime').cast('long') - F.col('tpep_pickup_datetime').cast('long')) / 60
)

# pickup day is weekend or not
# F.dayofweek numbers days 1 (Sunday) through 7 (Saturday)
yellow_19_20_sdf = yellow_19_20_sdf.withColumn(
    'is_weekend',
    F.dayofweek(F.col('tpep_pickup_datetime')).isin([1, 7])
)

# tip percentage over total amount paid
# NOTE(review): rows with total_amount == 0 presumably yield null here under
# Spark's default (non-ANSI) division semantics — confirm and filter downstream.
yellow_19_20_sdf = yellow_19_20_sdf.withColumn(
    'tip_percent',
    (F.col('tip_amount') / F.col('total_amount')) * 100
)

# drop unnecessary columns
yellow_19_20_sdf = yellow_19_20_sdf.drop(*['VendorID', 'RatecodeID', 'store_and_fwd_flag', 'extra', 'mta_tax', 'tolls_amount', 'improvement_surcharge', 'congestion_surcharge', 'airport_fee'])

**Rideshares**

In [78]:
# Columns that are not kept once the engineered features have been added
fhvhv_unused_columns = [
    'dispatching_base_num', 'originating_base_num', 'request_datetime', 'on_scene_datetime',
    'base_passenger_fare', 'tolls', 'bcf', 'sales_tax', 'congestion_surcharge', 'airport_fee',
    'shared_request_flag', 'shared_match_flag', 'access_a_ride_flag', 'wav_request_flag', 'wav_match_flag',
]

# total amount paid by passenger
# NOTE(review): bcf and congestion_surcharge are not part of this sum — confirm that is intended.
fhvhv_total_paid = (
    F.col('base_passenger_fare')
    + F.col('tolls')
    + F.col('sales_tax')
    + F.col('tips')
    + F.col('airport_fee')
)

# NOTE: this cell overwrites trip_time and drops columns it reads from, so it
# must not be re-run without first re-running the cell that loads the data.
fhvhv_19_20_sdf = (
    fhvhv_19_20_sdf
    # trip duration in minutes (assumes the raw trip_time is in seconds — TODO confirm)
    .withColumn('trip_time', F.col('trip_time') / 60)
    # a null airport_fee would make the whole sum null, so replace nulls with 0 first
    .fillna(0, subset=['airport_fee'])
    .withColumn('total_amount', fhvhv_total_paid)
    # weekend flag for the pickup day: F.dayofweek gives 1 for Sunday and 7 for Saturday
    .withColumn('is_weekend', F.dayofweek(F.col('pickup_datetime')).isin(1, 7))
    # tips as a percentage of the total amount paid
    .withColumn('tip_percent', (F.col('tips') / F.col('total_amount')) * 100)
    # drop unnecessary columns
    .drop(*fhvhv_unused_columns)
)