# High-level Preprocessing Steps:
    1. Change column names
    2. Check for null values
    3. Create features: day, hour, is_school_holiday, is_not_working_day, precipitation, snow_depth, is_airport

In [43]:
from pyspark.sql.functions import to_timestamp, date_format, hour
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql import SparkSession, functions as F

In [2]:
# Build (or reuse) the Spark session that runs every job in this notebook.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
}
session_builder = SparkSession.builder.appName("Preprocess Data")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/06 17:00:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [39]:
# Load the raw TLC trip records from the parquet directory.
sdf = spark.read.format("parquet").load('../data/raw/tlc_data')

In [34]:
# Print the first record vertically (one field per line) to inspect the columns.
sdf.show(n=1, truncate=100, vertical=True)

-RECORD 0------------------------------------
 VendorID              | 1                   
 tpep_pickup_datetime  | 2022-03-01 11:13:08 
 tpep_dropoff_datetime | 2022-03-01 11:24:35 
 passenger_count       | 1.0                 
 trip_distance         | 2.4                 
 RatecodeID            | 1.0                 
 store_and_fwd_flag    | N                   
 PULocationID          | 90                  
 DOLocationID          | 209                 
 payment_type          | 2                   
 fare_amount           | 10.0                
 extra                 | 3.0                 
 mta_tax               | 0.5                 
 tip_amount            | 0.0                 
 tolls_amount          | 0.0                 
 improvement_surcharge | 0.3                 
 total_amount          | 13.8                
 congestion_surcharge  | 2.5                 
 airport_fee           | 0.0                 
only showing top 1 row



In [35]:
# Report how many trip records were loaded.
row_count = sdf.count()
print(f"Number of Rows: {row_count}")

Number of Rows: 19358482


In [36]:
# Rename the raw TLC identifier columns to snake_case.
field_name_change = {
    "VendorID": "vendor_id",
    "RatecodeID": "ratecode_id",
    "PULocationID": "pu_location_id",
    "DOLocationID": "do_location_id",
}
for raw_name, snake_name in field_name_change.items():
    sdf = sdf.withColumnRenamed(raw_name, snake_name)

# Cast the identifier-like columns to integers, overwriting each in place.
integer_fields = ('pu_location_id', 'do_location_id', 'vendor_id', 'payment_type')
for name in integer_fields:
    sdf = sdf.withColumn(name, sdf[name].cast('INT'))

# Replace the store-and-forward flag with a boolean that is True for 'Y'.
is_store_and_fwd = F.col("store_and_fwd_flag") == 'Y'
sdf = sdf.withColumn('store_and_fwd_flag', is_store_and_fwd.cast('BOOLEAN'))

# Show the resulting column names and types.
sdf.printSchema()

root
 |-- vendor_id: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- ratecode_id: double (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = true)
 |-- pu_location_id: integer (nullable = true)
 |-- do_location_id: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



# Null values

In [7]:
# Fraction of the rows to draw into the pandas sample (seeded for repeatability).
SAMPLE_SIZE = 0.05
sampled_sdf = sdf.sample(fraction=SAMPLE_SIZE, seed=0)
df = sampled_sdf.toPandas()

                                                                                

In [8]:
# Persist the pandas sample as CSV, leaving out the pandas index column.
df.to_csv(path_or_buf='../data/raw/other_data/sample_data.csv', index=False)

In [9]:
# List the current column names of the Spark DataFrame.
sdf.columns

['vendor_id',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'ratecode_id',
 'store_and_fwd_flag',
 'pu_location_id',
 'do_location_id',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge',
 'airport_fee']

In [12]:
# Count the nulls of every column in ONE aggregation pass. The previous
# per-column filter().count() launched a separate Spark job (a full scan of
# the data) for each column.
# F.when(...) without otherwise() yields null for non-matching rows, and
# F.count() ignores nulls, so each aggregate counts only the null rows.
null_counts_row = sdf.select(
    [F.count(F.when(sdf[name].isNull(), 1)).alias(name) for name in sdf.columns]
).first()

# Mapping of column name -> number of null values, in column order.
Dict_Null = null_counts_row.asDict()
Dict_Null

                                                                                

{'vendor_id': 0,
 'tpep_pickup_datetime': 0,
 'tpep_dropoff_datetime': 0,
 'passenger_count': 639669,
 'trip_distance': 0,
 'ratecode_id': 639669,
 'store_and_fwd_flag': 639669,
 'pu_location_id': 0,
 'do_location_id': 0,
 'payment_type': 0,
 'fare_amount': 0,
 'extra': 0,
 'mta_tax': 0,
 'tip_amount': 0,
 'tolls_amount': 0,
 'improvement_surcharge': 0,
 'total_amount': 0,
 'congestion_surcharge': 639669,
 'airport_fee': 639669}

In [15]:
# Inspect a handful of the rows that have no rate code.
missing_ratecode = F.col('ratecode_id').isNull()
sdf.filter(missing_ratecode).limit(5)

                                                                                

vendor_id,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,ratecode_id,store_and_fwd_flag,pu_location_id,do_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
2,2022-03-01 11:30:00,2022-03-01 11:47:00,,4.83,,,232,37,0,16.61,0.0,0.5,4.33,0.0,0.3,24.24,,
2,2022-03-01 11:14:42,2022-03-01 11:30:09,,5.6,,,231,236,0,22.19,0.0,0.5,5.55,0.0,0.3,31.04,,
1,2022-03-01 11:10:21,2022-03-01 11:32:29,,12.9,,,138,13,0,36.0,1.75,0.5,6.16,0.0,0.3,47.21,,
6,2022-03-01 11:03:08,2022-03-01 11:03:53,,8.03,,,265,69,0,31.2,0.0,0.5,0.0,0.0,0.3,32.0,,
2,2022-03-01 11:12:42,2022-03-01 11:14:38,,0.33,,,164,170,0,8.8,0.0,0.5,2.57,0.0,0.3,14.67,,


## Feature Engineering

In [71]:
# Pickup hours (24h clock) that count as peak time on weekdays.
PEAK_HOURS = [6, 7, 8, 9, 10, 16, 17, 18, 19, 20]
# F.dayofweek numbers the days 1 (Sunday) through 7 (Saturday).
WEEKEND_DAYS = [1, 7]

# The pickup/dropoff columns are already timestamps (see the printed schema),
# so they are used directly without another to_timestamp() conversion.
pickup_ts = F.col("tpep_pickup_datetime")
dropoff_ts = F.col("tpep_dropoff_datetime")

# 1 when the pickup falls on a Saturday or Sunday, else 0.
is_weekend_flag = F.when(F.dayofweek(pickup_ts).isin(WEEKEND_DAYS), 1).otherwise(0)

# 1 when the pickup is on a weekday AND within one of the peak hours, else 0.
is_weekday_peak_flag = F.when(
    (F.col("is_weekend") == 0) & F.col("pu_hour").isin(PEAK_HOURS),
    1
).otherwise(0)

sdf = (
    sdf
    # Day of the month of the pickup.
    # NOTE(review): date_format returns a string, so pu_day is a string column
    # while pu_hour is an integer; F.dayofmonth would give an integer. Kept
    # as-is so that any downstream use of pu_day is unaffected.
    .withColumn("pu_day", F.date_format(pickup_ts, "d"))
    # Hour of the day (0-23) of the pickup.
    .withColumn("pu_hour", F.hour(pickup_ts))
    .withColumn("is_weekend", is_weekend_flag)
    .withColumn("is_weekday_peak_hour", is_weekday_peak_flag)
    # Trip duration in seconds: casting a timestamp to long gives epoch seconds.
    .withColumn("trip_duration", dropoff_ts.cast("long") - pickup_ts.cast("long"))
)

In [73]:
# Preview a few weekday trips whose pickup hour is 7.
on_weekday = F.col('is_weekend') == 0
picked_up_at_seven = F.col('pu_hour') == 7
sdf.filter(on_weekday & picked_up_at_seven).limit(5)

                                                                                

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pu_day,pu_hour,is_weekend,trip_duration,is_weekday_peak_hour
2,2022-03-02 07:01:11,2022-03-02 07:06:37,1.0,1.18,1.0,N,239,238,1,6.0,1.0,0.5,2.58,0.0,0.3,12.88,2.5,0.0,2,7,0,326,1
2,2022-03-02 07:01:40,2022-03-02 07:17:10,2.0,7.47,1.0,N,132,135,1,22.5,1.0,0.5,0.0,0.0,0.3,25.55,0.0,1.25,2,7,0,930,1
2,2022-03-02 07:00:25,2022-03-02 07:06:30,1.0,1.03,1.0,N,161,237,1,6.5,1.0,0.5,1.5,0.0,0.3,12.3,2.5,0.0,2,7,0,365,1
2,2022-03-02 07:07:40,2022-03-02 07:30:01,1.0,11.64,1.0,N,70,220,2,33.0,1.0,0.5,0.0,6.55,0.3,42.6,0.0,1.25,2,7,0,1341,1
2,2022-03-02 07:00:06,2022-03-02 07:07:38,1.0,0.9,1.0,N,237,161,1,6.5,1.0,0.5,2.16,0.0,0.3,12.96,2.5,0.0,2,7,0,452,1
