# High-level Preprocessing Steps:
    1. Change column names
    2. check for null values
    3. create features: day, hour, is_school_holiday, is_not_working_day, precipitation, snow_depth, is_airport

In [1]:
from pyspark.sql.functions import to_timestamp, date_format, hour
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql import SparkSession, functions as F

In [2]:
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("Preprocess Data")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/14 01:30:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
sdf = spark.read.parquet('../data/raw/tlc_data')

                                                                                

In [4]:
sdf.show(1, vertical=True, truncate=100)

-RECORD 0------------------------------------
 VendorID              | 1                   
 tpep_pickup_datetime  | 2019-03-01 11:24:41 
 tpep_dropoff_datetime | 2019-03-01 11:25:31 
 passenger_count       | 1.0                 
 trip_distance         | 0.0                 
 RatecodeID            | 1.0                 
 store_and_fwd_flag    | N                   
 PULocationID          | 145                 
 DOLocationID          | 145                 
 payment_type          | 2                   
 fare_amount           | 2.5                 
 extra                 | 0.5                 
 mta_tax               | 0.5                 
 tip_amount            | 0.0                 
 tolls_amount          | 0.0                 
 improvement_surcharge | 0.3                 
 total_amount          | 3.8                 
 congestion_surcharge  | 0.0                 
 airport_fee           | null                
only showing top 1 row



                                                                                

In [5]:
print(f"Number of Rows: {sdf.count()}")



Number of Rows: 84598444


                                                                                

In [4]:
# renaming a few columns
field_name_change = {"VendorID": "vendor_id", "RatecodeID": "ratecode_id", 
                      "PULocationID": "pu_location_id", 
                      "DOLocationID": "do_location_id"}
for old, new in field_name_change.items():
    sdf = sdf.withColumnRenamed(old, new)
    
# converting a couple columns to integers and saving it
for field in ('pu_location_id', 'do_location_id', 'vendor_id', 'payment_type'):
    sdf = sdf.withColumn(
        field,
        F.col(field).cast('INT')
    )
    
sdf = sdf.withColumn(
    'store_and_fwd_flag',
    (F.col("store_and_fwd_flag") == 'Y').cast('BOOLEAN')
)
    
sdf.printSchema()

root
 |-- vendor_id: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- ratecode_id: double (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = true)
 |-- pu_location_id: integer (nullable = true)
 |-- do_location_id: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: integer (nullable = true)



# Null values

In [7]:
sdf.columns

['vendor_id',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'ratecode_id',
 'store_and_fwd_flag',
 'pu_location_id',
 'do_location_id',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge',
 'airport_fee']

In [8]:
Dict_Null = {col:sdf.filter(sdf[col].isNull()).count() for col in sdf.columns}
Dict_Null

                                                                                

{'vendor_id': 0,
 'tpep_pickup_datetime': 0,
 'tpep_dropoff_datetime': 0,
 'passenger_count': 444383,
 'trip_distance': 0,
 'ratecode_id': 444383,
 'store_and_fwd_flag': 444383,
 'pu_location_id': 0,
 'do_location_id': 0,
 'payment_type': 0,
 'fare_amount': 0,
 'extra': 0,
 'mta_tax': 0,
 'tip_amount': 0,
 'tolls_amount': 0,
 'improvement_surcharge': 0,
 'total_amount': 0,
 'congestion_surcharge': 5300601,
 'airport_fee': 84598444}

In [11]:
sdf.filter(F.col('ratecode_id').isNull()).limit(5)

                                                                                

vendor_id,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,ratecode_id,store_and_fwd_flag,pu_location_id,do_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
2,2019-03-01 11:10:00,2019-03-01 11:49:00,,6.71,,,137,17,0,39.45,2.75,0.5,0.0,0.0,0.3,43.0,,
2,2019-03-01 11:16:40,2019-03-01 11:37:31,,6.93,,,146,42,0,36.45,2.75,0.0,0.0,0.0,0.3,39.5,,
2,2019-03-01 11:15:00,2019-03-01 11:50:00,,13.22,,,230,32,0,56.45,2.75,0.5,0.0,0.0,0.3,60.0,,
2,2019-03-01 11:02:20,2019-03-01 11:33:05,,7.57,,,226,63,0,54.25,2.75,0.0,0.0,0.0,0.3,57.3,,
2,2019-03-01 11:24:16,2019-03-01 11:40:30,,6.24,,,230,41,0,32.95,2.75,0.0,0.0,0.0,0.3,36.0,,


## Feature Engineering

In [5]:
# create new column pickup day
sdf = sdf.withColumn("tpep_pickup_datetime",
                   to_timestamp(col("tpep_pickup_datetime"))) \
         .withColumn("pu_day", date_format(col("tpep_pickup_datetime"), "d"))

# create new column pickup month
sdf = sdf.withColumn("tpep_pickup_datetime",
                   to_timestamp(col("tpep_pickup_datetime"))) \
         .withColumn("pu_month", date_format(col("tpep_pickup_datetime"), "m"))

# create new column pickup hour
sdf = sdf.withColumn('pu_hour', hour(sdf.tpep_pickup_datetime))

# create new helper column to get day of the week
sdf = sdf.withColumn("tpep_pickup_datetime",
                   to_timestamp(col("tpep_pickup_datetime"))) \
         .withColumn("weekday_abb", date_format(col("tpep_pickup_datetime"), 
                                                   "E"))
# create new column to identify weekends
sdf = sdf.withColumn(
    'is_weekend',
    # when we have non-zero distance/passenger/total amount then True
    # else False
    F.when(
        (F.col('weekday_abb').isin(["Sun", "Sat"])),
        1
    ).otherwise(0)
)
sdf = sdf.drop(col("weekday_abb"))

# create new column to identify if trip is during weekday peak hours
sdf = sdf.withColumn(
    'is_weekday_peak_hour',
    # when we have non-zero distance/passenger/total amount then True
    # else False
    F.when(
        (F.col('is_weekend') == 0)
        & (F.col('pu_hour').isin([6, 7, 8, 9, 10,
                                  16, 17, 18, 19, 20])),
        1
    ).otherwise(0)
)

# create new column to calculate trip duration in seconds
sdf = sdf.withColumn(
    'trip_duration',
    (col("tpep_dropoff_datetime").cast("long") - 
     col('tpep_pickup_datetime').cast("long")))

         

In [6]:
import pandas as pd
from datetime import date
from pyspark.sql.functions import to_date

sch_hol = pd.read_csv("../data/raw/other_data/nyc_school_holiday.csv", sep=";")
sch_hol['DATE'] = pd.to_datetime(sch_hol['DATE'], format='%d/%m/%y')
sch_hol_date = sch_hol['DATE'].dt.date.tolist()

# create helper column pickup date format
sdf = sdf.withColumn(
    "pu_date",
    to_date(col("tpep_pickup_datetime"),"yyyy-MM-dd")
)

# create new column to identify if that day is school holiday
sdf = sdf.withColumn(
    'is_school_holiday',
    # when we have non-zero distance/passenger/total amount then True
    # else False
    F.when(
        (F.col('pu_date').isin(sch_hol_date)),
        1
    ).otherwise(0)
)

sdf = sdf.drop(col("pu_date"))

In [None]:
sdf.filter(F.col('pu_date') == date(2019, 1, 20)).limit(5)

                                                                                

vendor_id,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,ratecode_id,store_and_fwd_flag,pu_location_id,do_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pu_day,pu_hour,is_weekend,is_weekday_peak_hour,trip_duration,pu_month,pu_date,is_school_holiday
2,2019-01-20 09:33:34,2019-01-20 11:09:46,1.0,0.0,5.0,False,68,10,1,35.0,0.0,0.0,7.0,0.0,0.0,42.0,,,20,9,1,0,5772,33,2019-01-20,0
2,2019-01-20 00:03:16,2019-01-20 00:18:22,1.0,4.08,1.0,False,79,263,1,14.0,0.0,0.5,2.96,0.0,0.3,17.76,,,20,0,1,0,906,3,2019-01-20,0
2,2019-01-20 00:01:46,2019-01-20 00:05:03,1.0,0.62,1.0,False,237,237,2,4.5,0.0,0.5,0.0,0.0,0.3,5.3,,,20,0,1,0,197,1,2019-01-20,0
2,2019-01-20 00:00:35,2019-01-20 00:32:44,1.0,2.8,1.0,False,186,239,2,20.0,0.0,0.5,0.0,0.0,0.3,20.8,,,20,0,1,0,1929,0,2019-01-20,0
2,2019-01-20 00:01:24,2019-01-20 00:39:17,2.0,4.61,1.0,False,246,263,1,24.5,0.0,0.5,6.32,0.0,0.3,31.62,,,20,0,1,0,2273,1,2019-01-20,0


[Stage 76:>                                                        (0 + 4) / 12]

In [13]:
sdf.filter((F.col('is_weekend') == 0) & (F.col('pu_hour')==7)).limit(5)

                                                                                

vendor_id,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,ratecode_id,store_and_fwd_flag,pu_location_id,do_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pu_day,pu_hour,is_weekend,is_weekday_peak_hour,trip_duration
2,2019-03-01 07:43:33,2019-03-01 08:03:06,1.0,5.72,1.0,False,61,67,2,19.0,0.5,0.5,0.0,0.0,0.3,20.3,0.0,,1,7,0,1,1173
2,2019-03-01 07:49:10,2019-03-01 07:56:25,2.0,1.49,1.0,False,234,114,1,7.5,0.5,0.5,2.26,0.0,0.3,13.56,2.5,,1,7,0,1,435
2,2019-03-01 07:35:00,2019-03-02 06:52:08,1.0,10.8,1.0,False,132,121,2,30.0,0.5,0.5,0.0,0.0,0.3,31.3,0.0,,1,7,0,1,83828
2,2019-03-01 07:31:01,2019-03-01 07:47:21,1.0,7.65,1.0,False,132,76,1,22.5,0.5,0.5,4.76,0.0,0.3,28.56,0.0,,1,7,0,1,980
2,2019-03-01 07:26:59,2019-03-01 07:34:25,1.0,31.03,2.0,False,132,246,1,52.0,0.0,0.5,6.11,5.76,0.3,67.17,2.5,,1,7,0,1,446


### Get 1.5% sample size

In [7]:
SAMPLE_SIZE = 0.015
df = sdf.sample(SAMPLE_SIZE, seed=0).toPandas()
df.to_csv('../data/raw/other_data/sample_data.csv', index=False)

22/08/14 01:30:46 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                