# High-level Preprocessing Steps:
    1. Change column names and types
    2. Cleaning: date outside range, negative duration, duration > 18000s, null values
    3. Add Features from external dataset: hour, day, is_school_holiday, weather features (temperature, precipitation, wind, etc.)
    4. Check and deal with null values

    
    

In [1]:
from pyspark.sql.functions import to_timestamp, date_format, hour, dayofweek
from pyspark.sql.functions import isnan, when, count, col, split, concat, lit
from pyspark.sql.functions import to_date, create_map
from itertools import chain
from pyspark.sql import SparkSession, Window, functions as F
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("Preprocess Data")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

22/08/17 00:10:36 WARN Utils: Your hostname, Patricks-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 100.94.177.225 instead (on interface en0)
22/08/17 00:10:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/17 00:10:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# read datasets
taxi = spark.read.parquet('../data/raw/tlc_data')
weather = spark.read.csv('../data/raw/other_data/jfk_weather.csv', header=True)
school_holiday = spark.read.csv('../data/raw/other_data/nyc_school_holiday.csv',
                                header=True)
zones = pd.read_csv("../data/raw/taxi_zones/taxi+_zone_lookup.csv")

                                                                                

In [4]:
columns_interest = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 
                    'PULocationID', 'DOLocationID']
taxi = taxi[columns_interest]

weather_features = ['DATE', 'TMP', 'DEW', 'AA1', 'WND', 'VIS', 'MA1']
weather = weather[weather_features]

In [5]:
weather.count(), taxi.count()

                                                                                

(27136, 187469831)

In [6]:
weather.limit(5)

DATE,TMP,DEW,AA1,WND,VIS,MA1
2018-01-02T00:00:00,-781,-2061,,"300,1,N,0077,1",016000199,999999102761
2018-01-02T00:51:00,-835,-2005,1000095.0,"290,5,N,0067,5","016093,5,N,5",102885102795
2018-01-02T01:51:00,-835,-1895,1000095.0,"270,5,N,0062,5","016093,5,N,5",102885102795
2018-01-02T02:51:00,-835,-1675,1000095.0,"270,5,N,0062,5","016093,5,N,5",102885102795
2018-01-02T03:00:00,-831,-1671,,"270,1,N,0062,1",016000199,999999102761


In [7]:
taxi.limit(5)

                                                                                

tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,DOLocationID
2018-03-01 00:01:34,2018-03-01 00:01:43,145,145
2018-03-01 00:14:34,2018-03-01 00:28:13,151,244
2018-03-01 00:51:25,2018-03-01 00:59:54,238,152
2018-03-01 00:00:01,2018-03-01 00:00:17,145,145
2018-03-01 00:55:10,2018-03-01 00:56:36,145,145


In [8]:
taxi.printSchema()

root
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)



In [9]:
weather.printSchema()

root
 |-- DATE: string (nullable = true)
 |-- TMP: string (nullable = true)
 |-- DEW: string (nullable = true)
 |-- AA1: string (nullable = true)
 |-- WND: string (nullable = true)
 |-- VIS: string (nullable = true)
 |-- MA1: string (nullable = true)



In [10]:
# Taxi Dataset
# renaming a few columns
field_name_change = {"PULocationID": "pu_location_id", 
                     "DOLocationID": "do_location_id"}
for old, new in field_name_change.items():
    taxi = taxi.withColumnRenamed(old, new)
    
# converting columns type
for field in ('pu_location_id', 'do_location_id'):
    taxi = taxi.withColumn(
        field,
        F.col(field).cast('INT')
    )

In [11]:
# Weather Dataset

# change column names
field_name_change = {"DATE": "date_time", "TMP": "temperature",
                     "DEW": "dew_point", "AA1": "precipitation", 
                     "WND": "wind_direction", "MA1": "pressure", 
                     "VIS":"visibility"}
for old, new in field_name_change.items():
    weather = weather.withColumnRenamed(old, new)

# change column data types
weather = weather.withColumn('date_time', F.col("date_time").cast('TIMESTAMP'))

for field in ('temperature', 'dew_point'):
    weather = weather.withColumn(field, 
                                 concat(weather[field].substr(1, 1), 
                                        weather[field].substr(2, 4), 
                                        lit('.'),weather[field].substr(7, 1)) \
                                 .cast('double'))

for field in ('visibility', 'wind_direction'):
    weather = weather.withColumn(field, 
                                 split(weather[field], ',') \
                                 .getItem(0).cast('INT'))

weather = weather.withColumn('precipitation', 
                             split(weather['precipitation'], ',') \
                             .getItem(1).cast('INT'))

weather = weather.withColumn('pressure', 
                             split(weather['pressure'], ',') \
                             .getItem(2).cast('INT'))



In [12]:
weather.limit(5)

date_time,temperature,dew_point,precipitation,wind_direction,visibility,pressure
2018-01-02 00:00:00,-78.1,-206.1,,300,16000,10276
2018-01-02 00:51:00,-83.5,-200.5,0.0,290,16093,10279
2018-01-02 01:51:00,-83.5,-189.5,0.0,270,16093,10279
2018-01-02 02:51:00,-83.5,-167.5,0.0,270,16093,10279
2018-01-02 03:00:00,-83.1,-167.1,,270,16000,10276


---
# Weather Data Cleaning
#### 1. Change 9s to null

In [13]:
# found many entries where there are 9s in multiple columns
# documentation stated these are missing values
weather.filter(F.col('dew_point') == 9999.9).limit(5)

date_time,temperature,dew_point,precipitation,wind_direction,visibility,pressure
2018-01-02 04:59:00,9999.9,9999.9,0,999,999999,
2018-01-03 04:59:00,9999.9,9999.9,0,999,999999,
2018-01-04 04:59:00,9999.9,9999.9,0,999,999999,
2018-01-05 04:59:00,9999.9,9999.9,124,999,999999,
2018-01-06 04:59:00,9999.9,9999.9,0,999,999999,


In [14]:
# set features to null when equals to 9s
miss_values = {"precipitation": 9999, "temperature": 9999.9, 
               "dew_point": 9999.9, "visibility": 999999,
               "wind_direction": 999, "pressure": 99999.0}
for field, val in miss_values.items():
    weather = weather.withColumn(field, 
                             when(col(field) == val, None) \
                             .otherwise(col(field)))


#### 2. Fill null values by using the median of 6 hours timeframe segment

In [15]:
dict_null = {col:weather.filter(weather[col].isNull()).count() 
             for col in weather.columns}
dict_null

{'date_time': 0,
 'temperature': 774,
 'dew_point': 775,
 'precipitation': 6205,
 'wind_direction': 1878,
 'visibility': 754,
 'pressure': 960}

In [16]:
weather.count()

27136

In [17]:
weather = weather.withColumn(
    "date",
    to_date(col("date_time"),"yyyy-MM-dd")
)

weather = weather.withColumn('hour', hour(weather.date_time))

In [18]:
# partition 24 hours into group of 6 (4 hours each)
segment = {}
seg = 0
for h in range(0, 24):
    if h % 6 == 0:
        seg += 1
    segment[h] = seg

# map dictionary to a new column in weather dataframe
mapping_expr = create_map([lit(x) for x in chain(*segment.items())])

weather = weather.withColumn("time_segment", mapping_expr[col("hour")])

In [19]:
weather.columns

['date_time',
 'temperature',
 'dew_point',
 'precipitation',
 'wind_direction',
 'visibility',
 'pressure',
 'date',
 'hour',
 'time_segment']

In [20]:
window = Window.partitionBy(["date", "time_segment"]).orderBy("date")

fields = ['temperature', 'dew_point', 'precipitation', 
          'wind_direction', 'visibility', 'pressure']

for field in fields:
    weather = (
        weather.withColumn("median", F.expr(f'percentile_approx({field}, 0.5)').over(window)) \
            .withColumn(field, F.when(F.col(field).isNull(), 
                                              F.col("median")) \
                        .otherwise(F.col(field))).drop("median")
    )


In [21]:
dict_null = {col:weather.filter(weather[col].isNull()).count() 
             for col in weather.columns}
dict_null

{'date_time': 0,
 'temperature': 0,
 'dew_point': 0,
 'precipitation': 32,
 'wind_direction': 0,
 'visibility': 0,
 'pressure': 0,
 'date': 0,
 'hour': 0,
 'time_segment': 0}

In [22]:
weather.filter(weather['precipitation'].isNull()).show(40)

+-------------------+-----------+---------+-------------+--------------+----------+--------+----------+----+------------+
|          date_time|temperature|dew_point|precipitation|wind_direction|visibility|pressure|      date|hour|time_segment|
+-------------------+-----------+---------+-------------+--------------+----------+--------+----------+----+------------+
|2019-10-22 06:00:00|      122.1|    100.1|         null|            70|     16000|   10185|2019-10-22|   6|           2|
|2019-10-22 06:51:00|      122.1|    100.1|         null|            70|     16093|   10182|2019-10-22|   6|           2|
|2019-10-22 07:51:00|      122.1|     89.1|         null|            50|     16093|   10182|2019-10-22|   7|           2|
|2019-10-22 08:51:00|      128.1|     83.1|         null|            40|     16093|   10182|2019-10-22|   8|           2|
|2019-10-22 09:00:00|      128.1|     83.1|         null|            40|     16000|   10182|2019-10-22|   9|           2|
|2019-10-22 09:47:00|   

It appears that there are 2 wide time gaps where precipitation data are null. By the past weather data from https://www.timeanddate.com/weather/usa/new-york/historic?month=12&year=2019, it appear that there is no rain within these hours. Hence, we will change these null values to 0 mm precipitation.

In [23]:
weather = weather.withColumn("precipitation", F.when(F.col("precipitation").isNull(), 0) \
                        .otherwise(F.col("precipitation"))).drop("time_segment")


In [24]:
dict_null = {col:weather.filter(weather[col].isNull()).count() 
             for col in weather.columns}
dict_null

{'date_time': 0,
 'temperature': 0,
 'dew_point': 0,
 'precipitation': 0,
 'wind_direction': 0,
 'visibility': 0,
 'pressure': 0,
 'date': 0,
 'hour': 0}

---
# Taxi Data Cleaning
#### 1. Remove dates that are outside range

In [25]:
taxi = taxi.withColumn("tpep_pickup_datetime",
                   to_timestamp(col("tpep_pickup_datetime"))) \
         .withColumn("pu_year", date_format(col("tpep_pickup_datetime"), "y"))

In [26]:
taxi.agg({'tpep_pickup_datetime': 'max'}).show()
taxi.agg({'tpep_pickup_datetime': 'min'}).show()


                                                                                

+-------------------------+
|max(tpep_pickup_datetime)|
+-------------------------+
|      2090-12-31 06:41:26|
+-------------------------+





+-------------------------+
|min(tpep_pickup_datetime)|
+-------------------------+
|      2001-01-01 00:01:48|
+-------------------------+



                                                                                

In [27]:
taxi = taxi.filter(F.col('pu_year').isin([2018, 2019])).drop('pu_year')

#### 2. Remove if drop-off timestamp is < pick-up timestamp

In [28]:
# create new column to calculate trip duration in seconds
taxi = taxi.withColumn(
    'trip_duration',
    (col("tpep_dropoff_datetime").cast("long") - 
     col('tpep_pickup_datetime').cast("long")))

In [29]:
taxi.where((F.col('trip_duration') < 0)).limit(5)

                                                                                

tpep_pickup_datetime,tpep_dropoff_datetime,pu_location_id,do_location_id,trip_duration
2018-03-06 15:15:05,2018-03-05 16:05:01,70,48,-83404
2018-03-21 08:20:35,2018-03-21 07:40:23,239,230,-2412
2018-03-24 15:15:17,2018-03-20 14:03:49,74,75,-349888
2018-04-12 15:15:06,2018-04-06 16:14:50,138,239,-514816
2018-04-29 15:57:18,2018-04-29 14:50:55,142,66,-3983


In [30]:
taxi = taxi.where((F.col('trip_duration') > 0))

#### 3. Remove if duration is more than 1 day (do - pu > 18000)

In [31]:
taxi.where((F.col('trip_duration') > 18000)).limit(5)

                                                                                

tpep_pickup_datetime,tpep_dropoff_datetime,pu_location_id,do_location_id,trip_duration
2018-03-01 00:02:42,2018-03-01 23:52:20,48,143,85778
2018-03-01 00:05:51,2018-03-01 23:53:55,234,232,85684
2018-03-01 00:04:21,2018-03-01 23:28:33,43,107,84252
2018-03-01 00:34:36,2018-03-02 00:05:46,163,7,84670
2018-03-01 00:15:23,2018-03-01 23:39:52,114,79,84269


In [32]:
taxi = taxi.where((F.col('trip_duration') < 18000))

#### 4. Only select pick up location from JFK or LaGuardia Airport

In [33]:
zones[zones['service_zone'] == 'Airports']

Unnamed: 0,LocationID,Borough,Zone,service_zone
131,132,Queens,JFK Airport,Airports
137,138,Queens,LaGuardia Airport,Airports


In [34]:
taxi.where((F.col('pu_location_id').isin([132, 138])) | 
           (F.col('do_location_id').isin([132, 138]))).count()

                                                                                

13592895

In [35]:
taxi = taxi.where((F.col('pu_location_id').isin([132, 138])) | 
                  (F.col('do_location_id').isin([132, 138])))

#### 5. Check for null values

In [36]:
dict_null = {col:taxi.filter(taxi[col].isNull()).count() for col in taxi.columns}
dict_null

                                                                                

{'tpep_pickup_datetime': 0,
 'tpep_dropoff_datetime': 0,
 'pu_location_id': 0,
 'do_location_id': 0,
 'trip_duration': 0}

---
# Create new features

#### 1. From existing taxi dataset

In [37]:
# create new column pickup day of week
for abbr in ('pu', 'do'):
    if abbr == 'pu':
        long = "pickup"
    else:
        long = "dropoff"
    taxi = taxi.withColumn(f"tpep_{long}_datetime",
                       to_timestamp(col(f"tpep_{long}_datetime"))) \
             .withColumn(f"{abbr}_dow", dayofweek(col(f"tpep_{long}_datetime")))

    # create new column pickup hour
    taxi = taxi.withColumn(f'{abbr}_hour', hour(taxi[f'tpep_{long}_datetime']))



#### 2a. From external dataset (School Holidays)

In [38]:
sch_hol = pd.read_csv("../data/raw/other_data/nyc_school_holiday.csv", sep=";")
sch_hol['DATE'] = pd.to_datetime(sch_hol['DATE'], format='%d/%m/%y')
sch_hol_date = sch_hol['DATE'].dt.date.tolist()

# create helper column pickup date format
taxi = taxi.withColumn(
    "pu_date",
    to_date(col("tpep_pickup_datetime"),"yyyy-MM-dd")
)

# create new column to identify if that day is school holiday
taxi = taxi.withColumn(
    'is_school_holiday',
    F.when(
        (F.col('pu_date').isin(sch_hol_date)),
        1
    ).otherwise(0)
)


In [39]:
sch_hol.head()

Unnamed: 0,DATE,EVENT
0,2018-01-01,Winter Recess (Schools closed)
1,2018-01-15,Dr. Martin Luther King Jr. Day (schools closed)
2,2018-02-16,Lunar New Year (schools closed)
3,2018-02-19,Midwinter Recess (includes Washington’s Birthd...
4,2018-02-20,Midwinter Recess (includes Washington’s Birthd...


In [40]:
# check if 15-01-2018 is marked as holiday
taxi.filter(F.col('pu_date') == datetime(2018, 1, 15)).limit(5)



tpep_pickup_datetime,tpep_dropoff_datetime,pu_location_id,do_location_id,trip_duration,pu_dow,pu_hour,do_dow,do_hour,pu_date,is_school_holiday


#### 2b. From external dataset (Hourly Weather)

In [41]:
clean_weather = weather \
                    .groupBy(['date', 'hour']) \
                    .agg(
                        F.mean("temperature").alias("temperature"),
                        F.mean("dew_point").alias("dew_point"),
                        F.mean("precipitation").alias("precipitation"),
                        F.mean("wind_direction").alias("wind_direction"),
                        F.mean("visibility").alias("visibility"),
                        F.mean("pressure").alias("pressure"),
                    ) \
                    .orderBy(["date", 'hour'])

clean_weather.show()

+----------+----+-----------+---------+-------------+--------------+----------+--------+
|      date|hour|temperature|dew_point|precipitation|wind_direction|visibility|pressure|
+----------+----+-----------+---------+-------------+--------------+----------+--------+
|2018-01-02|   0|      -80.8|   -203.3|          0.0|         295.0|   16046.5| 10277.5|
|2018-01-02|   1|      -83.5|   -189.5|          0.0|         270.0|   16093.0| 10279.0|
|2018-01-02|   2|      -83.5|   -167.5|          0.0|         270.0|   16093.0| 10279.0|
|2018-01-02|   3|      -83.3|   -161.8|          0.0|         275.0|   16046.5| 10277.5|
|2018-01-02|   4|      -86.5|   -162.0|          0.0|         270.0|   16093.0| 10276.0|
|2018-01-02|   5|      -94.5|   -161.5|          0.0|         270.0|   16093.0| 10276.0|
|2018-01-02|   6|      -94.3|   -164.3|          0.0|         275.0|   16046.5| 10274.5|
|2018-01-02|   7|      -94.5|   -167.5|          0.0|         280.0|   16093.0| 10283.0|
|2018-01-02|   8|    

                                                                                

In [42]:
# merge taxi and weather dataset on date and hour
# remove taxi data if weather is not available (inner join)
sdf = (taxi \
           .join(clean_weather, on=[taxi['pu_date'] == clean_weather['date'], 
                    taxi['pu_hour'] == clean_weather['hour']], how='inner')
           .drop(clean_weather['date'])
           .drop(clean_weather['hour'])
      )

In [43]:
# taxi number of rows before merge vs after weather merge
(taxi.count(), sdf.count())

                                                                                

(13592895, 13573975)

In [44]:
# check if merge matches perfectly
clean_weather.filter((F.col('date') == datetime(2018, 3, 1)) & F.col('hour').isin([10, 11])).limit(2)

date,hour,temperature,dew_point,precipitation,wind_direction,visibility,pressure


In [45]:
taxi.filter(F.col('pu_date') == datetime(2018, 3, 1)).limit(3)



tpep_pickup_datetime,tpep_dropoff_datetime,pu_location_id,do_location_id,trip_duration,pu_dow,pu_hour,do_dow,do_hour,pu_date,is_school_holiday


In [46]:
sdf.filter(F.col('pu_date') == datetime(2018, 3, 1)).limit(3)

tpep_pickup_datetime,tpep_dropoff_datetime,pu_location_id,do_location_id,trip_duration,pu_dow,pu_hour,do_dow,do_hour,pu_date,is_school_holiday,temperature,dew_point,precipitation,wind_direction,visibility,pressure


---
### Random sampling data for visualization

In [47]:
SAMPLE_SIZE = 0.05
df = sdf.sample(SAMPLE_SIZE, seed=0).toPandas()
df.to_parquet('../data/curated/sample_data.parquet')

                                                                                

### Save final sdf to curated data folder

In [48]:
sdf.write.parquet("../data/curated/curated_sdf.parquet")

                                                                                

---
# Feature Engineering
#### 1. Check for feature correlations

#### 2. Feature Selection using AIC