## Feature engineering

In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from haversine import haversine
from pandas.tseries.holiday import USFederalHolidayCalendar
from datetime import datetime
%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
# Reading preprocessed, previously cleaned data
data = pd.read_csv(filepath_or_buffer='preprocessed_data.csv')
# Initial data dimensions
print(data.shape)
data.head()

(9733587, 19)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2016-03-01 00:00:00,2016-03-01 00:07:55,1,2.5,-73.976746,40.765152,1,0,-74.004265,40.746128,1,9.0,0.5,0.5,2.05,0.0,0.3,12.35
1,1,2016-03-01 00:00:00,2016-03-01 00:11:06,1,2.9,-73.983482,40.767925,1,0,-74.005943,40.733166,1,11.0,0.5,0.5,3.05,0.0,0.3,15.35
2,1,2016-03-01 00:00:01,2016-03-01 00:05:00,1,0.7,-73.958221,40.764641,1,0,-73.967896,40.762901,1,5.5,0.5,0.5,2.0,0.0,0.3,8.8
3,0,2016-03-01 00:00:01,2016-03-01 00:24:06,3,7.18,-73.985779,40.741192,1,0,-73.94635,40.797878,1,23.5,0.5,0.5,3.2,0.0,0.3,28.0
4,0,2016-03-01 00:00:01,2016-03-01 00:02:03,2,0.54,-73.988426,40.76416,1,0,-73.992393,40.758224,2,4.0,0.5,0.5,0.0,0.0,0.3,5.3


### Feature selection

Due to lack of relations between trip time and charges, the most reasonable features to consider are as follows:

In [3]:
# Choosed features:
data = data[['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count','store_and_fwd_flag', 
             'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']]
print(data.shape)
data.head()

(9733587, 9)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,store_and_fwd_flag,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,1,2016-03-01 00:00:00,2016-03-01 00:07:55,1,0,-73.976746,40.765152,-74.004265,40.746128
1,1,2016-03-01 00:00:00,2016-03-01 00:11:06,1,0,-73.983482,40.767925,-74.005943,40.733166
2,1,2016-03-01 00:00:01,2016-03-01 00:05:00,1,0,-73.958221,40.764641,-73.967896,40.762901
3,0,2016-03-01 00:00:01,2016-03-01 00:24:06,3,0,-73.985779,40.741192,-73.94635,40.797878
4,0,2016-03-01 00:00:01,2016-03-01 00:02:03,2,0,-73.988426,40.76416,-73.992393,40.758224


### Feature extraction

There are quite a few features we could extract from pickup datetime, since Timestamp is not convenient type to analize  or make conlusions. The traffic flow could depend on weekday, hour or even holidays. The same goes to coordinates, we could use to calculate distances using different metrics.

#### Time based information

In [4]:
# Convert string to datetime
data['tpep_pickup_datetime'] = pd.to_datetime(data['tpep_pickup_datetime'], format='%Y-%m-%d %H:%M:%S')
data['tpep_dropoff_datetime'] = pd.to_datetime(data['tpep_dropoff_datetime'], format='%Y-%m-%d %H:%M:%S')
# Disassemble pickup datetime
data['weekday'] = data.tpep_pickup_datetime.dt.weekday
#data.loc[data.tpep_pickup_datetime.dt.weekday < 5, 'is_weekend'] = 0
#data.loc[data.tpep_pickup_datetime.dt.weekday >= 5, 'is_weekday'] = 0
#data.loc[data.tpep_pickup_datetime.dt.weekday >= 5, 'is_weekend'] = 1
data['minute_of_the_day'] = data.tpep_pickup_datetime.dt.hour * 60 + data.tpep_pickup_datetime.dt.minute

# Calculate trip time
data['trip_duration'] = (data.tpep_dropoff_datetime - data.tpep_pickup_datetime).dt.total_seconds()
# Clear trips that take more than 24 hours (86,400s)
data = data.loc[data.trip_duration < 86400]

# Delete datetime column
#data = data.drop('tpep_pickup_datetime', axis=1)
data = data.drop('tpep_dropoff_datetime', axis=1)

data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,passenger_count,store_and_fwd_flag,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,weekday,minute_of_the_day,trip_duration
0,1,2016-03-01 00:00:00,1,0,-73.976746,40.765152,-74.004265,40.746128,1,0,475.0
1,1,2016-03-01 00:00:00,1,0,-73.983482,40.767925,-74.005943,40.733166,1,0,666.0
2,1,2016-03-01 00:00:01,1,0,-73.958221,40.764641,-73.967896,40.762901,1,0,299.0
3,0,2016-03-01 00:00:01,3,0,-73.985779,40.741192,-73.94635,40.797878,1,0,1445.0
4,0,2016-03-01 00:00:01,2,0,-73.988426,40.76416,-73.992393,40.758224,1,0,122.0


In [5]:
# Check if holidays
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start=datetime(2016, 3, 1), end=datetime(2016, 4, 1))

data['holiday'] = (data['tpep_pickup_datetime'].dt.date.astype('datetime64[ns]').isin(holidays)).astype(int)

In [6]:
np.sum(data['holiday'])

0

No holidays on March!

In [7]:
# No need for holidays column
data = data.drop('holiday', axis=1)

#### Getting directions and traffic information

In [8]:
# Calculate directions
data['lat_diff'] = data.dropoff_latitude - data.pickup_latitude
data['lon_diff'] = data.dropoff_longitude - data.dropoff_longitude
data['angle'] = 180 * np.arctan2(data.lat_diff, data.lon_diff) / np.pi

In [9]:
def hav_dist(coor):
    dist = haversine((coor[0], coor[1]), (coor[2], coor[3]))
    return dist


# Calculate L1, L2 and Haversine distances
data['dist_l1'] = np.abs(data.pickup_latitude - data.dropoff_latitude) + np.abs(data.pickup_longitude - data.dropoff_longitude)
data['dist_l2'] = np.sqrt((data.pickup_latitude - data.dropoff_latitude)**2 + (data.pickup_longitude - data.dropoff_longitude)**2)

coordinates = data[['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']].values
data['hav_dist'] = list(map(lambda c: hav_dist(c), coordinates))

In [10]:
# Getting days and hours from pickup_datetime
data['day'] = data.tpep_pickup_datetime.dt.day
data['hour'] = data.tpep_pickup_datetime.dt.hour

# Determine number of trips on each day and on each hour
traffic =  data[['day', 'hour']]
traffic_day = traffic.groupby('day')['day'].count()
traffic_hour = traffic.groupby('hour')['hour'].count()

In [11]:
# save this feature to training datasets
data['daily_count'] = data['day'].apply(lambda day: traffic_day[day])
data['hourly_count'] = data['hour'].apply(lambda hour: traffic_hour[hour])

Split dataset to training and testing, since we don't know trip duration for the testing data and estimating time on haversine distance using training data

In [12]:
# Getting X and y
y = data['trip_duration']
X = data.drop('trip_duration', axis=1)

# Perform log transformation on target since it has skewed distribution
# according to the data exploration step
y = np.log(y+1)

# Shuffle and split to training and testing sets
X, y = shuffle(X, y, random_state=19)
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]
del data

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((8760227, 20), (8760227,), (973359, 20), (973359,))

In [13]:
X_train['hav_speed'] = X_train.hav_dist / y_train
hourly_speed = X_train.groupby('hour')['hav_speed'].mean()
hourly_speed_mean = X_train.hav_speed.mean()

X_train_speed = X_train['hour'].apply(lambda hour: hourly_speed[hour])
X_test_speed = X_test['hour'].apply(lambda hour: hourly_speed[hour] if hour in hourly_speed else hourly_speed_mean)
# Estimated time by Haversine distance
X_train['hav_time'] = X_train.hav_dist / X_train_speed
X_test['hav_time'] = X_test.hav_dist / X_test_speed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [14]:
# Drop unnecessary columns
X_train = X_train.drop('hav_speed', axis=1)
X_train = X_train.drop('day', axis=1)
X_train = X_train.drop('hour', axis=1)
X_test = X_test.drop('day', axis=1)
X_test = X_test.drop('hour', axis=1)
X_train = X_train.drop('tpep_pickup_datetime', axis=1)
X_test = X_test.drop('tpep_pickup_datetime', axis=1)

#### Saving data

In [15]:
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

In [16]:
# Creating header
columns = ['trip_duration']
y_train = y_train[columns]
y_test = y_test[columns]

In [17]:
X_train.to_csv('x_train.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
X_test.to_csv('x_test.csv', index=False)
y_test.to_csv('y_test.csv', index=False)