In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
data = pd.read_csv("data/taxi_final.csv")

## Additional Cleaning 

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,duration
0,0,CMT,2014-01-09 20:45:25,2014-01-09 20:52:31,1,0.7,-73.99477,40.736828,-73.982227,40.73179,CRD,6.5,0.5,0.5,1.4,0.0,8.9,426
1,1,CMT,2014-01-09 20:46:12,2014-01-09 20:55:12,1,1.4,-73.982392,40.773382,-73.960449,40.763995,CRD,8.5,0.5,0.5,1.9,0.0,11.4,540
2,2,CMT,2014-01-09 20:44:47,2014-01-09 20:59:46,2,2.3,-73.98857,40.739406,-73.986626,40.765217,CRD,11.5,0.5,0.5,1.5,0.0,14.0,899
3,3,CMT,2014-01-09 20:44:57,2014-01-09 20:51:40,1,1.7,-73.960213,40.770464,-73.979863,40.77705,CRD,7.5,0.5,0.5,1.7,0.0,10.2,403
4,4,CMT,2014-01-09 20:47:09,2014-01-09 20:53:32,1,0.9,-73.995371,40.717248,-73.984367,40.720524,CRD,6.0,0.5,0.5,1.75,0.0,8.75,383


In [4]:
del data["Unnamed: 0"]

In [6]:
data.shape

(6564770, 17)

## Add time of day column

In [9]:
data['pickup_datetime'] =  pd.to_datetime(data['pickup_datetime'], format='%Y-%m-%d %H:%M:%S.%f')

In [10]:
data["hour"]= data.pickup_datetime.apply(lambda x: x.hour)

In [11]:
data[data.duration >=3600].shape

(701, 18)

The ones that are longer than an hour are so insignificant so its fair to assume that pickup time of day will be the same as dropoff

In [12]:
data.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,duration,hour
0,CMT,2014-01-09 20:45:25,2014-01-09 20:52:31,1,0.7,-73.99477,40.736828,-73.982227,40.73179,CRD,6.5,0.5,0.5,1.4,0.0,8.9,426,20
1,CMT,2014-01-09 20:46:12,2014-01-09 20:55:12,1,1.4,-73.982392,40.773382,-73.960449,40.763995,CRD,8.5,0.5,0.5,1.9,0.0,11.4,540,20
2,CMT,2014-01-09 20:44:47,2014-01-09 20:59:46,2,2.3,-73.98857,40.739406,-73.986626,40.765217,CRD,11.5,0.5,0.5,1.5,0.0,14.0,899,20
3,CMT,2014-01-09 20:44:57,2014-01-09 20:51:40,1,1.7,-73.960213,40.770464,-73.979863,40.77705,CRD,7.5,0.5,0.5,1.7,0.0,10.2,403,20
4,CMT,2014-01-09 20:47:09,2014-01-09 20:53:32,1,0.9,-73.995371,40.717248,-73.984367,40.720524,CRD,6.0,0.5,0.5,1.75,0.0,8.75,383,20


In [13]:
b = [0,4,8,12,16,20,24]
l = ['Late Night', 'Early Morning','Morning','Noon','Eve','Night']

In [23]:
data["time_of_day"] = pd.cut(data["hour"], bins=b, labels=l)

In [24]:
data.time_of_day.fillna("Late Night", inplace=True)

In [25]:
data.isnull().any(axis=0)

vendor_id            False
pickup_datetime      False
dropoff_datetime     False
passenger_count      False
trip_distance        False
pickup_longitude     False
pickup_latitude      False
dropoff_longitude    False
dropoff_latitude     False
payment_type         False
fare_amount          False
surcharge            False
mta_tax              False
tip_amount           False
tolls_amount         False
total_amount         False
duration             False
hour                 False
time_of_day          False
dtype: bool

In [26]:
def time_to_numeric(x):
    if x=='Late Night':
        return 6
    if x=='Early Morning':
        return 1
    if x=='Morning':
        return 2
    if x=='Noon':
        return 3
    if x=='Eve':
        return 4
    if x=='Night':
        return 5

In [27]:
data["time_of_day"] = data["time_of_day"].apply(time_to_numeric)

In [28]:
data.head(20)

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,duration,hour,time_of_day
0,CMT,2014-01-09 20:45:25,2014-01-09 20:52:31,1,0.7,-73.99477,40.736828,-73.982227,40.73179,CRD,6.5,0.5,0.5,1.4,0.0,8.9,426,20,4
1,CMT,2014-01-09 20:46:12,2014-01-09 20:55:12,1,1.4,-73.982392,40.773382,-73.960449,40.763995,CRD,8.5,0.5,0.5,1.9,0.0,11.4,540,20,4
2,CMT,2014-01-09 20:44:47,2014-01-09 20:59:46,2,2.3,-73.98857,40.739406,-73.986626,40.765217,CRD,11.5,0.5,0.5,1.5,0.0,14.0,899,20,4
3,CMT,2014-01-09 20:44:57,2014-01-09 20:51:40,1,1.7,-73.960213,40.770464,-73.979863,40.77705,CRD,7.5,0.5,0.5,1.7,0.0,10.2,403,20,4
4,CMT,2014-01-09 20:47:09,2014-01-09 20:53:32,1,0.9,-73.995371,40.717248,-73.984367,40.720524,CRD,6.0,0.5,0.5,1.75,0.0,8.75,383,20,4
5,CMT,2014-01-09 20:45:07,2014-01-09 20:51:01,1,0.9,-73.983811,40.749655,-73.989747,40.756575,CRD,6.0,0.5,0.5,1.4,0.0,8.4,354,20,4
6,CMT,2014-01-09 20:43:23,2014-01-09 20:52:07,1,2.1,-73.979906,40.74585,-73.95909,40.773639,CRD,9.0,0.5,0.5,2.0,0.0,12.0,524,20,4
7,CMT,2014-01-09 20:43:04,2014-01-09 20:54:29,1,3.4,-73.981147,40.758918,-73.94251,40.785975,CRD,12.0,0.5,0.5,2.6,0.0,15.6,685,20,4
8,CMT,2014-01-09 20:50:23,2014-01-09 20:58:10,1,2.3,-73.955193,40.765468,-73.979023,40.740578,CRD,9.0,0.5,0.5,1.0,0.0,11.0,467,20,4
9,CMT,2014-01-09 20:48:04,2014-01-09 21:01:37,1,3.3,-73.991782,40.748911,-73.988359,40.714205,CRD,12.5,0.5,0.5,4.05,0.0,17.55,813,20,4


In [29]:
del data["hour"]

## Building the initial models

In [30]:
data.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,duration,time_of_day
0,CMT,2014-01-09 20:45:25,2014-01-09 20:52:31,1,0.7,-73.99477,40.736828,-73.982227,40.73179,CRD,6.5,0.5,0.5,1.4,0.0,8.9,426,4
1,CMT,2014-01-09 20:46:12,2014-01-09 20:55:12,1,1.4,-73.982392,40.773382,-73.960449,40.763995,CRD,8.5,0.5,0.5,1.9,0.0,11.4,540,4
2,CMT,2014-01-09 20:44:47,2014-01-09 20:59:46,2,2.3,-73.98857,40.739406,-73.986626,40.765217,CRD,11.5,0.5,0.5,1.5,0.0,14.0,899,4
3,CMT,2014-01-09 20:44:57,2014-01-09 20:51:40,1,1.7,-73.960213,40.770464,-73.979863,40.77705,CRD,7.5,0.5,0.5,1.7,0.0,10.2,403,4
4,CMT,2014-01-09 20:47:09,2014-01-09 20:53:32,1,0.9,-73.995371,40.717248,-73.984367,40.720524,CRD,6.0,0.5,0.5,1.75,0.0,8.75,383,4


In [85]:
data.columns

Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
       'surcharge', 'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount',
       'duration', 'hour', 'time_of_day', 'time'],
      dtype='object')

In [33]:
x_train, x_test, y_train, y_test = train_test_split(data.drop(columns=['tip_amount','pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude','payment_type','vendor_id', 'pickup_datetime', 'dropoff_datetime','total_amount','mta_tax']), data['tip_amount'], test_size=0.30, random_state=5)

In [34]:
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV, RidgeCV

In [35]:
linreg = LinearRegression()
linreg.fit(x_train, y_train)

train_score = linreg.score(x_train, y_train)
test_score = linreg.score(x_test, y_test)

In [36]:
print('The train score is {} and the test score is {}'.format(train_score, test_score))
print('\n')
betas = pd.Series(linreg.coef_, index=x_train.columns)
betas = betas.append(pd.Series({"Intercept": linreg.intercept_}))
print(betas)

The train score is 0.3931453252796163 and the test score is 0.39382157556437625


passenger_count   -0.006998
trip_distance     -0.003326
fare_amount        0.146783
surcharge          0.131891
tolls_amount      -0.005115
duration           0.000042
time_of_day        0.002528
Intercept          0.370124
dtype: float64


In [37]:
# Fit the model
lasso = LassoCV()
lasso.fit(x_train, y_train)

# calculate the score
train_score = lasso.score(x_train, y_train)
test_score = lasso.score(x_test, y_test)

print('The train score is {} and the test score is {}'.format(train_score, test_score)) 

# calculate betas
print('\n')
betas = pd.Series(lasso.coef_, index= x_train.columns)
betas = betas.append(pd.Series({"Intercept": lasso.intercept_}))
print(betas)
              



The train score is 0.37246235511267184 and the test score is 0.37271795665210605


passenger_count   -0.000000
trip_distance      0.000000
fare_amount        0.072324
surcharge          0.000000
tolls_amount      -0.000000
duration           0.000736
time_of_day        0.000000
Intercept          0.653600
dtype: float64


//Cart and RFT then model engineering 