In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV, RidgeCV

In [60]:
data = pd.read_csv("data/taxi_final_final.csv")

## Additional Cleaning 

In [3]:
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,...,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,duration,geometry,Man,Location
0,0,0,CMT,2014-01-09 20:45:25,2014-01-09 20:52:31,1,0.7,-73.99477,40.736828,-73.982227,...,6.5,0.5,0.5,1.4,0.0,8.9,426,POINT (-73.98222699999998 40.73179),1,5
1,1,1,CMT,2014-01-09 20:46:12,2014-01-09 20:55:12,1,1.4,-73.982392,40.773382,-73.960449,...,8.5,0.5,0.5,1.9,0.0,11.4,540,POINT (-73.96044899999998 40.763995),1,4
2,2,2,CMT,2014-01-09 20:44:47,2014-01-09 20:59:46,2,2.3,-73.98857,40.739406,-73.986626,...,11.5,0.5,0.5,1.5,0.0,14.0,899,POINT (-73.986626 40.765217),1,5
3,3,3,CMT,2014-01-09 20:44:57,2014-01-09 20:51:40,1,1.7,-73.960213,40.770464,-73.979863,...,7.5,0.5,0.5,1.7,0.0,10.2,403,POINT (-73.97986299999998 40.77705),1,5
4,4,4,CMT,2014-01-09 20:47:09,2014-01-09 20:53:32,1,0.9,-73.995371,40.717248,-73.984367,...,6.0,0.5,0.5,1.75,0.0,8.75,383,POINT (-73.98436700000001 40.720524),1,5


In [61]:
del data["Unnamed: 0"]

In [62]:
del data["Unnamed: 0.1"]

In [63]:
del data["Man"]

In [64]:
del data["geometry"]

In [65]:
data.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,duration,Location
0,CMT,2014-01-09 20:45:25,2014-01-09 20:52:31,1,0.7,-73.99477,40.736828,-73.982227,40.73179,CRD,6.5,0.5,0.5,1.4,0.0,8.9,426,5
1,CMT,2014-01-09 20:46:12,2014-01-09 20:55:12,1,1.4,-73.982392,40.773382,-73.960449,40.763995,CRD,8.5,0.5,0.5,1.9,0.0,11.4,540,4
2,CMT,2014-01-09 20:44:47,2014-01-09 20:59:46,2,2.3,-73.98857,40.739406,-73.986626,40.765217,CRD,11.5,0.5,0.5,1.5,0.0,14.0,899,5
3,CMT,2014-01-09 20:44:57,2014-01-09 20:51:40,1,1.7,-73.960213,40.770464,-73.979863,40.77705,CRD,7.5,0.5,0.5,1.7,0.0,10.2,403,5
4,CMT,2014-01-09 20:47:09,2014-01-09 20:53:32,1,0.9,-73.995371,40.717248,-73.984367,40.720524,CRD,6.0,0.5,0.5,1.75,0.0,8.75,383,5


In [66]:
data.shape

(6564770, 18)

## Add time of day column

In [67]:
data['pickup_datetime'] =  pd.to_datetime(data['pickup_datetime'], format='%Y-%m-%d %H:%M:%S.%f')

In [68]:
data["hour"]= data.pickup_datetime.apply(lambda x: x.hour)

In [69]:
data[data.duration >=3600].shape

(701, 19)

The ones that are longer than an hour are so insignificant so its fair to assume that pickup time of day will be the same as dropoff

In [70]:
data.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,duration,Location,hour
0,CMT,2014-01-09 20:45:25,2014-01-09 20:52:31,1,0.7,-73.99477,40.736828,-73.982227,40.73179,CRD,6.5,0.5,0.5,1.4,0.0,8.9,426,5,20
1,CMT,2014-01-09 20:46:12,2014-01-09 20:55:12,1,1.4,-73.982392,40.773382,-73.960449,40.763995,CRD,8.5,0.5,0.5,1.9,0.0,11.4,540,4,20
2,CMT,2014-01-09 20:44:47,2014-01-09 20:59:46,2,2.3,-73.98857,40.739406,-73.986626,40.765217,CRD,11.5,0.5,0.5,1.5,0.0,14.0,899,5,20
3,CMT,2014-01-09 20:44:57,2014-01-09 20:51:40,1,1.7,-73.960213,40.770464,-73.979863,40.77705,CRD,7.5,0.5,0.5,1.7,0.0,10.2,403,5,20
4,CMT,2014-01-09 20:47:09,2014-01-09 20:53:32,1,0.9,-73.995371,40.717248,-73.984367,40.720524,CRD,6.0,0.5,0.5,1.75,0.0,8.75,383,5,20


In [71]:
b = [0,4,8,12,16,20,24]
l = ['Late Night', 'Early Morning','Morning','Noon','Eve','Night']

In [72]:
data["time_of_day"] = pd.cut(data["hour"], bins=b, labels=l)

In [73]:
data.time_of_day.fillna("Late Night", inplace=True)

In [74]:
data.isnull().any(axis=0)

vendor_id            False
pickup_datetime      False
dropoff_datetime     False
passenger_count      False
trip_distance        False
pickup_longitude     False
pickup_latitude      False
dropoff_longitude    False
dropoff_latitude     False
payment_type         False
fare_amount          False
surcharge            False
mta_tax              False
tip_amount           False
tolls_amount         False
total_amount         False
duration             False
Location             False
hour                 False
time_of_day          False
dtype: bool

In [75]:
def time_to_numeric(x):
    if x=='Late Night':
        return 6
    if x=='Early Morning':
        return 1
    if x=='Morning':
        return 2
    if x=='Noon':
        return 3
    if x=='Eve':
        return 4
    if x=='Night':
        return 5

In [76]:
data["time_of_day"] = data["time_of_day"].apply(time_to_numeric)

In [77]:
data.head(20)

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,duration,Location,hour,time_of_day
0,CMT,2014-01-09 20:45:25,2014-01-09 20:52:31,1,0.7,-73.99477,40.736828,-73.982227,40.73179,CRD,6.5,0.5,0.5,1.4,0.0,8.9,426,5,20,4
1,CMT,2014-01-09 20:46:12,2014-01-09 20:55:12,1,1.4,-73.982392,40.773382,-73.960449,40.763995,CRD,8.5,0.5,0.5,1.9,0.0,11.4,540,4,20,4
2,CMT,2014-01-09 20:44:47,2014-01-09 20:59:46,2,2.3,-73.98857,40.739406,-73.986626,40.765217,CRD,11.5,0.5,0.5,1.5,0.0,14.0,899,5,20,4
3,CMT,2014-01-09 20:44:57,2014-01-09 20:51:40,1,1.7,-73.960213,40.770464,-73.979863,40.77705,CRD,7.5,0.5,0.5,1.7,0.0,10.2,403,5,20,4
4,CMT,2014-01-09 20:47:09,2014-01-09 20:53:32,1,0.9,-73.995371,40.717248,-73.984367,40.720524,CRD,6.0,0.5,0.5,1.75,0.0,8.75,383,5,20,4
5,CMT,2014-01-09 20:45:07,2014-01-09 20:51:01,1,0.9,-73.983811,40.749655,-73.989747,40.756575,CRD,6.0,0.5,0.5,1.4,0.0,8.4,354,5,20,4
6,CMT,2014-01-09 20:43:23,2014-01-09 20:52:07,1,2.1,-73.979906,40.74585,-73.95909,40.773639,CRD,9.0,0.5,0.5,2.0,0.0,12.0,524,4,20,4
7,CMT,2014-01-09 20:43:04,2014-01-09 20:54:29,1,3.4,-73.981147,40.758918,-73.94251,40.785975,CRD,12.0,0.5,0.5,2.6,0.0,15.6,685,5,20,4
8,CMT,2014-01-09 20:50:23,2014-01-09 20:58:10,1,2.3,-73.955193,40.765468,-73.979023,40.740578,CRD,9.0,0.5,0.5,1.0,0.0,11.0,467,5,20,4
9,CMT,2014-01-09 20:48:04,2014-01-09 21:01:37,1,3.3,-73.991782,40.748911,-73.988359,40.714205,CRD,12.5,0.5,0.5,4.05,0.0,17.55,813,5,20,4


In [78]:
del data["hour"]

## Adding Vendor ID column

In [79]:
data.vendor_id.value_counts()

CMT    3347258
VTS    3217512
Name: vendor_id, dtype: int64

In [80]:
vendor_changes = {"vendor_id": {"CMT": 1, "VTS":2}}

In [81]:
data.replace(vendor_changes, inplace=True)

In [82]:
data.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,duration,Location,time_of_day
0,1,2014-01-09 20:45:25,2014-01-09 20:52:31,1,0.7,-73.99477,40.736828,-73.982227,40.73179,CRD,6.5,0.5,0.5,1.4,0.0,8.9,426,5,4
1,1,2014-01-09 20:46:12,2014-01-09 20:55:12,1,1.4,-73.982392,40.773382,-73.960449,40.763995,CRD,8.5,0.5,0.5,1.9,0.0,11.4,540,4,4
2,1,2014-01-09 20:44:47,2014-01-09 20:59:46,2,2.3,-73.98857,40.739406,-73.986626,40.765217,CRD,11.5,0.5,0.5,1.5,0.0,14.0,899,5,4
3,1,2014-01-09 20:44:57,2014-01-09 20:51:40,1,1.7,-73.960213,40.770464,-73.979863,40.77705,CRD,7.5,0.5,0.5,1.7,0.0,10.2,403,5,4
4,1,2014-01-09 20:47:09,2014-01-09 20:53:32,1,0.9,-73.995371,40.717248,-73.984367,40.720524,CRD,6.0,0.5,0.5,1.75,0.0,8.75,383,5,4


## Add payment type dummy variables 

In [84]:
unique_types = data.payment_type.unique()
unique_types

array(['CRD', 'UNK', 'NOC', 'DIS', 'CSH'], dtype=object)

In [85]:
for unique in unique_types:
    data[unique] = data.payment_type.apply(lambda x: 1 if x == unique else 0)
data = data.drop(columns=['payment_type'])
data.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,fare_amount,...,tolls_amount,total_amount,duration,Location,time_of_day,CRD,UNK,NOC,DIS,CSH
0,1,2014-01-09 20:45:25,2014-01-09 20:52:31,1,0.7,-73.99477,40.736828,-73.982227,40.73179,6.5,...,0.0,8.9,426,5,4,1,0,0,0,0
1,1,2014-01-09 20:46:12,2014-01-09 20:55:12,1,1.4,-73.982392,40.773382,-73.960449,40.763995,8.5,...,0.0,11.4,540,4,4,1,0,0,0,0
2,1,2014-01-09 20:44:47,2014-01-09 20:59:46,2,2.3,-73.98857,40.739406,-73.986626,40.765217,11.5,...,0.0,14.0,899,5,4,1,0,0,0,0
3,1,2014-01-09 20:44:57,2014-01-09 20:51:40,1,1.7,-73.960213,40.770464,-73.979863,40.77705,7.5,...,0.0,10.2,403,5,4,1,0,0,0,0
4,1,2014-01-09 20:47:09,2014-01-09 20:53:32,1,0.9,-73.995371,40.717248,-73.984367,40.720524,6.0,...,0.0,8.75,383,5,4,1,0,0,0,0


## Building the initial models

In [70]:
data.columns

Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
       'surcharge', 'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount',
       'duration', 'Location', 'time_of_day'],
      dtype='object')

In [86]:
x_train, x_test, y_train, y_test = train_test_split(data.drop(columns=['tip_amount','pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude','pickup_datetime','dropoff_datetime','total_amount','mta_tax']), data['tip_amount'], test_size=0.30, random_state=5)

In [57]:
linreg = LinearRegression()
linreg.fit(x_train, y_train)

train_score = linreg.score(x_train, y_train)
test_score = linreg.score(x_test, y_test)

In [58]:
print('The train score is {} and the test score is {}'.format(train_score, test_score))
print('\n')
betas = pd.Series(linreg.coef_, index=x_train.columns)
betas = betas.append(pd.Series({"Intercept": linreg.intercept_}))
print(betas)

#without vendor, 39.31 and 39.38
#with is 39.42 and 39.49
#Now with encoded payment type score is 40.30 and 40.33

The train score is 0.4028823741921212 and the test score is 0.40329731485470804


vendor_id         -0.058388
passenger_count    0.000416
trip_distance      0.007268
payment_type      -0.451624
fare_amount        0.139867
surcharge          0.131413
tolls_amount       0.251448
duration           0.000080
Location           0.000117
time_of_day        0.002589
Intercept          0.925256
dtype: float64


In [59]:
# Fit the model
lasso = LassoCV()
lasso.fit(x_train, y_train)

# calculate the score
train_score = lasso.score(x_train, y_train)
test_score = lasso.score(x_test, y_test)

print('The train score is {} and the test score is {}'.format(train_score, test_score)) 

# calculate betas
print('\n')
betas = pd.Series(lasso.coef_, index= x_train.columns)
betas = betas.append(pd.Series({"Intercept": lasso.intercept_}))
print(betas)
              



The train score is 0.37246235511267184 and the test score is 0.37271795665210605


vendor_id         -0.000000
passenger_count   -0.000000
trip_distance      0.000000
payment_type      -0.000000
fare_amount        0.072324
surcharge          0.000000
tolls_amount      -0.000000
duration           0.000736
Location           0.000000
time_of_day        0.000000
Intercept          0.653600
dtype: float64


### Cart and RFT then feature engineering 

## CART MODEL 

In [48]:
data.tip_amount.max()

4.94

In [76]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from subprocess import call
from IPython.display import SVG
#from graphviz import Source

In [89]:
y_train.dtypes

dtype('float64')

In [98]:
from sklearn import preprocessing
from sklearn import utils

In [108]:
lab_enc = preprocessing.LabelEncoder()

In [109]:
y_train_enc = lab_enc.fit_transform(y_train)

In [110]:
y_test_enc = lab_enc.fit_transform(y_test)

In [111]:
print(utils.multiclass.type_of_target(y_train))

continuous


In [115]:
print(utils.multiclass.type_of_target(y_train.astype('category')))

continuous


In [105]:
print(utils.multiclass.type_of_target(y_train_enc))

multiclass


In [80]:
# Train the CART model
cart_model = DecisionTreeClassifier(random_state=3,max_depth=3)
cart_model.fit(x_train, y_train)
train_score = cart_model.score(x_train, y_train)
test_score = cart_model.score(x_test, y_test)


ValueError: Unknown label type: 'continuous'

In [106]:
# Train the CART model
cart_model = DecisionTreeClassifier(random_state=3,max_depth=3)
cart_model.fit(x_train, y_train_enc)
train_score = cart_model.score(x_train, y_train_enc)
test_score = cart_model.score(x_test, y_test_enc)

In [107]:
# Print out summary of model performance 
print('The score of this model over training data is {:.3f} and {:.3f} over the testing data'.format(train_score, test_score))

The score of this model over training data is 0.204 and 0.000 over the testing data


In [None]:
# Visualize the decision tree
cart_graph = Source(export_graphviz(cart_model,
                   feature_names = df.columns[1:],
                   rounded = True, proportion = False, 
                  filled = True))
SVG(cart_graph.pipe(format='svg'))

## Random Forest Model 

In [86]:
from sklearn.ensemble import RandomForestClassifier

# Train the CART model
rfc_model = RandomForestClassifier(n_estimators=50, max_features=0.2,
                                   max_depth=3,random_state=3)
rfc_model.fit(x_train, y_train)


ValueError: Unknown label type: 'continuous'

## Model Engineering 

In [49]:
data.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,duration,Location,time_of_day
0,1,2014-01-09 20:45:25,2014-01-09 20:52:31,1,0.7,-73.99477,40.736828,-73.982227,40.73179,CRD,6.5,0.5,0.5,1.4,0.0,8.9,426,5,4
1,1,2014-01-09 20:46:12,2014-01-09 20:55:12,1,1.4,-73.982392,40.773382,-73.960449,40.763995,CRD,8.5,0.5,0.5,1.9,0.0,11.4,540,4,4
2,1,2014-01-09 20:44:47,2014-01-09 20:59:46,2,2.3,-73.98857,40.739406,-73.986626,40.765217,CRD,11.5,0.5,0.5,1.5,0.0,14.0,899,5,4
3,1,2014-01-09 20:44:57,2014-01-09 20:51:40,1,1.7,-73.960213,40.770464,-73.979863,40.77705,CRD,7.5,0.5,0.5,1.7,0.0,10.2,403,5,4
4,1,2014-01-09 20:47:09,2014-01-09 20:53:32,1,0.9,-73.995371,40.717248,-73.984367,40.720524,CRD,6.0,0.5,0.5,1.75,0.0,8.75,383,5,4


In [50]:
data.payment_type.value_counts()

CRD    6483990
UNK      53321
NOC      21242
DIS       6018
CSH        199
Name: payment_type, dtype: int64