# Predicting Tips for NYC Yellow Taxi Rides - Follow Up 2

In this second follow up I wanted to explore more options in selecting a Model. Initially, the Decision Tree Regressor was set to defualt, and it handled the data quite well. Here I'm going to experiment with the tree's max depth to see if we can make any improvements.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import time as t

In [2]:
taxi = pd.read_csv('yellow_tripdata_2019-06.csv')

In [3]:
taxi.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2019-06-01 00:55:13,2019-06-01 00:56:17,1,0.0,1,N,145,145,2,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0
1,1,2019-06-01 00:06:31,2019-06-01 00:06:52,1,0.0,1,N,262,263,2,2.5,3.0,0.5,0.0,0.0,0.3,6.3,2.5
2,1,2019-06-01 00:17:05,2019-06-01 00:36:38,1,4.4,1,N,74,7,2,17.5,0.5,0.5,0.0,0.0,0.3,18.8,0.0
3,1,2019-06-01 00:59:02,2019-06-01 00:59:12,0,0.8,1,N,145,145,2,2.5,1.0,0.5,0.0,0.0,0.3,4.3,0.0
4,1,2019-06-01 00:03:25,2019-06-01 00:15:42,1,1.7,1,N,113,148,1,9.5,3.0,0.5,2.65,0.0,0.3,15.95,2.5


In [4]:
taxi.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
count,6941024.0,6941024.0,6941024.0,6941024.0,6941024.0,6941024.0,6941024.0,6941024.0,6941024.0,6941024.0,6941024.0,6941024.0,6941024.0,6941024.0,6941024.0
mean,1.642546,1.567322,3.078505,1.059386,162.3034,160.8204,1.29204,13.66414,1.164652,0.495001,2.265934,0.4061977,0.298486,19.74127,2.274467
std,0.5018739,1.210831,17.90048,0.7349947,66.38413,70.46478,0.4809936,132.3297,1.275345,0.1008513,2.962398,1.797505,0.02854512,132.6798,0.7289606
min,1.0,0.0,0.0,1.0,1.0,1.0,1.0,-305.0,-26.5,-0.5,-88.88,-39.74,-0.3,-305.8,-2.5
25%,1.0,1.0,1.0,1.0,114.0,107.0,1.0,6.5,0.0,0.5,0.0,0.0,0.3,11.3,2.5
50%,2.0,1.0,1.68,1.0,161.0,162.0,1.0,9.5,0.5,0.5,1.95,0.0,0.3,14.8,2.5
75%,2.0,2.0,3.14,1.0,233.0,233.0,2.0,15.0,2.5,0.5,3.0,0.0,0.3,21.3,2.5
max,4.0,9.0,45977.22,99.0,265.0,265.0,4.0,346950.0,84.76,212.42,1624.64,823.0,0.3,347035.0,2.75


In [5]:
from sklearn.preprocessing import MinMaxScaler

def convert_to_timestamp(x):
    for i in x:
        t.mktime(i.timetuple())
    return x
def normalize(df):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    dates_scaled = scaler.fit_transform(df)
    return dates_scaled

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

In [7]:
class NormalTime(BaseEstimator,TransformerMixin):
    def __init__(self, _time=True):
        self._time = _time
        
    def present(self,x):
        updated = x.str.starts_with('2019-')
        return updated
    
    def fit(self,x,y=None):
        return self
        
    def transform(self,x,y=None):
        df = pd.DataFrame(x)
        if self._time:
            y = pd.to_datetime(df.stack()).unstack()
            y = pd.DataFrame(y)
            y = y.apply(convert_to_timestamp)
            i = normalize(y)
            return i
    
        

In [8]:
class NumsOnly(BaseEstimator,TransformerMixin):
    def fit(self,x,y=None):
        return self
    
    def transform(self,x,y=None):
        x = x.drop(columns='store_and_fwd_flag')
        return x

In [9]:
from sklearn.compose import ColumnTransformer

pipeline = ColumnTransformer([
    ('time',NormalTime(),[1,2]),('flag',NumsOnly(),[6])],remainder='passthrough') 

In [10]:
taxi = taxi[taxi['tip_amount']>=0]
taxi = taxi[taxi['total_amount']>=0]

In [11]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(taxi, train_size=.7,test_size=.3,random_state=42)

In [12]:
train_label = train_set.iloc[:,13]
train_set_ = train_set.drop(columns= 'tip_amount')

In [13]:
prepared = pipeline.fit_transform(train_set_)

  return self.partial_fit(X, y)


In [14]:
from sklearn import tree

In [15]:
test_label = test_set.iloc[:,13]
new_test = test_set.drop(columns ='tip_amount')

In [16]:
new_prepared = pipeline.fit_transform(new_test)

  return self.partial_fit(X, y)


In [31]:
from sklearn.model_selection import GridSearchCV

In [37]:
params = {'max_depth':[None,2,5,10,12,15,12,20,25,30],'random_state':[42]}

In [38]:
reg = GridSearchCV(tree.DecisionTreeRegressor(),params,verbose=10)

In [39]:
reg.fit(prepared,train_label)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] max_depth=None, random_state=42 .................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=None, random_state=42, score=0.989288192893305, total=  35.9s
[CV] max_depth=None, random_state=42 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   36.6s remaining:    0.0s


[CV]  max_depth=None, random_state=42, score=0.7906128782126337, total=  36.6s
[CV] max_depth=None, random_state=42 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s


[CV]  max_depth=None, random_state=42, score=0.8271727629837574, total=  36.9s
[CV] max_depth=2, random_state=42 ....................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.9min remaining:    0.0s


[CV]  max_depth=2, random_state=42, score=0.5097761288076395, total=   6.5s
[CV] max_depth=2, random_state=42 ....................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.0min remaining:    0.0s


[CV]  max_depth=2, random_state=42, score=0.31359865152490685, total=   6.5s
[CV] max_depth=2, random_state=42 ....................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min remaining:    0.0s


[CV]  max_depth=2, random_state=42, score=0.44767595166933294, total=   6.5s
[CV] max_depth=5, random_state=42 ....................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  2.2min remaining:    0.0s


[CV]  max_depth=5, random_state=42, score=0.7894008501221856, total=  13.8s
[CV] max_depth=5, random_state=42 ....................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  2.4min remaining:    0.0s


[CV]  max_depth=5, random_state=42, score=0.581376499475784, total=  13.8s
[CV] max_depth=5, random_state=42 ....................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  2.7min remaining:    0.0s


[CV]  max_depth=5, random_state=42, score=0.6748594378080769, total=  13.7s
[CV] max_depth=10, random_state=42 ...................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.9min remaining:    0.0s


[CV]  max_depth=10, random_state=42, score=0.9171070893981347, total=  22.9s
[CV] max_depth=10, random_state=42 ...................................
[CV]  max_depth=10, random_state=42, score=0.7251578266108746, total=  23.5s
[CV] max_depth=10, random_state=42 ...................................
[CV]  max_depth=10, random_state=42, score=0.775326396019135, total=  24.1s
[CV] max_depth=12, random_state=42 ...................................
[CV]  max_depth=12, random_state=42, score=0.9476195077276022, total=  26.3s
[CV] max_depth=12, random_state=42 ...................................
[CV]  max_depth=12, random_state=42, score=0.7514214149528726, total=  26.8s
[CV] max_depth=12, random_state=42 ...................................
[CV]  max_depth=12, random_state=42, score=0.798046859478243, total=  28.2s
[CV] max_depth=15, random_state=42 ...................................
[CV]  max_depth=15, random_state=42, score=0.9752419290073088, total=  31.0s
[CV] max_depth=15, random_state=42 ..

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 14.0min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [None, 2, 5, 10, 12, 15, 12, 20, 25, 30], 'random_state': [42]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

Looking at the grid search, the score improves as max_depth increases. Although we get similar scores with depths like 30, I chose to go with passing the None type, becuase the tree will adjust to the depth needed. And with our paticular question this seems to be the best solution. 