# Machine Learning Models

## Model objective: Predict customers who will leave a generous tip

In [1]:
# packages for manipulation
import pandas as pd
import numpy as np

# Package for datetime
import datetime 

# Packages for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a wide DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# Packages for machine learning model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# This is the function that helps plot feature importance 
from xgboost import plot_importance

In [2]:
# Load the trip-level dataset and preview its first rows
df = pd.read_csv('ml_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,month,day,duration,rush_hour,pickup_dropoff,mean_distance,mean_duration
0,24870114,2,2017-03-25 08:55:43,2017-03-25 09:09:47,6,3.34,1,N,100,231,1,13.0,0.0,0.5,2.76,0.0,0.3,16.56,mar,saturday,14.066667,0,100 231,3.521667,22.847222
1,35634249,1,2017-04-11 14:53:28,2017-04-11 15:19:58,1,1.8,1,N,186,43,1,16.0,0.0,0.5,4.0,0.0,0.3,20.8,apr,tuesday,26.5,0,186 43,3.108889,24.47037
2,106203690,1,2017-12-15 07:26:56,2017-12-15 07:34:08,1,1.0,1,N,262,236,1,6.5,0.0,0.5,1.45,0.0,0.3,8.75,dec,friday,7.2,1,262 236,0.881429,7.25
3,38942136,2,2017-05-07 13:17:59,2017-05-07 13:48:14,1,3.7,1,N,188,97,1,20.5,0.0,0.5,6.39,0.0,0.3,27.69,may,sunday,30.25,0,188 97,3.7,30.25
4,30841670,2,2017-04-15 23:32:20,2017-04-15 23:49:03,1,4.37,1,N,4,112,2,16.5,0.5,0.5,0.0,0.0,0.3,17.8,apr,saturday,16.716667,0,4 112,4.435,14.616667


In [3]:
# Load the file with mean_duration, mean_distance and predicted_fare
# and preview its first rows
df0 = pd.read_csv('nyc_preds_means.csv')
df0.head()

Unnamed: 0,mean_duration,mean_distance,predicted_fare
0,22.847222,3.521667,16.434245
1,24.47037,3.108889,16.052218
2,7.25,0.881429,7.053706
3,30.25,3.7,18.73165
4,14.616667,4.435,15.845642


In [4]:
# Attach the predicted fare to the trip data, matching rows on the index.
# (merge defaults to an inner join, so only index labels present in both
# frames are kept.)
predicted_fare = df0['predicted_fare']
df0 = df.merge(predicted_fare, left_index=True, right_index=True)

df0.head()

Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,month,day,duration,rush_hour,pickup_dropoff,mean_distance,mean_duration,predicted_fare
0,24870114,2,2017-03-25 08:55:43,2017-03-25 09:09:47,6,3.34,1,N,100,231,1,13.0,0.0,0.5,2.76,0.0,0.3,16.56,mar,saturday,14.066667,0,100 231,3.521667,22.847222,16.434245
1,35634249,1,2017-04-11 14:53:28,2017-04-11 15:19:58,1,1.8,1,N,186,43,1,16.0,0.0,0.5,4.0,0.0,0.3,20.8,apr,tuesday,26.5,0,186 43,3.108889,24.47037,16.052218
2,106203690,1,2017-12-15 07:26:56,2017-12-15 07:34:08,1,1.0,1,N,262,236,1,6.5,0.0,0.5,1.45,0.0,0.3,8.75,dec,friday,7.2,1,262 236,0.881429,7.25,7.053706
3,38942136,2,2017-05-07 13:17:59,2017-05-07 13:48:14,1,3.7,1,N,188,97,1,20.5,0.0,0.5,6.39,0.0,0.3,27.69,may,sunday,30.25,0,188 97,3.7,30.25,18.73165
4,30841670,2,2017-04-15 23:32:20,2017-04-15 23:49:03,1,4.37,1,N,4,112,2,16.5,0.5,0.5,0.0,0.0,0.3,17.8,apr,saturday,16.716667,0,4 112,4.435,14.616667,15.845642


In [5]:
# Check column dtypes and non-null counts after the merge
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22699 entries, 0 to 22698
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             22699 non-null  int64  
 1   VendorID               22699 non-null  int64  
 2   tpep_pickup_datetime   22699 non-null  object 
 3   tpep_dropoff_datetime  22699 non-null  object 
 4   passenger_count        22699 non-null  int64  
 5   trip_distance          22699 non-null  float64
 6   RatecodeID             22699 non-null  int64  
 7   store_and_fwd_flag     22699 non-null  object 
 8   PULocationID           22699 non-null  int64  
 9   DOLocationID           22699 non-null  int64  
 10  payment_type           22699 non-null  int64  
 11  fare_amount            22699 non-null  float64
 12  extra                  22699 non-null  float64
 13  mta_tax                22699 non-null  float64
 14  tip_amount             22699 non-null  float64
 15  to

In [6]:
# Parse the pickup/dropoff timestamp strings into datetime values
datetime_cols = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
for dt_col in datetime_cols:
    df0[dt_col] = pd.to_datetime(df0[dt_col])

# Confirm the new dtypes
df0[datetime_cols].dtypes

tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
dtype: object

- Recall that only passengers who pay by credit card leave a tip. Hence, we filter the data to keep only credit card payments.

In [7]:
# Keep only the trips paid by credit card (payment_type == 1).
# .copy() gives an independent frame so new columns can be added safely.
is_credit_card = df0['payment_type'] == 1
df1 = df0.loc[is_credit_card].copy()

Next, we engineer the target feature for the model.

$$tip\ percent = \frac{tip\ amount}{total\ amount - tip\ amount}$$

### Feature engineering

In [8]:
# Tip as a fraction of the pre-tip total, rounded to 3 decimal places
pre_tip_total = df1['total_amount'] - df1['tip_amount']
df1['tip_percent'] = (df1['tip_amount'] / pre_tip_total).round(3)
df1['tip_percent'].head()

0    0.200
1    0.238
2    0.199
3    0.300
5    0.200
Name: tip_percent, dtype: float64

In [9]:
# Target column: 1 when the tip is at least 20% of the pre-tip total, else 0
df1['generous'] = df1['tip_percent'].ge(0.2).astype(int)
df1['generous'].head()

0    1
1    1
2    0
3    1
5    1
Name: generous, dtype: int32

Next, we engineer four new columns that represent time-of-day bins. Each column contains binary values (0=no, 1=yes) that indicate whether a trip began (was picked up) during the following times:

`am_rush` = [06:00&ndash;10:00)  
`daytime` = [10:00&ndash;16:00)  
`pm_rush` = [16:00&ndash;20:00)  
`nighttime` = [20:00&ndash;06:00)  

In [10]:
# Seed each time-of-day column with the pickup hour;
# the following cells turn these hours into 0/1 flags
for time_bin in ['am_rush', 'daytime', 'pm_rush', 'nighttime']:
    df1[time_bin] = df1['tpep_pickup_datetime'].dt.hour

df1.head()

Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,month,day,duration,rush_hour,pickup_dropoff,mean_distance,mean_duration,predicted_fare,tip_percent,generous,am_rush,daytime,pm_rush,nighttime
0,24870114,2,2017-03-25 08:55:43,2017-03-25 09:09:47,6,3.34,1,N,100,231,1,13.0,0.0,0.5,2.76,0.0,0.3,16.56,mar,saturday,14.066667,0,100 231,3.521667,22.847222,16.434245,0.2,1,8,8,8,8
1,35634249,1,2017-04-11 14:53:28,2017-04-11 15:19:58,1,1.8,1,N,186,43,1,16.0,0.0,0.5,4.0,0.0,0.3,20.8,apr,tuesday,26.5,0,186 43,3.108889,24.47037,16.052218,0.238,1,14,14,14,14
2,106203690,1,2017-12-15 07:26:56,2017-12-15 07:34:08,1,1.0,1,N,262,236,1,6.5,0.0,0.5,1.45,0.0,0.3,8.75,dec,friday,7.2,1,262 236,0.881429,7.25,7.053706,0.199,0,7,7,7,7
3,38942136,2,2017-05-07 13:17:59,2017-05-07 13:48:14,1,3.7,1,N,188,97,1,20.5,0.0,0.5,6.39,0.0,0.3,27.69,may,sunday,30.25,0,188 97,3.7,30.25,18.73165,0.3,1,13,13,13,13
5,23345809,2,2017-03-25 20:34:11,2017-03-25 20:42:11,6,2.3,1,N,161,236,1,9.0,0.5,0.5,2.06,0.0,0.3,12.36,mar,saturday,8.0,0,161 236,2.052258,11.855376,10.441351,0.2,1,20,20,20,20


In [11]:
# Converting to binary values.
# The flags are computed from the pickup timestamp instead of from the
# columns being overwritten, so re-running this cell yields the same result
# (previously a second run compared 0/1 flags against the hour ranges and
# silently set every flag to 0).
pickup_hour = df1['tpep_pickup_datetime'].dt.hour
df1['am_rush'] = np.where((pickup_hour >= 6) & (pickup_hour < 10), 1, 0)    # [06:00-10:00)
df1['daytime'] = np.where((pickup_hour >= 10) & (pickup_hour < 16), 1, 0)   # [10:00-16:00)
df1['pm_rush'] = np.where((pickup_hour >= 16) & (pickup_hour < 20), 1, 0)   # [16:00-20:00)

In [12]:
# Define 'nighttime()' conversion function [20:00–06:00)
def nighttime(hour):
    """Return 1 if the pickup hour falls in the night-time bin, otherwise 0.

    `hour` is a row-like object (this notebook passes DataFrame rows); the
    hour value is read from its 'nighttime' entry. The night-time bin covers
    hours 20 up to (not including) 24 and 0 up to (not including) 6.
    """
    pickup_hour = hour['nighttime']
    late_evening = 20 <= pickup_hour < 24
    early_morning = 0 <= pickup_hour < 6
    return 1 if (late_evening or early_morning) else 0

In [13]:
# Run nighttime() on each row (axis='columns' hands the function one row
# at a time) and replace the hour stored in 'nighttime' with the 0/1 flag
df1['nighttime'] = df1.apply(nighttime, axis='columns')

df1.head()

Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,month,day,duration,rush_hour,pickup_dropoff,mean_distance,mean_duration,predicted_fare,tip_percent,generous,am_rush,daytime,pm_rush,nighttime
0,24870114,2,2017-03-25 08:55:43,2017-03-25 09:09:47,6,3.34,1,N,100,231,1,13.0,0.0,0.5,2.76,0.0,0.3,16.56,mar,saturday,14.066667,0,100 231,3.521667,22.847222,16.434245,0.2,1,1,0,0,0
1,35634249,1,2017-04-11 14:53:28,2017-04-11 15:19:58,1,1.8,1,N,186,43,1,16.0,0.0,0.5,4.0,0.0,0.3,20.8,apr,tuesday,26.5,0,186 43,3.108889,24.47037,16.052218,0.238,1,0,1,0,0
2,106203690,1,2017-12-15 07:26:56,2017-12-15 07:34:08,1,1.0,1,N,262,236,1,6.5,0.0,0.5,1.45,0.0,0.3,8.75,dec,friday,7.2,1,262 236,0.881429,7.25,7.053706,0.199,0,1,0,0,0
3,38942136,2,2017-05-07 13:17:59,2017-05-07 13:48:14,1,3.7,1,N,188,97,1,20.5,0.0,0.5,6.39,0.0,0.3,27.69,may,sunday,30.25,0,188 97,3.7,30.25,18.73165,0.3,1,0,1,0,0
5,23345809,2,2017-03-25 20:34:11,2017-03-25 20:42:11,6,2.3,1,N,161,236,1,9.0,0.5,0.5,2.06,0.0,0.3,12.36,mar,saturday,8.0,0,161 236,2.052258,11.855376,10.441351,0.2,1,0,0,0,1


In [14]:
# Remove the columns that will not be used as model features. This includes
# 'tip_amount', 'total_amount' and 'tip_percent', from which the target
# 'generous' was derived.
drop_cols = [
    'Unnamed: 0',
    'tpep_pickup_datetime', 'tpep_dropoff_datetime',
    'trip_distance', 'store_and_fwd_flag', 'payment_type',
    'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
    'improvement_surcharge', 'total_amount',
    'duration', 'rush_hour', 'pickup_dropoff', 'tip_percent',
]

df1 = df1.drop(columns=drop_cols)
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15265 entries, 0 to 22698
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   VendorID         15265 non-null  int64  
 1   passenger_count  15265 non-null  int64  
 2   RatecodeID       15265 non-null  int64  
 3   PULocationID     15265 non-null  int64  
 4   DOLocationID     15265 non-null  int64  
 5   month            15265 non-null  object 
 6   day              15265 non-null  object 
 7   mean_distance    15265 non-null  float64
 8   mean_duration    15265 non-null  float64
 9   predicted_fare   15265 non-null  float64
 10  generous         15265 non-null  int32  
 11  am_rush          15265 non-null  int32  
 12  daytime          15265 non-null  int32  
 13  pm_rush          15265 non-null  int32  
 14  nighttime        15265 non-null  int64  
dtypes: float64(3), int32(4), int64(6), object(2)
memory usage: 1.6+ MB


### Variable encoding
- Converting numerical columns that contain categorical information.

In [15]:
# These integer columns hold category codes rather than quantities
cols_to_str = ['RatecodeID', 'PULocationID', 'DOLocationID', 'VendorID']

# Cast them to strings in one call so they are treated as categorical
df1 = df1.astype({col: 'str' for col in cols_to_str})

In [16]:
# Converting categoricals to binary:
# one-hot encode the string columns; drop_first=True drops the first level
# of each encoded column
df1 = pd.get_dummies(df1, drop_first=True)
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15265 entries, 0 to 22698
Columns: 347 entries, passenger_count to day_wednesday
dtypes: float64(3), int32(4), int64(2), uint8(338)
memory usage: 5.9 MB


In [17]:
# Getting class balance of 'generous' col
# (normalize=True returns proportions instead of raw counts)
df1['generous'].value_counts(normalize=True)

1    0.526368
0    0.473632
Name: generous, dtype: float64

- The dataset is nearly balanced.

**Determining the evaluation metric for the model**

Consider the cost of both kinds of model error:
* False positives (the model predicts a tip ≥ 20%, but the customer does not give one)
* False negatives (the model predicts a tip < 20%, but the customer gives more)

False positives are worse for cab drivers, because they would pick up a customer expecting a good tip and then not receive one, frustrating the driver.

False negatives are worse for customers, because a cab driver would likely pick up a different customer who was predicted to tip more&mdash;even when the original customer would have tipped generously.

Since the stakes are relatively even, we will use the **F1 score** to evaluate the model.

#### Pre-processing - splitting features

In [18]:
# Features (X): every column except the target
X = df1.drop(columns='generous')

# Target (y): the 'generous' flag
y = df1['generous']

# Hold out 20% of the rows for testing; stratify on y so both sets keep
# the same class proportions
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Random forest model

In [19]:
# Random forest with default hyperparameters; random_state=0 fixes the seed
# so results are repeatable
rf = RandomForestClassifier(random_state=0)

In [20]:
%%time
# Train the random forest on the training split (timed by the cell magic)
rf.fit(Xtrain, ytrain)

CPU times: total: 25.6 s
Wall time: 25.9 s


In [21]:
# Predicted class labels for the held-out test set
rf_pred = rf.predict(Xtest)

In [22]:
# Per-class precision, recall and F1 on the held-out test set
rf_report = classification_report(ytest, rf_pred)
print('classification report for random forest model:')
print(rf_report)

classification report for random forest model:
              precision    recall  f1-score   support

           0       0.71      0.59      0.64      1446
           1       0.68      0.78      0.73      1607

    accuracy                           0.69      3053
   macro avg       0.69      0.68      0.68      3053
weighted avg       0.69      0.69      0.69      3053



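- The random forest model achieved reasonable scores: an F1 score of 0.73 for the generous class (class 1) and an overall accuracy of 0.69 on the test set.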

### Extreme gradient boosting (XGBoost) model

In [23]:
# XGBoost classifier with the logistic objective for the binary target;
# random_state=0 fixes the seed so results are repeatable
xgb = XGBClassifier(objective='binary:logistic', random_state=0)

In [24]:
%%time
# Train the XGBoost model on the training split (timed by the cell magic)
xgb.fit(Xtrain, ytrain)

CPU times: total: 43.5 s
Wall time: 22.5 s


In [25]:
# Predicted class labels for the held-out test set
xgb_pred = xgb.predict(Xtest)

In [26]:
# Per-class precision, recall and F1 on the held-out test set
xgb_report = classification_report(ytest, xgb_pred)
print('classification report for xgboost model:')
print(xgb_report)

classification report for xgboost model:
              precision    recall  f1-score   support

           0       0.70      0.59      0.64      1446
           1       0.68      0.77      0.72      1607

    accuracy                           0.69      3053
   macro avg       0.69      0.68      0.68      3053
weighted avg       0.69      0.69      0.69      3053



- The XGBoost model achieved scores similar to the random forest model: an F1 score of 0.72 for the generous class (class 1) and an overall accuracy of 0.69 on the test set.

#### Next steps
- The hyperparameters of both machine learning models can be tuned
- More features can be engineered to improve the models
- Some data leakage was accepted because this notebook was built for learning purposes; it would need to be avoided in a real-world project