In [4]:
import pandas as pd
import numpy as np
import datetime as dt

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use the ggplot look for every matplotlib figure drawn in this notebook.
plt.style.use('ggplot')
# IPython magic: render matplotlib figures inline, below the cell that draws them.
%matplotlib inline
np.set_printoptions(suppress=True) # NumPy printing only: show small floats in fixed-point rather than scientific notation

In [33]:
# Source file: NYC TLC green-taxi trip records for January 2019 (per the URL).
GREEN_TRIPS_URL = 'https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2019-01.csv'
df = pd.read_csv(GREEN_TRIPS_URL)

In [34]:
# Peek at the first record to see the raw column names and value formats.
df.iloc[:1]

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2018-12-21 15:17:29,2018-12-21 15:18:57,N,1,264,264,5,0.0,3.0,0.5,0.5,0.0,0.0,,0.3,4.3,2,1,


In [46]:
# Normalise every column label to lower case (e.g. VendorID -> vendorid).
df.columns = df.columns.map(str.lower)

## Payment Types
Removing payment type 2 (cash payments). The data source notes that tips are not recorded for cash rides, so those rows cannot tell us whether the passenger tipped.

In [45]:
# Number of rides recorded under each payment type code.
df['payment_type'].value_counts()

1    388313
3      3392
4      1330
5        26
Name: payment_type, dtype: int64

In [36]:
# Keep every ride that was not paid in cash (payment_type 2).
# .copy() makes the result an independent frame, so the in-place edits and
# column assignments in later cells do not trigger SettingWithCopyWarning.
df = df.loc[df['payment_type'] != 2].copy()

## Removing Columns
Some columns exist only for yellow cabs or only for green cabs. I am removing them so the two data sets can be combined later. I am also removing "congestion_surcharge": there were a large number of NaNs in both data sets, and a baseline tree fit on the rows with a non-NaN congestion surcharge showed little gain from this feature.

In [39]:
# Drop the columns that are not shared by the yellow and green data sets, plus
# congestion_surcharge. Reassigning (rather than inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second run would otherwise
# raise KeyError because the columns are already gone.
df = df.drop(
    columns=['ehail_fee', 'trip_type', 'store_and_fwd_flag', 'congestion_surcharge'],
    errors='ignore',
)

In [47]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 393061 entries, 2 to 630917
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   vendorid               393061 non-null  int64  
 1   lpep_pickup_datetime   393061 non-null  object 
 2   lpep_dropoff_datetime  393061 non-null  object 
 3   ratecodeid             393061 non-null  int64  
 4   pulocationid           393061 non-null  int64  
 5   dolocationid           393061 non-null  int64  
 6   passenger_count        393061 non-null  int64  
 7   trip_distance          393061 non-null  float64
 8   fare_amount            393061 non-null  float64
 9   extra                  393061 non-null  float64
 10  mta_tax                393061 non-null  float64
 11  tip_amount             393061 non-null  float64
 12  tolls_amount           393061 non-null  float64
 13  improvement_surcharge  393061 non-null  float64
 14  total_amount           393061 non-nu

## Setting Target

Renaming the "tip_amount" column to "tip" and converting it to a binary column (1 if a tip greater than zero was recorded, 0 otherwise) to use as my target.

In [51]:
# The tip column becomes the model target, so give it a shorter name.
df.rename({'tip_amount': 'tip'}, axis='columns', inplace=True)

In [52]:
# Reorder the frame so the target variable ('tip') is the first column.

cols = df.columns.tolist()
cols.remove('tip')
cols = ['tip'] + cols
df = df[cols]

In [54]:
def to_binary(value):
    """Return 1 when ``value`` is strictly greater than zero, otherwise 0."""
    return 1 if value > 0 else 0

In [55]:
# Vectorised equivalent of applying to_binary to every row: 1 where a tip
# greater than zero was recorded, 0 otherwise (NaN compares False, so it also
# maps to 0). Bracket assignment is used so the column is set explicitly, and
# int64 matches the dtype the row-wise version produced.
df['tip'] = (df['tip'] > 0).astype('int64')

## Datetime Elements

Using the provided timestamps, I am creating features for week of the month, day of the week, and hour of the day, all stored as string values so they can be one-hot encoded later.

In [48]:
# Drop the "lpep_" prefix so the timestamp columns are shorter to work with.

df.rename(
    {
        'lpep_pickup_datetime': 'pickup_datetime',
        'lpep_dropoff_datetime': 'dropoff_datetime',
    },
    axis='columns',
    inplace=True,
)

In [59]:
# Parse both timestamp columns from text into datetime64 values.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

Unnamed: 0,tip,vendorid,pickup_datetime,dropoff_datetime,ratecodeid,pulocationid,dolocationid,passenger_count,trip_distance,fare_amount,extra,mta_tax,tolls_amount,improvement_surcharge,total_amount,payment_type
2,0,2,2019-01-01 00:27:11,2019-01-01 00:31:38,1,49,189,2,0.66,4.5,0.5,0.5,0.0,0.3,5.8,1


<class 'pandas.core.frame.DataFrame'>
Int64Index: 393061 entries, 2 to 630917
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   tip                    393061 non-null  int64         
 1   vendorid               393061 non-null  int64         
 2   pickup_datetime        393061 non-null  datetime64[ns]
 3   dropoff_datetime       393061 non-null  datetime64[ns]
 4   ratecodeid             393061 non-null  int64         
 5   pulocationid           393061 non-null  int64         
 6   dolocationid           393061 non-null  int64         
 7   passenger_count        393061 non-null  int64         
 8   trip_distance          393061 non-null  float64       
 9   fare_amount            393061 non-null  float64       
 10  extra                  393061 non-null  float64       
 11  mta_tax                393061 non-null  float64       
 12  tolls_amount           393061 non-null  floa