# Data Science Take Home Test

## Initialization

Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import time
# Wall-clock reference for timing the notebook.
# NOTE(review): `start_time` is re-assigned in cell 51, so the elapsed time
# printed at the end does not measure from this point.
start_time = time.time()

Load data

In [2]:
# Load the June 2017 yellow-taxi trip records.
# The path is a raw string: the Windows backslashes would otherwise be parsed
# as (invalid) escape sequences such as `\J`, `\C` and `\y`, which Python
# reports as a DeprecationWarning / SyntaxWarning.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook can run elsewhere.
df_mes = pd.read_csv(r'I:\Javier Resano\Curriculum\Caso Carto\yellow_tripdata_2017-06.csv', sep=',')

## Data Exploration and Cleaning

In [3]:
# Column names, dtypes and memory footprint of the freshly loaded frame.
df_mes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9656993 entries, 0 to 9656992
Data columns (total 17 columns):
VendorID                 int64
tpep_pickup_datetime     object
tpep_dropoff_datetime    object
passenger_count          int64
trip_distance            float64
RatecodeID               int64
store_and_fwd_flag       object
PULocationID             int64
DOLocationID             int64
payment_type             int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
dtypes: float64(8), int64(6), object(3)
memory usage: 1.2+ GB


**To note:**
-  "tpep_pickup_datetime" and "tpep_dropoff_datetime" should be datetime
- VendorID, PULocationID, DOLocationID, payment_type and RatecodeID are categorical codes stored as integers, not quantities. I'll transform them to strings

In [4]:
# Timestamps arrive as text: parse them into proper datetimes.
for when_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df_mes[when_col] = pd.to_datetime(df_mes[when_col])

# Identifier-like columns are codes, not quantities: store them as strings.
for code_col in ('PULocationID', 'DOLocationID', 'payment_type', 'RatecodeID', 'VendorID'):
    df_mes[code_col] = df_mes[code_col].astype(str)

In [5]:
# Summary statistics of the numeric columns, printed with two decimals.
with pd.option_context('float_format', '{:.2f}'.format): print(df_mes.describe()) # 9,656,993 rows at this point (see `count` in the output)

       passenger_count  trip_distance  fare_amount      extra    mta_tax  \
count       9656993.00     9656993.00   9656993.00 9656993.00 9656993.00   
mean              1.62           2.98        13.29       0.34       0.50   
std               1.26           5.70       215.17       0.46       0.08   
min               0.00           0.00      -550.00     -50.56      -0.50   
25%               1.00           1.00         6.50       0.00       0.50   
50%               1.00           1.67         9.50       0.00       0.50   
75%               2.00           3.10        15.00       0.50       0.50   
max               9.00        9496.98    630461.82      22.50     140.00   

       tip_amount  tolls_amount  improvement_surcharge  total_amount  
count  9656993.00    9656993.00             9656993.00    9656993.00  
mean         1.88          0.34                   0.30         16.65  
std          2.70          2.02                   0.01        215.34  
min        -74.00        -12.50

** Findings. Data issues: **
- trip_distance sometimes is 0 (no travel?)
- fare_amount sometimes contains negative numbers
- extra sometimes contains negative numbers. Max value is strange too. Does not seem to match Data Dictionary explanation (only 0.5 or 1 charges)
- mta_tax sometimes contains negative numbers. Max value is strange too. Does not seem to match Data Dictionary explanation (only 0.5 charges)
- total_amount sometimes contains negative numbers
- tip_amount sometimes contains negative numbers
- tolls_amount sometimes contains negative numbers
- improvement_surcharge sometimes contains negative numbers. Max value is strange too. Does not seem to match Data Dictionary explanation (only 0.3 charges)
   

Since we have a lot of data, I will remove wrong data. <br>
Initial number of rows:

In [6]:
# Row count before any cleaning.
len(df_mes)

9656993

In [7]:
# Keep only trips that actually covered some distance.
df_mes = df_mes.loc[df_mes['trip_distance'].gt(0)]
len(df_mes)

9592473

In [8]:
# Keep only trips with a strictly positive fare.
df_mes = df_mes.loc[df_mes['fare_amount'].gt(0)]
len(df_mes)

9587132

In [9]:
# Drop rows whose `extra` charge is negative.
df_mes = df_mes.loc[df_mes['extra'].ge(0)]
len(df_mes)

9587127

In [10]:
# Drop rows whose `mta_tax` is negative.
df_mes = df_mes.loc[df_mes['mta_tax'].ge(0)]
len(df_mes)

9587127

In [11]:
# Drop rows whose `total_amount` is negative.
df_mes = df_mes.loc[df_mes['total_amount'].ge(0)]
len(df_mes)

9587127

In [12]:
# Drop rows whose `tip_amount` is negative.
df_mes = df_mes.loc[df_mes['tip_amount'].ge(0)]
len(df_mes)

9587127

In [13]:
# Drop rows whose `tolls_amount` is negative.
df_mes = df_mes.loc[df_mes['tolls_amount'].ge(0)]
len(df_mes)

9587127

In [14]:
# Drop rows whose `improvement_surcharge` is negative.
df_mes = df_mes.loc[df_mes['improvement_surcharge'].ge(0)]
len(df_mes)

9587127

Now, I'll check and delete strange cases:

In [15]:
# How many rows carry a surcharge other than the expected 0.3?
len(df_mes.loc[df_mes['improvement_surcharge'].ne(0.3)])

1099

In [16]:
# Keep only rows with the expected 0.3 surcharge.
df_mes = df_mes.loc[df_mes['improvement_surcharge'].eq(0.3)]
len(df_mes)

9586028

In [17]:
# Rows whose `mta_tax` is neither 0.5 nor 0.
len(df_mes.loc[~df_mes['mta_tax'].isin([0.5, 0])])

0

In [18]:
# Rows whose `extra` is not one of 0, 0.5, 1 or 1.5.
len(df_mes.loc[~df_mes['extra'].isin([0, 0.5, 1, 1.5])])

40007

In [19]:
# Which unexpected `extra` values appear?
df_mes.loc[~df_mes['extra'].isin([0, 0.5, 1, 1.5]), 'extra'].unique()

array([4.5 , 4.54, 0.02, 2.  , 2.5 , 1.01, 0.6 , 0.3 ])

In [20]:
# Keep only rows whose `extra` is one of 0, 0.5, 1 or 1.5.
df_mes = df_mes.loc[df_mes['extra'].isin([0, 0.5, 1, 1.5])]
len(df_mes)

9546021

All in all we have gone from 9656993 to 9546021 rows. We are keeping 98.9% of the initial dataset

### Feature Engineering
In order to improve dataset quality, I am going to create some new features based on the existing ones

1. Travel time

In [21]:
# Trip duration in seconds: drop-off minus pick-up, expressed as a float.
df_mes['travel_time'] = (
    df_mes['tpep_dropoff_datetime'] - df_mes['tpep_pickup_datetime']
).dt.total_seconds()

2. Average speed

In [22]:
# Distance per second scaled to distance per hour (travel_time is in seconds).
# A zero travel_time yields inf; those rows are inspected further down.
df_mes['average_speed'] = df_mes['trip_distance'].div(df_mes['travel_time']).mul(3600)

3. Hour of the day

In [23]:
# Hour of day (0-23) for pick-up and drop-off.
df_mes['tpep_pickup_hour'] = df_mes['tpep_pickup_datetime'].dt.hour
df_mes['tpep_dropoff_hour'] = df_mes['tpep_dropoff_datetime'].dt.hour

4. Day of the week

In [24]:
# Day of week (Monday=0 ... Sunday=6) for pick-up and drop-off.
df_mes['tpep_pickup_weekday'] = df_mes['tpep_pickup_datetime'].dt.weekday
df_mes['tpep_dropoff_weekday'] = df_mes['tpep_dropoff_datetime'].dt.weekday

And we check their values:

In [25]:
# Summary statistics of the two engineered columns, printed with two decimals.
with pd.option_context('float_format', '{:.2f}'.format): print(df_mes.loc[:,['travel_time','average_speed']].describe()) # 9,546,021 rows at this point (see `count` in the output)

       travel_time  average_speed
count   9546021.00     9546021.00
mean       1005.60            inf
std        3313.02            nan
min           0.00           0.00
25%         405.00           7.33
50%         678.00          10.00
75%        1116.00          13.56
max      864459.00            inf


Some new issues with data arise:
- travel_time must be checked for negative values (the summary above shows a minimum of 0, so zero-length trips certainly exist)
- "average_speed == inf" happens when travel_time is 0, which means there was no trip
- Very high average speeds (> 100 mph) are also a mark of wrong data

In [26]:
# Implausible speeds: infinite (zero travel time) or above 100.
# `inf > 100` is True, so a single comparison covers both cases.
df_mes.loc[df_mes['average_speed'].gt(100)]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,tip_amount,tolls_amount,improvement_surcharge,total_amount,travel_time,average_speed,tpep_pickup_hour,tpep_dropoff_hour,tpep_pickup_weekday,tpep_dropoff_weekday
5,2,2017-06-01 00:00:00,2017-06-01 00:00:00,2,17.57,2,N,132,74,1,...,11.71,5.76,0.3,70.27,0.0,inf,0,0,3,3
6,2,2017-06-01 00:00:00,2017-06-01 00:00:00,5,13.34,1,N,138,249,1,...,10.71,5.76,0.3,64.27,0.0,inf,0,0,3,3
69,1,2017-06-01 00:00:18,2017-06-01 00:00:30,1,1.30,1,N,138,138,1,...,37.00,0.00,0.3,40.80,12.0,3.900000e+02,0,0,3,3
2568,1,2017-06-09 18:12:14,2017-06-09 18:12:19,1,8.20,1,N,164,164,1,...,1.25,0.00,0.3,5.55,5.0,5.904000e+03,18,18,4,4
3021,1,2017-06-09 18:15:26,2017-06-09 18:15:42,1,2.20,1,N,236,43,3,...,0.00,0.00,0.3,11.30,16.0,4.950000e+02,18,18,4,4
3601,1,2017-06-09 18:19:13,2017-06-09 18:19:21,1,8.20,1,N,263,263,1,...,2.00,0.00,0.3,6.30,8.0,3.690000e+03,18,18,4,4
6533,1,2017-06-09 18:38:23,2017-06-09 18:38:26,1,1.10,1,N,161,161,2,...,0.00,0.00,0.3,4.30,3.0,1.320000e+03,18,18,4,4
9149,1,2017-06-09 18:54:54,2017-06-09 18:54:57,1,16.50,1,N,132,132,2,...,0.00,0.00,0.3,4.30,3.0,1.980000e+04,18,18,4,4
9252,1,2017-06-09 18:55:28,2017-06-09 18:55:42,1,0.70,1,N,100,100,3,...,0.00,0.00,0.3,4.30,14.0,1.800000e+02,18,18,4,4
10566,1,2017-06-29 16:29:06,2017-06-29 16:29:32,1,6.30,5,N,132,132,4,...,0.00,0.00,0.3,70.30,26.0,8.723077e+02,16,16,3,3


In [27]:
# Keep trips with a finite speed below 100 (`inf < 100` is False, so
# infinite speeds are excluded by the same comparison).
df_mes = df_mes.loc[df_mes['average_speed'].lt(100)]
len(df_mes)

9538018

In [28]:
# `travel_time` was already converted to float seconds when it was created
# (cell 21), so the original round trip float -> timedelta64[s] -> seconds was
# redundant. Convert only if the column is still a Timedelta; this keeps the
# cell idempotent and safe to re-run.
if pd.api.types.is_timedelta64_dtype(df_mes['travel_time']):
    df_mes['travel_time'] = df_mes['travel_time'].dt.total_seconds()

In [29]:
# Any trips that end before they start?
df_mes.loc[df_mes['travel_time'].lt(0)]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,tip_amount,tolls_amount,improvement_surcharge,total_amount,travel_time,average_speed,tpep_pickup_hour,tpep_dropoff_hour,tpep_pickup_weekday,tpep_dropoff_weekday


In [30]:
# Keep only trips with a strictly positive duration.
df_mes = df_mes.loc[df_mes['travel_time'].gt(0)]
len(df_mes)

9538018

All in all we have gone from 9656993 to 9538018 rows. We are still keeping **98.8% of the initial dataset**

### Filling out code variables
In order to encode all code variables (i.e. those that take string values, not numeric), and not miss any column if in the dataset a variable doesn't have a certain code, I am adding at the end of the dataframe new rows with all the values for these code variables. I will remove these rows from the dataframe once they are encoded.

In [35]:
# Distinct payment-type codes present in the data, in sorted order.
print(sorted(set(df_mes['payment_type'])))

['1', '2', '3', '4']


In [36]:
# Distinct rate codes present in the data, in sorted order.
print(sorted(set(df_mes['RatecodeID'])))

['1', '2', '3', '4', '5', '6', '99']


In [37]:
# Number of distinct pick-up locations (shown as a 1-tuple shape).
pd.unique(df_mes['PULocationID']).shape

(256,)

In [38]:
# Pick-up location codes at or above 266 (beyond the 265 listed zones).
df_mes.loc[df_mes['PULocationID'].astype('int64').ge(266)]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,tip_amount,tolls_amount,improvement_surcharge,total_amount,travel_time,average_speed,tpep_pickup_hour,tpep_dropoff_hour,tpep_pickup_weekday,tpep_dropoff_weekday


In [39]:
# Number of distinct drop-off locations (shown as a 1-tuple shape).
pd.unique(df_mes['DOLocationID']).shape

(262,)

In [40]:
# Drop-off location codes at or above 266 (beyond the 265 listed zones).
df_mes.loc[df_mes['DOLocationID'].astype('int64').ge(266)]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,tip_amount,tolls_amount,improvement_surcharge,total_amount,travel_time,average_speed,tpep_pickup_hour,tpep_dropoff_hour,tpep_pickup_weekday,tpep_dropoff_weekday


All in all, we see that "payment_type" is missing 2 values (5 & 6), the locations also miss some of the 265 listed in the document, and "RatecodeID" has an extra one: 99. As it is unknown to us, we will get rid of those rows (we can see there are only a few of them)

In [41]:
# Rows using the undocumented rate code '99'.
df_mes.loc[df_mes['RatecodeID'].eq('99')]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,tip_amount,tolls_amount,improvement_surcharge,total_amount,travel_time,average_speed,tpep_pickup_hour,tpep_dropoff_hour,tpep_pickup_weekday,tpep_dropoff_weekday
648052,1,2017-06-11 01:36:50,2017-06-11 01:59:29,1,10.6,99,N,231,223,1,...,6.55,0.0,0.3,39.35,1359.0,28.07947,1,1,6,6
3265343,1,2017-06-13 19:38:29,2017-06-13 19:47:24,1,2.2,99,N,100,237,1,...,2.15,0.0,0.3,12.95,535.0,14.803738,19,19,1,1
4155606,1,2017-06-16 08:54:45,2017-06-16 09:16:41,1,2.5,99,N,161,79,1,...,3.05,0.0,0.3,18.35,1316.0,6.838906,8,9,4,4
7453036,1,2017-06-25 18:04:40,2017-06-25 18:13:21,1,1.6,99,N,162,141,1,...,1.75,0.0,0.3,10.55,521.0,11.055662,18,18,6,6


In [42]:
# Discard the rows with the undocumented rate code '99'.
df_mes = df_mes.loc[df_mes['RatecodeID'].ne('99')]

## Model Building

I am importing the model created for March

In [43]:
# The full frame does not fit comfortably in memory, so evaluate on a
# reproducible random sample of one million rows.
df_mes2 = df_mes.sample(n=10**6, random_state=0)

In [44]:
# Number of real rows; used later to strip the padding rows again.
shape1 = df_mes2.shape[0]

# Padding frames: one row per known code of each categorical column, so that
# one-hot encoding produces a column for every code even if the sample lacks it.
df2 = pd.DataFrame([['1'], ['2']], columns=(['VendorID']))
location_codes = [str(x) for x in range(1,266)]
df3 = pd.DataFrame({'PULocationID':location_codes, 'DOLocationID':location_codes})
df4 = pd.DataFrame([['Y'], ['N']], columns=(['store_and_fwd_flag']))
small_codes = [str(x) for x in range(1,7)]
df5 = pd.DataFrame({'payment_type':small_codes, 'RatecodeID':small_codes})

# DataFrame.append is deprecated (removed in pandas 2.0). A single pd.concat
# produces the same frame and copies the large sample once instead of four times.
df_mes2 = pd.concat([df_mes2, df2, df3, df4, df5], sort=False, ignore_index=True)

# Fill the gaps the padding rows leave in every other column.
# NOTE(review): this fills with the integer 1, not the string '1', so the
# string-coded columns end up holding mixed int/str values. Left unchanged
# because the stored `lin_reg` presumably expects the resulting column
# layout - verify against the notebook that trained the model.
df_mes2 = df_mes2.fillna(1)

In [45]:
# Drop the raw timestamps and one-hot encode the remaining string columns
# (first level of each column dropped).
df_mes2 = pd.get_dummies(
    df_mes2.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime']),
    drop_first=True,
)

In [46]:
# Keep only the first `shape1` rows, i.e. discard the padding rows at the end.
df_mes2 = df_mes2.iloc[:shape1]

In [47]:
# Every column except the target is a model input.
cols = df_mes2.columns.tolist()
cols.remove('tip_amount')

In [48]:
# Divide the dataset into input variables and output.
# The frame was already reduced to the sampled rows (sample + padding strip),
# so select all rows here instead of repeating the sample size as a
# hard-coded label slice; this also matches how X2/Y2 are built further down.
X = df_mes2.loc[:, cols]
Y = df_mes2.loc[:, ['tip_amount']]

### Model: Linear Regression
We start with a simple model in order to get a baseline to compare the rest

In [49]:
# Restore `lin_reg` from IPython's %store database. The model is not fitted in
# this notebook (the text above says it was created for March), so on a fresh
# kernel this only works if the variable was stored beforehand.
%store -r lin_reg

In [50]:
from sklearn import metrics

# Score the restored linear model on the sampled rows.
Y_lin_reg = lin_reg.predict(X)

rmse = np.sqrt(metrics.mean_squared_error(Y, Y_lin_reg))
explained = metrics.explained_variance_score(Y, Y_lin_reg)
print('RMSE:', rmse)
print('Variance explained: ', explained)

RMSE: 0.25377465589468884
Variance explained:  0.9899704299408163


This model explains 99.0% of the variance, and predicts the tip with a precision of around 25 cents (RMSE)

Another test:

In [51]:
# Restart the timer: the elapsed time printed at the end of the notebook
# covers only the steps from this cell onwards.
start_time = time.time()

In [52]:
# Second evaluation set: one million consecutive rows (by position) of the
# cleaned data.
df_mes2 = df_mes.iloc[5000000:6000000,:]

# Number of real rows; used below to strip the padding rows again.
shape1 = df_mes2.shape[0]

# Padding frames: one row per known code of each categorical column, so that
# one-hot encoding produces a column for every code even if the slice lacks it.
df2 = pd.DataFrame([['1'], ['2']], columns=(['VendorID']))
location_codes = [str(x) for x in range(1,266)]
df3 = pd.DataFrame({'PULocationID':location_codes, 'DOLocationID':location_codes})
df4 = pd.DataFrame([['Y'], ['N']], columns=(['store_and_fwd_flag']))
small_codes = [str(x) for x in range(1,7)]
df5 = pd.DataFrame({'payment_type':small_codes, 'RatecodeID':small_codes})

# DataFrame.append is deprecated (removed in pandas 2.0). A single pd.concat
# produces the same frame and copies the large slice once instead of four times.
df_mes2 = pd.concat([df_mes2, df2, df3, df4, df5], sort=False, ignore_index=True)

# Fill the gaps the padding rows leave in every other column.
# NOTE(review): this fills with the integer 1, not the string '1', so the
# string-coded columns end up holding mixed int/str values. Left unchanged
# because the stored `lin_reg` presumably expects the resulting column
# layout - verify against the notebook that trained the model.
df_mes2 = df_mes2.fillna(1)

# Drop the raw timestamps and one-hot encode the remaining string columns.
df_mes2 = df_mes2.drop(['tpep_pickup_datetime','tpep_dropoff_datetime'], axis=1)
df_mes2 = pd.get_dummies(df_mes2, drop_first=True)

# Discard the padding rows again.
df_mes2 = df_mes2.head(shape1)

# Every column except the target is a model input.
cols = list(df_mes2.columns)
cols.remove('tip_amount')

# Divide the dataset into input variables and output.
# (Working with the whole set raised a memory error, hence the 1.000.000-row slice.)
X2 = df_mes2.loc[:,cols]
Y2 = df_mes2.loc[:,['tip_amount']]



In [53]:
from sklearn import metrics

# Score the restored linear model on the second evaluation set.
Y_lin_reg = lin_reg.predict(X2)

rmse = np.sqrt(metrics.mean_squared_error(Y2, Y_lin_reg))
explained = metrics.explained_variance_score(Y2, Y_lin_reg)
print('RMSE:', rmse)
print('Variance explained: ', explained)

RMSE: 0.26076856499493123
Variance explained:  0.9892641034248001


In [54]:
# Seconds elapsed since `start_time` was last set.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

--- 23.634079933166504 seconds ---
