In [98]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import folium
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this is a blanket filter, so deprecation and pandas
# chained-assignment warnings raised by later cells are hidden as well.
warnings.filterwarnings('ignore')

# Show long string cells (e.g. the route labels) without truncation.
pd.set_option('display.max_colwidth',1000)
# Render floats with five fixed decimals instead of scientific notation.
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# a. For which trips can you confidently use the respective means as measures of central tendency to estimate fare, time taken, etc.?

Trips that have a 'sufficient' sample size.

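If we assume, 
* Margin of Error ($e$) = 5% 
* Confidence Level = 95% => z-score ($z$) = 1.96
* Standard Deviation ($\sigma$) = 0.5 (say)

Thus, sample size $n = \left(\frac{z\,\sigma}{e}\right)^2 = \left(\frac{1.96 \times 0.5}{0.05}\right)^2 \approx 384.16$, which rounds up to 385.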

So, for routes with at least 385 trips, we can confidently use the mean as a measure of central tendency.

# b.	Can we build a model to predict fare and tip amount given pick up and drop off coordinates, time of day and week?

Let's try to predict the fare amount. We will first create a baseline model without any feature engineering. Later on, we will create some features to improve upon this baseline.

In [99]:
# Load the raw trip records; both timestamp columns are parsed as datetimes.
trips = pd.read_csv(
    '../data/trips.csv',
    parse_dates=['pickup_datetime', 'dropoff_datetime'],
)
trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11308202 entries, 0 to 11308201
Data columns (total 20 columns):
 #   Column             Dtype         
---  ------             -----         
 0   medallion          object        
 1   hack_license       object        
 2   vendor_id          object        
 3   rate_code          int64         
 4   pickup_datetime    datetime64[ns]
 5   dropoff_datetime   datetime64[ns]
 6   passenger_count    int64         
 7   trip_time_in_secs  int64         
 8   trip_distance      float64       
 9   pickup_longitude   float64       
 10  pickup_latitude    float64       
 11  dropoff_longitude  float64       
 12  dropoff_latitude   float64       
 13  payment_type       object        
 14  fare_amount        float64       
 15  surcharge          float64       
 16  mta_tax            float64       
 17  tip_amount         float64       
 18  tolls_amount       float64       
 19  total_amount       float64       
dtypes: datetime64[ns](2), 

In [100]:
# Columns to keep: the pickup timestamp, both coordinate pairs and the
# regression target (fare_amount).
cols = [
    'pickup_datetime',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'fare_amount',
]
trips = trips[cols]
print(trips.columns)


Index(['pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'fare_amount'],
      dtype='object')


# Baseline Model


In [101]:
# The raw timestamp cannot be fed to the regressor as-is, so the baseline
# leaves it out and uses only the four coordinate columns.
dataset = trips.drop(columns=['pickup_datetime'])

# Target first, then every remaining column as a feature.
y = dataset[['fare_amount']]
X = dataset.drop(columns=['fare_amount'])

In [102]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
for part in (X_train, X_test):
    print(part.shape)

(9046561, 4)
(2261641, 4)


In [103]:
# Fit an ordinary least-squares model and score it on the held-out rows.
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)
predictions = lin_reg.predict(X_test)

# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
# and removed in 1.6. Taking the square root of the MSE explicitly gives the
# same number and works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f'RMSE : {rmse}')
print(f'R2_SCORE : {r2_score(y_test,predictions)}')

RMSE : 7.038349533267527
R2_SCORE : 0.44142784638695376


Now, let's do some feature engineering and try to create a better model.

In [104]:
# Calculate haversine distance given long,lat of two points

def haversine(df, radius=6371.0):
    """
    Calculate the great circle distance between the pickup and drop-off
    points of each row (coordinates specified in decimal degrees).

    Parameters
    ----------
    df : pandas.DataFrame or mapping of column name -> array-like
        Must provide "pickup_latitude", "pickup_longitude",
        "dropoff_latitude" and "dropoff_longitude".
    radius : float, default 6371.0
        Radius of the sphere; the result is expressed in the same unit.
        The default is the radius of the earth in kilometers. Use 3956 for
        miles.

    Returns
    -------
    Same container type as the input columns (e.g. pandas.Series)
        The distance for every row. Rows with missing coordinates yield NaN.
    """
    lat1 = np.radians(df["pickup_latitude"])
    lat2 = np.radians(df["dropoff_latitude"])

    dlat = np.radians(df["dropoff_latitude"] - df["pickup_latitude"])
    dlong = np.radians(df["dropoff_longitude"] - df["pickup_longitude"])

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlong / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt(1 - a) into NaN.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return c * radius

# Attach the great-circle pickup -> drop-off distance as a new feature column.
trips = trips.assign(distance=haversine(trips))
trips.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,fare_amount,distance
0,2013-04-04 18:47:45,-73.95785,40.76532,-73.97627,40.78565,11.0,2.74126
1,2013-04-04 18:12:01,-73.97812,40.76345,-73.95567,40.77664,10.0,2.39303
2,2013-04-05 02:48:11,-73.98519,40.75493,-73.99078,40.748,4.5,0.90332
3,2013-04-05 06:16:10,-73.98517,40.76342,-73.97887,40.75114,6.5,1.46457
4,2013-04-04 19:20:16,-73.97848,40.76123,-73.98457,40.75941,7.5,0.55112


Now, for the pickup datetime, let's create two features:

1. Day_of_week - This tells which day of the week it is. 0 for Monday and 6 for Sunday. 

2. Hour - Hour of the day in 24hr format

In [105]:
# Derive the weekday (0 = Monday ... 6 = Sunday) and the hour of day (0-23)
# from the pickup timestamp.
pickup_ts = trips['pickup_datetime'].dt
trips = trips.assign(
    day_of_week=pickup_ts.dayofweek,
    hour=pickup_ts.hour,
)

trips.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,fare_amount,distance,day_of_week,hour
0,2013-04-04 18:47:45,-73.95785,40.76532,-73.97627,40.78565,11.0,2.74126,3,18
1,2013-04-04 18:12:01,-73.97812,40.76345,-73.95567,40.77664,10.0,2.39303,3,18
2,2013-04-05 02:48:11,-73.98519,40.75493,-73.99078,40.748,4.5,0.90332,4,2
3,2013-04-05 06:16:10,-73.98517,40.76342,-73.97887,40.75114,6.5,1.46457,4,6
4,2013-04-04 19:20:16,-73.97848,40.76123,-73.98457,40.75941,7.5,0.55112,3,19


# Linear Model

In [106]:
# Engineered features replace the raw coordinates; the target is unchanged.
feature_cols = ['day_of_week', 'hour', 'distance']
X = trips[feature_cols]
y = trips[['fare_amount']]

In [107]:
# Same 80/20 split and seed as the baseline, so the two models are scored on
# the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
for part in (X_train, X_test):
    print(part.shape)

(9046561, 3)
(2261641, 3)


In [108]:
# Refit the linear model on the engineered features and score it.
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)
predictions = lin_reg.predict(X_test)

# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
# and removed in 1.6. Taking the square root of the MSE explicitly gives the
# same number and works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f'RMSE : {rmse}')
print(f'R2_SCORE : {r2_score(y_test,predictions)}')

RMSE : 3.667858227251842
R2_SCORE : 0.8483079917038522


By creating some new features, we were able to improve upon our baseline model tremendously!

We can further improve this score by trying out some other models such as decision trees and xgboost.

# c.	If you were a taxi owner, how would you maximize your earnings in a day?

There are a lot of strategies one can consider to achieve this goal. I will treat this as a problem of finding the routes with the highest total amount earned per unit of trip time.

In [109]:
# Reload the full trip table: `trips` was narrowed to the modelling columns
# earlier in the notebook, and this section needs the amount/duration columns.
trips = pd.read_csv('../data/trips.csv',parse_dates=['pickup_datetime','dropoff_datetime'])

# Take an explicit copy of the columns used for route analysis. Later cells
# add and overwrite columns on `taxi_df`; working on an independent frame
# avoids pandas' chained-assignment ambiguity (and the warning it raises).
taxi_df = trips[['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','total_amount','trip_time_in_secs']].copy()
taxi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11308202 entries, 0 to 11308201
Data columns (total 6 columns):
 #   Column             Dtype  
---  ------             -----  
 0   pickup_latitude    float64
 1   pickup_longitude   float64
 2   dropoff_latitude   float64
 3   dropoff_longitude  float64
 4   total_amount       float64
 5   trip_time_in_secs  int64  
dtypes: float64(5), int64(1)
memory usage: 517.6 MB


In [110]:
# Minimum number of trips a route needs before its mean is trusted
# (the sample size derived in part a).
MIN_TRIPS_PER_ROUTE = 385

coord_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

# Round off lat and long to two decimal places to get places at a more reasonable scale
taxi_df = taxi_df.assign(**{col: taxi_df[col].round(2) for col in coord_cols})

# Create "lat, long" pairs for pickup and dropoff
taxi_df = taxi_df.assign(
    pickup_geocode=taxi_df.pickup_latitude.astype(str) + ', ' + taxi_df.pickup_longitude.astype(str),
    dropoff_geocode=taxi_df.dropoff_latitude.astype(str) + ', ' + taxi_df.dropoff_longitude.astype(str),
)

# A route is the combination of the pickup and dropoff codes
taxi_df = taxi_df.assign(route=taxi_df.pickup_geocode + ' : ' + taxi_df.dropoff_geocode)

# Keep only routes with at least MIN_TRIPS_PER_ROUTE trips. transform('size')
# computes the per-route count in one vectorised pass, whereas
# groupby.filter(lambda ...) calls a Python function once per route.
route_sizes = taxi_df.groupby('route')['route'].transform('size')
taxi_df = taxi_df.loc[route_sizes >= MIN_TRIPS_PER_ROUTE]

# The rounded coordinates now live inside the geocode strings, so drop them.
taxi_df = taxi_df.drop(columns=coord_cols)

taxi_df.head()

Unnamed: 0,total_amount,trip_time_in_secs,pickup_geocode,dropoff_geocode,route
0,15.0,759,"40.77, -73.96","40.79, -73.98","40.77, -73.96 : 40.79, -73.98"
1,14.95,799,"40.76, -73.98","40.78, -73.96","40.76, -73.98 : 40.78, -73.96"
2,6.6,189,"40.75, -73.99","40.75, -73.99","40.75, -73.99 : 40.75, -73.99"
3,8.0,354,"40.76, -73.99","40.75, -73.98","40.76, -73.99 : 40.75, -73.98"
4,11.25,635,"40.76, -73.98","40.76, -73.98","40.76, -73.98 : 40.76, -73.98"


In [111]:
# Earnings rate of each trip: total amount per second of trip time, scaled
# up to a per-hour figure.
hourly_rate = taxi_df['total_amount'].div(taxi_df['trip_time_in_secs']).mul(3600)
taxi_df = taxi_df.assign(amount_per_hour=hourly_rate)

taxi_df['amount_per_hour'].describe()

count   10163486.00000
mean          83.93872
std          569.12722
min            1.29310
25%           57.96064
50%           70.50000
75%           88.42105
max       249768.00000
Name: amount_per_hour, dtype: float64

In [112]:
# Drop the top 1% of hourly rates: compute the 99th percentile and keep only
# the trips strictly below it.
p_upper = np.percentile(taxi_df['amount_per_hour'], 99.0)
print(p_upper)

below_cutoff = taxi_df['amount_per_hour'] < p_upper
taxi_df = taxi_df.loc[below_cutoff]

taxi_df.describe()

198.1651376146789


Unnamed: 0,total_amount,trip_time_in_secs,amount_per_hour
count,10061775.0,10061775.0,10061775.0
mean,12.78816,673.31872,75.94203
std,9.54137,490.3944,25.60083
min,2.5,55.0,1.2931
25%,7.5,360.0,57.82677
50%,10.1,540.0,70.03891
75%,14.0,840.0,87.33333
max,205.0,10800.0,198.11321


In [113]:
# Mean hourly earnings per route, then the ten routes with the highest mean.
route_means = taxi_df.groupby('route')['amount_per_hour'].mean()
top_routes = route_means.sort_values(ascending=False).head(10)

top_routes

route
40.77, -73.87 : 40.8, -73.94    131.04217
40.78, -73.95 : 40.65, -73.78   130.55232
40.78, -73.95 : 40.77, -73.87   129.75344
40.78, -73.95 : 40.64, -73.79   124.17950
40.78, -73.95 : 40.77, -73.86   124.13070
40.77, -73.95 : 40.65, -73.78   124.03132
40.77, -73.87 : 40.78, -73.95   123.34556
40.77, -73.95 : 40.77, -73.87   122.91817
40.74, -73.98 : 40.65, -73.78   122.82516
40.65, -73.78 : 40.78, -73.95   122.64630
Name: amount_per_hour, dtype: float64

In [114]:
# Draw each of the top routes as a straight pickup -> drop-off segment.
m = folium.Map(location=[40.730610, -73.935242], zoom_start=11, tiles='cartodbpositron')

# PolyLine colours are passed to the browser as CSS colours. 'darkpurple' is
# only a folium marker-icon colour, not a CSS colour name, so a line drawn
# with it gets an invalid stroke; 'darkmagenta' is the valid CSS replacement.
colors = ['red', 'blue', 'green', 'purple', 'orange', 'beige', 'darkmagenta', 'pink', 'gray', 'black']

for col, route in zip(colors, top_routes.index):
    # Route labels have the form "lat, long : lat, long".
    pickup, dropoff = (
        tuple(float(v) for v in point.split(','))
        for point in route.split(':')
    )
    folium.PolyLine([pickup, dropoff],
                    color=col,
                    weight=5,
                    opacity=0.8).add_to(m)

In [115]:
# Bare expression: the folium map becomes the cell's rendered output.
m

# d.	If you were a taxi owner, how would you minimize your work time while retaining the average wages earned by a typical taxi in the dataset?

Let's calculate how much a taxi driver earns in a day.

In [116]:
# One row per trip: the driver (hack_license), the calendar date of the
# pickup, and the amount charged.
taxi = (
    trips[['hack_license', 'pickup_datetime', 'total_amount']]
    .assign(date=lambda frame: frame['pickup_datetime'].dt.date)
    .drop(columns=['pickup_datetime'])
)

# What is the average daily earnings of a taxi driver?
# Sum the amounts per (driver, date), then average those daily totals.
daily_totals = taxi.groupby(['hack_license', 'date'])['total_amount'].sum()
print(daily_totals.mean())

224.76432068769833


In [117]:
# This is our goal to reach: the average daily earnings per driver printed by
# the previous cell (about 224.76), rounded up.
target = 225.0

Let's use the dataframe from the last question.

In [118]:
# Preview the per-trip frame built for question c (already filtered by route
# size and trimmed at the 99th percentile of amount_per_hour).
taxi_df.head()

Unnamed: 0,total_amount,trip_time_in_secs,pickup_geocode,dropoff_geocode,route,amount_per_hour
0,15.0,759,"40.77, -73.96","40.79, -73.98","40.77, -73.96 : 40.79, -73.98",71.14625
1,14.95,799,"40.76, -73.98","40.78, -73.96","40.76, -73.98 : 40.78, -73.96",67.3592
2,6.6,189,"40.75, -73.99","40.75, -73.99","40.75, -73.99 : 40.75, -73.99",125.71429
3,8.0,354,"40.76, -73.99","40.75, -73.98","40.76, -73.99 : 40.75, -73.98",81.35593
4,11.25,635,"40.76, -73.98","40.76, -73.98","40.76, -73.98 : 40.76, -73.98",63.77953


In [119]:
# Mean hourly earnings per route. The string alias 'mean' is used instead of
# np.mean: passing NumPy callables to groupby.agg is deprecated since
# pandas 2.1 (it raises a FutureWarning), while the alias gives the same result.
min_time = taxi_df.groupby(['route']).agg({'amount_per_hour': 'mean'})
min_time

Unnamed: 0_level_0,amount_per_hour
route,Unnamed: 1_level_1
"40.64, -73.78 : 40.64, -73.78",95.62308
"40.64, -73.78 : 40.72, -73.99",99.21515
"40.64, -73.78 : 40.73, -73.99",103.22144
"40.64, -73.78 : 40.74, -73.98",113.38869
"40.64, -73.78 : 40.74, -73.99",98.71940
...,...
"40.83, -73.95 : 40.82, -73.95",108.70832
"40.84, -73.94 : 40.78, -73.98",88.20658
"40.84, -73.94 : 40.81, -73.96",75.82530
"40.84, -73.94 : 40.85, -73.94",93.99546


In [120]:
# Minutes of trip time needed to reach the target at each route's mean
# hourly rate: target / rate gives hours, times 60 gives minutes.
min_time['time_to_target'] = min_time['amount_per_hour'].rdiv(target).mul(60)

# Keep the ten routes that reach the target fastest.
min_time = min_time.sort_values(by='time_to_target').head(10)

min_time

Unnamed: 0_level_0,amount_per_hour,time_to_target
route,Unnamed: 1_level_1,Unnamed: 2_level_1
"40.77, -73.87 : 40.8, -73.94",131.04217,103.02027
"40.78, -73.95 : 40.65, -73.78",130.55232,103.40682
"40.78, -73.95 : 40.77, -73.87",129.75344,104.04348
"40.78, -73.95 : 40.64, -73.79",124.1795,108.7136
"40.78, -73.95 : 40.77, -73.86",124.1307,108.75634
"40.77, -73.95 : 40.65, -73.78",124.03132,108.84348
"40.77, -73.87 : 40.78, -73.95",123.34556,109.4486
"40.77, -73.95 : 40.77, -73.87",122.91817,109.82916
"40.74, -73.98 : 40.65, -73.78",122.82516,109.91233
"40.65, -73.78 : 40.78, -73.95",122.6463,110.07262


In [121]:
# Draw each of the fastest-to-target routes as a straight pickup -> drop-off
# segment.
m = folium.Map(location=[40.730610, -73.935242], zoom_start=10, tiles='cartodbpositron')

# PolyLine colours are passed to the browser as CSS colours. 'darkpurple' is
# only a folium marker-icon colour, not a CSS colour name, so a line drawn
# with it gets an invalid stroke; 'darkmagenta' is the valid CSS replacement.
colors = ['red', 'blue', 'green', 'purple', 'orange', 'beige', 'darkmagenta', 'pink', 'gray', 'black']

for col, route in zip(colors, min_time.index):
    # Route labels have the form "lat, long : lat, long".
    pickup, dropoff = (
        tuple(float(v) for v in point.split(','))
        for point in route.split(':')
    )
    folium.PolyLine([pickup, dropoff],
                    color=col,
                    weight=5,
                    opacity=0.8).add_to(m)

In [122]:
# Bare expression: the folium map becomes the cell's rendered output.
m

# e.	If you run a taxi company with 10 taxis, how would you maximize your earnings?

1. Ensure taxis are available at most popular dropoff and pickup locations especially in Manhattan area.

2. Direct taxi drivers towards the routes with the highest earnings per hour.

3. Since evening times and weekend are busiest, ensure any preventive maintenance is done at other times and days.

4. Since some of these popular routes are overcrowded, divert some of the taxis (say, 4 of the 10) to routes that yield high total revenue from many shorter individual trips.