In [42]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from math import radians,cos,sin,sqrt
import warnings
warnings.filterwarnings('ignore')


# Importing Dataset

In [43]:
df=pd.read_csv(r'nyc_taxi_trip_duration.csv')
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729322 entries, 0 to 729321
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  729322 non-null  object 
 1   vendor_id           729322 non-null  int64  
 2   pickup_datetime     729322 non-null  object 
 3   dropoff_datetime    729322 non-null  object 
 4   passenger_count     729322 non-null  int64  
 5   pickup_longitude    729322 non-null  float64
 6   pickup_latitude     729322 non-null  float64
 7   dropoff_longitude   729322 non-null  float64
 8   dropoff_latitude    729322 non-null  float64
 9   store_and_fwd_flag  729322 non-null  object 
 10  trip_duration       729322 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 61.2+ MB


In [44]:
#data=data.iloc[0 : 10000]

In [45]:
df.shape

(729322, 11)

In [46]:
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,1635
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.96167,40.75972,-73.956779,40.780628,N,1141
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.01712,40.708469,-73.988182,40.740631,N,848


In [47]:
df.isnull().sum()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

In [48]:
df['trip_duration_hour'] = df['trip_duration'].apply(lambda x: x/3600)
df.drop(columns=['trip_duration'], inplace=True)

In [49]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], format = '%Y-%m-%d %H:%M:%S')
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'], format = '%Y-%m-%d %H:%M:%S')

In [50]:
df['pickup_hour'] = df['pickup_datetime'].dt.hour
df['pickup_weekday'] = df['pickup_datetime'].dt.weekday
df['pickup_day'] = df['pickup_datetime'].dt.day
df['pickup_month'] = df['pickup_datetime'].dt.month
df['pickup_year'] = df['pickup_datetime'].dt.year
df['pickup_date'] = df['pickup_datetime'].dt.date

df['dropoff_hour'] = df['dropoff_datetime'].dt.hour
df['dropoff_weekday'] = df['dropoff_datetime'].dt.weekday
df['dropoff_day'] = df['dropoff_datetime'].dt.day
df['dropoff_month'] = df['dropoff_datetime'].dt.month
df['dropoff_year'] = df['dropoff_datetime'].dt.year
df['dropoff_date'] = df['dropoff_datetime'].dt.date

In [51]:
def time_of_day(x):
    # to calculate what time of it is now
    if x in range(6,12):
        return 'Morning'
    elif x in range(12,16):
        return 'Afternoon'
    elif x in range(16,22):
        return 'Evening'
    else:
        return 'Late night'

df['pickup_time_of_day'] = df['pickup_hour'].apply(time_of_day)
df['dropoff_time_of_day'] = df['dropoff_hour'].apply(time_of_day)

In [52]:
df.drop(columns=['pickup_hour','pickup_weekday','pickup_day','pickup_month','pickup_year','dropoff_hour','dropoff_weekday','dropoff_day','dropoff_month','dropoff_year'], inplace=True)

In [53]:
#simple predictive model
df["trip_duration_hour_mean"]=df["trip_duration_hour"].mean()
df["trip_duration_hour_mean"].head()

0    0.264508
1    0.264508
2    0.264508
3    0.264508
4    0.264508
Name: trip_duration_hour_mean, dtype: float64

# 1.Choose the most suitable evaluation metric and state why you chose it.

In [54]:
# We have  chosen root mean squared error because:
# 1.RMSE is better in terms of reflecting performance when dealing with large error values
# 2.RMSE is more useful when lower residual values are preferred.
# 3.RMSE penalize large errors.

# 2.Build a benchmark model for the given dataset.


# shuffling and creating Train and Test Dataset

In [55]:
#importing the shuffle library 
from sklearn.utils import shuffle

# Shuffling the Dataset
df = shuffle(df, random_state = 42)

#creating 4 divisions
div = int(df.shape[0]/4)

# 3 parts to train set and 1 part to test set
train = df.loc[:3*div+1,:]
test = df.loc[3*div+1:]


In [56]:
div

182330

In [57]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration_hour,pickup_date,dropoff_date,pickup_time_of_day,dropoff_time_of_day,trip_duration_hour_mean
469114,id2380741,2,2016-05-21 10:40:14,2016-05-21 10:51:11,1,-73.981796,40.762035,-73.972267,40.781265,N,0.1825,2016-05-21,2016-05-21,Morning,Morning,0.264508
694852,id3946961,2,2016-01-08 18:49:27,2016-01-08 18:52:42,5,-73.980965,40.747677,-73.982704,40.741161,N,0.054167,2016-01-08,2016-01-08,Evening,Evening,0.264508
696324,id0833913,1,2016-05-22 00:54:10,2016-05-22 01:08:10,1,-73.951065,40.782722,-73.867691,40.833664,N,0.233333,2016-05-22,2016-05-22,Late night,Late night,0.264508
356496,id1336849,1,2016-06-11 10:32:12,2016-06-11 10:38:50,1,-73.987625,40.762791,-73.973518,40.762909,N,0.110556,2016-06-11,2016-06-11,Morning,Morning,0.264508
645318,id1610858,1,2016-04-03 10:45:51,2016-04-03 10:57:13,3,-73.964333,40.792503,-73.988609,40.758369,N,0.189444,2016-04-03,2016-04-03,Morning,Morning,0.264508


In [58]:
test.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration_hour,pickup_date,dropoff_date,pickup_time_of_day,dropoff_time_of_day,trip_duration_hour_mean
546991,id2240736,1,2016-05-25 07:59:16,2016-05-25 08:05:02,1,-73.991364,40.73259,-74.000526,40.742283,N,0.096111,2016-05-25,2016-05-25,Morning,Morning,0.264508
43126,id1423404,1,2016-01-18 12:17:13,2016-01-18 12:21:13,2,-73.966225,40.768059,-73.967606,40.763073,N,0.066667,2016-01-18,2016-01-18,Afternoon,Afternoon,0.264508
641450,id1317268,2,2016-03-02 18:39:01,2016-03-02 18:50:12,1,-73.994926,40.766018,-74.004219,40.742523,N,0.186389,2016-03-02,2016-03-02,Evening,Evening,0.264508
611380,id3335546,1,2016-04-06 19:17:20,2016-04-06 19:18:03,1,-73.974388,40.793781,-73.976006,40.792339,N,0.011944,2016-04-06,2016-04-06,Evening,Evening,0.264508
62690,id2174190,2,2016-06-21 18:35:31,2016-06-21 18:40:56,3,-73.96344,40.798557,-73.979736,40.777878,N,0.090278,2016-06-21,2016-06-21,Evening,Evening,0.264508


In [59]:
# storing simple mean in a new column in the test set as "simple_mean"
df["trip_duration_hour_mean"]=df["trip_duration_hour"].mean()
df["trip_duration_hour_mean"]

469114    0.264508
694852    0.264508
696324    0.264508
356496    0.264508
645318    0.264508
            ...   
259178    0.264508
365838    0.264508
131932    0.264508
671155    0.264508
121958    0.264508
Name: trip_duration_hour_mean, Length: 729322, dtype: float64

In [60]:
from sklearn.metrics import mean_squared_error as mse
from math import sqrt

trip_mean_error = sqrt((mse(test['trip_duration_hour'] , test['trip_duration_hour_mean'])))
trip_mean_error

0.8904067655425832

In [61]:

pickup = pd.pivot_table(train, values='trip_duration_hour', index = ['pickup_time_of_day'], aggfunc=np.mean)
pickup

Unnamed: 0_level_0,trip_duration_hour
pickup_time_of_day,Unnamed: 1_level_1
Afternoon,0.291531
Evening,0.264078
Late night,0.255589
Morning,0.25061


In [62]:
# initializing new column to zero
test['pickup'] = 0


for i in train['pickup_time_of_day'].unique():
  
  test['pickup'][test['pickup_time_of_day'] == str(i)] = train['trip_duration_hour'][train['pickup_time_of_day'] == str(i)].mean()

In [63]:
#calculating RMSE
pickup_error = sqrt(mse(test['trip_duration_hour'] , test['pickup'] ))
pickup_error

0.8903816016040094

In [64]:
#trip duration mean with respect to the mean of dropoff time of the day
dropoff =pd.pivot_table(train, values='trip_duration_hour', index = ['dropoff_time_of_day'], aggfunc=np.mean)
dropoff

Unnamed: 0_level_0,trip_duration_hour
dropoff_time_of_day,Unnamed: 1_level_1
Afternoon,0.284258
Evening,0.269666
Late night,0.256343
Morning,0.247905


In [65]:
# initializing new column to zero
test['dropoff'] = 0

# For every unique entry in pickup latitude
for i in train['dropoff_time_of_day'].unique():
  # Assign the mean value corresponding to unique entry
  test['dropoff'][test['dropoff_time_of_day'] == str(i)] = train['trip_duration_hour'][train['dropoff_time_of_day'] == str(i)].mean()

In [66]:
#calculating mean absolute error
dropoff_error = sqrt(mse(test['trip_duration_hour'] , test['dropoff'] ))
dropoff_error

0.8903865469170373

In [67]:
#trip duration mean with respect to the mean of passenger count
pass_count = pd.pivot_table(train, values='trip_duration_hour', index = ["passenger_count"], aggfunc=np.mean)
pass_count

Unnamed: 0_level_0,trip_duration_hour
passenger_count,Unnamed: 1_level_1
0,0.092981
1,0.255343
2,0.277822
3,0.287332
4,0.285759
5,0.299641
6,0.300193


In [68]:
# initializing new column to zero
test['pass_count'] = 0

# For every unique entry in passenger count
for i in train['passenger_count'].unique():
  # Assign the mean value corresponding to unique entry
  test['pass_count'][test['passenger_count'] == str(i)] = train['trip_duration_hour'][train['passenger_count'] == str(i)].mean()

In [69]:
pass_count_error = sqrt(mse(test['trip_duration_hour'] , test['pass_count'] ))
pass_count_error

0.9290781075032716

In [70]:
store_and_fwd = pd.pivot_table(train, values='trip_duration_hour', index = ["store_and_fwd_flag"], aggfunc=np.mean)
store_and_fwd     

Unnamed: 0_level_0,trip_duration_hour
store_and_fwd_flag,Unnamed: 1_level_1
N,0.264109
Y,0.304058


In [71]:
# initializing new column to zero
test['store_and_fwd'] = 0

# For every unique entry in pickup latitude
for i in train['store_and_fwd_flag'].unique():
  # Assign the mean value corresponding to unique entry
  test['store_and_fwd'][test['store_and_fwd_flag'] == str(i)] = train['trip_duration_hour'][train['store_and_fwd_flag'] == str(i)].mean()

In [72]:
str_and_fwd_error = sqrt(mse(test['store_and_fwd'] , test['trip_duration_hour'] ))
str_and_fwd_error

0.8904020727484228

In [73]:
combo = pd.pivot_table(train, values = 'trip_duration_hour', index = ['passenger_count','pickup_time_of_day','dropoff_time_of_day'], aggfunc = np.mean)
combo

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,trip_duration_hour
passenger_count,pickup_time_of_day,dropoff_time_of_day,Unnamed: 3_level_1
0,Afternoon,Afternoon,0.305417
0,Evening,Evening,0.054352
0,Evening,Late night,0.106944
0,Late night,Late night,0.023856
0,Morning,Afternoon,0.432222
...,...,...,...
6,Late night,Late night,0.282054
6,Late night,Morning,0.313738
6,Morning,Afternoon,0.376369
6,Morning,Late night,16.731944


# Build a K-Nearest neighbours’ model for the given dataset and find the best value of k

In [74]:
custom_df = df.sample(100000)

In [75]:
sample_df=custom_df

In [76]:
#seperate features and target
features = sample_df.drop(["id","vendor_id","trip_duration_hour","pickup_datetime","dropoff_datetime","store_and_fwd_flag"],axis = 1)
target = sample_df["trip_duration_hour"]

In [77]:
features = sample_df.drop(["id","vendor_id","trip_duration_hour","pickup_datetime","dropoff_datetime","store_and_fwd_flag"],axis = 1)
target = sample_df["trip_duration_hour"]

In [78]:
sample_df.info

<bound method DataFrame.info of                id  vendor_id     pickup_datetime    dropoff_datetime  \
271073  id0734277          1 2016-04-25 13:06:00 2016-04-25 13:08:23   
297373  id0808418          2 2016-05-22 07:32:30 2016-05-22 07:36:32   
680565  id1203490          1 2016-06-10 18:16:59 2016-06-10 18:50:08   
124349  id1533246          1 2016-02-25 14:53:31 2016-02-25 14:58:37   
38835   id0776232          1 2016-01-12 07:50:17 2016-01-12 08:00:11   
...           ...        ...                 ...                 ...   
588365  id1574736          1 2016-02-10 05:56:46 2016-02-10 05:59:44   
598118  id1974200          2 2016-04-30 21:09:28 2016-04-30 21:23:17   
515556  id0245243          2 2016-01-27 20:31:45 2016-01-27 20:40:43   
713487  id2724784          1 2016-06-03 19:47:24 2016-06-03 19:55:38   
215088  id1680821          1 2016-01-16 00:38:46 2016-01-16 00:59:38   

        passenger_count  pickup_longitude  pickup_latitude  dropoff_longitude  \
271073                

In [79]:
sample_df.dtypes

id                                 object
vendor_id                           int64
pickup_datetime            datetime64[ns]
dropoff_datetime           datetime64[ns]
passenger_count                     int64
pickup_longitude                  float64
pickup_latitude                   float64
dropoff_longitude                 float64
dropoff_latitude                  float64
store_and_fwd_flag                 object
trip_duration_hour                float64
pickup_date                        object
dropoff_date                       object
pickup_time_of_day                 object
dropoff_time_of_day                object
trip_duration_hour_mean           float64
dtype: object

In [80]:
#converting the store and fwd flag to int type
df["passenger_count"].value_counts()

1    517415
2    105097
5     38926
3     29692
6     24107
4     14050
0        33
7         1
9         1
Name: passenger_count, dtype: int64

In [82]:
df1 = pd.concat([sample_df, pd.get_dummies(sample_df[['passenger_count']].astype('str'))], axis=1)
try:
    df1.drop(['id','vendor_id','trip_duration_hour','pickup_datetime','dropoff_datetime','store_and_fwd_flag','passenger_count'], axis=1, inplace=True)
except KeyError:
    pass

df1.head(10)

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,pickup_date,dropoff_date,pickup_time_of_day,dropoff_time_of_day,trip_duration_hour_mean,passenger_count_0,passenger_count_1,passenger_count_2,passenger_count_3,passenger_count_4,passenger_count_5,passenger_count_6,passenger_count_7,passenger_count_9
271073,-73.974167,40.791256,-73.968513,40.798988,2016-04-25,2016-04-25,Afternoon,Afternoon,0.264508,0,1,0,0,0,0,0,0,0
297373,-73.976967,40.759048,-73.989983,40.751652,2016-05-22,2016-05-22,Morning,Morning,0.264508,0,1,0,0,0,0,0,0,0
680565,-73.967278,40.788277,-73.925392,40.767876,2016-06-10,2016-06-10,Evening,Evening,0.264508,0,1,0,0,0,0,0,0,0
124349,-73.962555,40.766609,-73.955673,40.778652,2016-02-25,2016-02-25,Afternoon,Afternoon,0.264508,0,1,0,0,0,0,0,0,0
38835,-73.954956,40.785835,-73.965546,40.769745,2016-01-12,2016-01-12,Morning,Morning,0.264508,0,1,0,0,0,0,0,0,0
511295,-73.96949,40.790218,-73.968422,40.801521,2016-06-28,2016-06-28,Morning,Morning,0.264508,0,1,0,0,0,0,0,0,0
446579,-73.964745,40.76981,-73.969749,40.763016,2016-01-05,2016-01-05,Morning,Morning,0.264508,0,1,0,0,0,0,0,0,0
40037,-73.947487,40.776196,-73.975578,40.752686,2016-03-17,2016-03-17,Afternoon,Afternoon,0.264508,0,1,0,0,0,0,0,0,0
411908,-73.993729,40.745926,-73.975471,40.744732,2016-05-18,2016-05-18,Evening,Evening,0.264508,0,0,1,0,0,0,0,0,0
257755,-73.947731,40.775143,-73.982697,40.774029,2016-04-15,2016-04-15,Morning,Morning,0.264508,0,0,0,0,0,1,0,0,0


In [83]:
#seperate features and target
x = df1
y = df["trip_duration_hour"]
x.shape,y.shape

((100000, 18), (729322,))

In [84]:
#scaling the data(using MinMax Scaler)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(x)

TypeError: float() argument must be a string or a number, not 'datetime.date'