<a href="https://www.kaggle.com/code/masoudnaghshbandi/new-york-city-taxi-fare-prediction?scriptVersionId=107822941" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/new-york-city-taxi-fare-prediction/sample_submission.csv
/kaggle/input/new-york-city-taxi-fare-prediction/GCP-Coupons-Instructions.rtf
/kaggle/input/new-york-city-taxi-fare-prediction/train.csv
/kaggle/input/new-york-city-taxi-fare-prediction/test.csv


# Import libraries

In [2]:

import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import  train_test_split
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as MSE

# load data

In [3]:
train = pd.read_csv("../input/new-york-city-taxi-fare-prediction/train.csv", nrows = 1000000)
test = pd.read_csv("../input/new-york-city-taxi-fare-prediction/test.csv")

In [4]:
train.shape, test.shape

((1000000, 8), (9914, 7))

In [5]:
train.head()


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


# Data Pre-processing


In [6]:
train.isnull().sum()


key                   0
fare_amount           0
pickup_datetime       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude    10
dropoff_latitude     10
passenger_count       0
dtype: int64

###  we have a negligible number of null entries. It is better to eliminate them.

In [7]:
train = train.dropna(how = 'any', axis = 'rows')


In [8]:
test.isnull().sum()


key                  0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [9]:
train.head()


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [10]:
pip install haversine

[0mNote: you may need to restart the kernel to use updated packages.


In [11]:
train['fare_amount'].describe()

count    999990.000000
mean         11.347953
std           9.821790
min         -44.900000
25%           6.000000
50%           8.500000
75%          12.500000
max         500.000000
Name: fare_amount, dtype: float64

In [12]:
train.drop(train[train['pickup_longitude'] == 0].index, axis=0, inplace = True)
train.drop(train[train['pickup_latitude'] == 0].index, axis=0, inplace = True)
train.drop(train[train['dropoff_longitude'] == 0].index, axis=0, inplace = True)
train.drop(train[train['dropoff_latitude'] == 0].index, axis=0, inplace = True)
train.drop(train[train['passenger_count'] == 208].index, axis=0, inplace = True)
train.drop(train[train['passenger_count'] > 5].index, axis=0, inplace = True)
train.drop(train[train['passenger_count'] == 0].index, axis=0, inplace = True)


### the datetime object, we’ll be able to create useful attributes like Year, Month, Day, Day of Week, and Hour.


In [13]:
train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [14]:
train.drop(['key'], axis=1,inplace=True)

In [15]:
train.drop(['pickup_datetime'], axis=1,inplace=True)

### The city of New York longitude ranges between -75 and -72. The latitude ranges between 40 and 42, SO:

In [16]:
train.dropna(inplace=True)

train.drop(train.index[(train.pickup_longitude < -75) | 
           (train.pickup_longitude > -72) | 
           (train.pickup_latitude < 40) | 
           (train.pickup_latitude > 42)],inplace=True)
train.drop(train.index[(train.dropoff_longitude < -75) | 
           (train.dropoff_longitude > -72) | 
           (train.dropoff_latitude < 40) | 
           (train.dropoff_latitude > 42)],inplace=True)

In [17]:
train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1


## Model Training


In [18]:
X, y = train.drop('fare_amount', axis = 1), train['fare_amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

In [19]:
scaler = StandardScaler()

In [20]:
scaler.fit_transform(X_train,X_test)

array([[-0.36866789,  0.00784906, -0.71721126,  0.13121136, -0.51768646],
       [-0.18208259,  0.66107304, -0.46043167,  0.23972133,  2.95095484],
       [ 3.13331615, -0.44044256,  5.44836304, -1.27229493, -0.51768646],
       ...,
       [-0.24410753, -0.31119753, -1.05468037, -1.25946691, -0.51768646],
       [ 2.79319998,  0.65147465, -0.02092249, -0.02347948,  0.34947386],
       [ 0.33419196,  0.94078627, -0.17537641,  0.3834592 ,  0.34947386]])

In [21]:
xgb_r = xgb.XGBRegressor(objective ='reg:linear',
                  n_estimators = 400, seed = 123)




In [22]:
xgb_r.fit(X_train,y_train)




XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=400, n_jobs=0,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=123, reg_alpha=0, ...)

In [23]:
y_pred = xgb_r.predict(X_test)

In [24]:
rmse = np.sqrt(MSE(y_test, y_pred))
print("RMSE : % f" %(rmse))

RMSE :  4.232444


# Working on test data

In [25]:
test.drop(['key'], axis=1,inplace=True)

In [26]:
test.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [27]:
test.dropna(inplace=True)

test.drop(test.index[(test.pickup_longitude < -75) | 
           (test.pickup_longitude > -72) | 
           (test.pickup_latitude < 40) | 
           (test.pickup_latitude > 42)],inplace=True)
test.drop(test.index[(test.dropoff_longitude < -75) | 
           (test.dropoff_longitude > -72) | 
           (test.dropoff_latitude < 40) | 
           (test.dropoff_latitude > 42)],inplace=True)

In [28]:
test.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [29]:
test.drop(['pickup_datetime'], axis=1,inplace=True)

In [30]:
scaler.fit_transform(test)

array([[ 0.03278367,  0.38058296, -0.19896544, -0.22316025, -0.52497232],
       [-0.28383094, -0.94387891, -0.64575618, -0.35396491, -0.52497232],
       [-0.18240469,  0.00653781, -0.15350703, -0.15814958, -0.52497232],
       ...,
       [-0.39460146, -0.72846192,  4.70733595, -2.95573967,  3.38530232],
       [-0.25368566, -0.46538934,  0.88247126,  1.41076765,  3.38530232],
       [-0.31094385,  0.09032734, -0.68149163,  0.21102511,  3.38530232]])

In [31]:
new_pred = xgb_r.predict(test)

In [32]:
sample_new=pd.read_csv('../input/new-york-city-taxi-fare-prediction/test.csv')


In [33]:
sample_new.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [34]:
sample_new.drop(['pickup_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count'], axis=1,inplace=True)

In [35]:
sample_new['fare_amount'] = new_pred

In [36]:
sample_new.head()

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,7.805399
1,2015-01-27 13:08:24.0000003,8.485815
2,2011-10-08 11:53:44.0000002,5.562542
3,2012-12-01 21:12:12.0000002,7.742702
4,2012-12-01 21:12:12.0000003,15.426907


In [37]:
submission1=sample_new.to_csv("submission1.csv", index=False)