# Taxi Fare Prediction

## Importing Libraries

In [162]:
import numpy as np 
import pandas as pd  
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression

## Load and review data

In [163]:
df= pd.read_csv("https://raw.githubusercontent.com/Premalatha-success/Datasets/main/TaxiFare.csv")

In [164]:
df.shape

(50000, 8)

In [165]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   unique_id             50000 non-null  object 
 1   amount                50000 non-null  float64
 2   date_time_of_pickup   50000 non-null  object 
 3   longitude_of_pickup   50000 non-null  float64
 4   latitude_of_pickup    50000 non-null  float64
 5   longitude_of_dropoff  50000 non-null  float64
 6   latitude_of_dropoff   50000 non-null  float64
 7   no_of_passenger       50000 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 3.1+ MB


In [166]:
df.sample(10)

Unnamed: 0,unique_id,amount,date_time_of_pickup,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
27197,37:01.0,10.1,2011-09-26 21:37:01 UTC,-74.001228,40.727704,-73.980967,40.763711,4
43687,46:12.0,5.5,2014-09-09 19:46:12 UTC,-73.970559,40.759946,-73.96256,40.765392,1
45092,48:00.0,5.7,2012-08-18 18:48:00 UTC,-73.97648,40.75232,-73.975555,40.744557,5
39555,29:00.0,5.7,2010-01-15 20:29:00 UTC,-73.974798,40.762492,-73.959555,40.767593,1
30958,52:50.0,36.8,2012-10-22 13:52:50 UTC,-73.99241,40.756726,-73.865039,40.77052,2
16387,25:06.0,11.0,2015-05-20 09:25:06 UTC,-73.97953,40.781456,-73.964905,40.808636,3
3083,46:00.0,6.5,2014-06-14 17:46:00 UTC,-73.97349,40.748092,-73.975563,40.7604,2
17578,07:00.0,18.0,2014-07-30 22:07:00 UTC,-74.005362,40.721962,-73.949838,40.710347,1
19492,04:23.0,9.5,2012-09-06 15:04:23 UTC,-73.992616,40.748426,-73.978278,40.751265,1
6014,28:00.0,16.1,2012-06-02 20:28:00 UTC,-73.95634,40.818567,-73.990387,40.755797,1


In [167]:
df = df.drop(labels=['unique_id','date_time_of_pickup'], axis=1)
df.head()

Unnamed: 0,amount,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1


## Dealing with Missing Values

In [168]:
df.describe()

Unnamed: 0,amount,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,11.364171,-72.509756,39.933759,-72.504616,39.926251,1.66784
std,9.685557,10.39386,6.224857,10.40757,6.014737,1.289195
min,-5.0,-75.423848,-74.006893,-84.654241,-74.006377,0.0
25%,6.0,-73.992062,40.73488,-73.991152,40.734372,1.0
50%,8.5,-73.98184,40.752678,-73.980082,40.753372,1.0
75%,12.5,-73.967148,40.76736,-73.963584,40.768167,2.0
max,200.0,40.783472,401.083332,40.851027,43.41519,6.0


In [169]:
df.dtypes

amount                  float64
longitude_of_pickup     float64
latitude_of_pickup      float64
longitude_of_dropoff    float64
latitude_of_dropoff     float64
no_of_passenger           int64
dtype: object

In [170]:
df = df.replace(0, np.NaN)

In [171]:
df.head(15)

Unnamed: 0,amount,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1.0
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1.0
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2.0
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1.0
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1.0
5,12.1,-74.000964,40.73163,-73.972892,40.758233,1.0
6,7.5,-73.980002,40.751662,-73.973802,40.764842,1.0
7,16.5,-73.9513,40.774138,-73.990095,40.751048,1.0
8,9.0,-74.006462,40.726713,-73.993078,40.731628,1.0
9,8.9,-73.980658,40.733873,-73.99154,40.758138,2.0


In [172]:
df.mean()

amount                  11.364853
longitude_of_pickup    -73.921659
latitude_of_pickup      40.711346
longitude_of_dropoff   -73.920941
latitude_of_dropoff     40.703692
no_of_passenger          1.673362
dtype: float64

In [173]:
for columns in df:
    df[columns] = df[columns].fillna(df[columns].mean())
df.head(15)

Unnamed: 0,amount,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1.0
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1.0
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2.0
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1.0
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1.0
5,12.1,-74.000964,40.73163,-73.972892,40.758233,1.0
6,7.5,-73.980002,40.751662,-73.973802,40.764842,1.0
7,16.5,-73.9513,40.774138,-73.990095,40.751048,1.0
8,9.0,-74.006462,40.726713,-73.993078,40.731628,1.0
9,8.9,-73.980658,40.733873,-73.99154,40.758138,2.0


In [174]:
# df['longitude_of_pickup'] = df['longitude_of_pickup'].fillna(df['longitude_of_pickup'].mean())
# df['latitude_of_pickup'] = df['latitude_of_pickup'].fillna(df['latitude_of_pickup'].mean())
# df['longitude_of_dropoff'] = df['longitude_of_dropoff'].fillna(df['longitude_of_dropoff'].mean())
# df['latitude_of_dropoff'] = df['latitude_of_dropoff'].fillna(df['latitude_of_dropoff'].mean())
# df['amount'] = df['amount'].fillna(df['amount'].mean())
# df['no_of_passenger'] = df['no_of_passenger'].fillna(df['no_of_passenger'].mean())
# df.head(15)

## Normalization

In [175]:
df['amount']=(df['amount'] - df['amount'].min())/(df['amount'].max() - df['amount'].min())

## Removing outliers

In [176]:
def remove_outlier(df_in):
    for i in df_in.columns:
        q1 = df_in[i].quantile(0.25)
        q3 = df_in[i].quantile(0.75)
        iqr = q3-q1 #Interquartile range
        fence_low  = q1-1.5*iqr
        fence_high = q3+1.5*iqr
        df_out = df_in.loc[(df_in[i] > fence_low) & (df_in[i] < fence_high)]
    return df_out

In [177]:
df=remove_outlier(df)

In [178]:
df.shape

(44542, 6)

## Spliting data 

In [179]:
X = df.drop(['amount'], axis=1)
y = df[['amount']]

In [180]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,random_state=1)

## Fit linear model


In [181]:
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)

LinearRegression()

In [182]:
reg_model.score(X_train, y_train)

0.000662448956375572

In [183]:
reg_model.score(X_test, y_test)

-0.0013663471838476493

## Using Decision tree

In [184]:
from sklearn.tree import DecisionTreeRegressor

In [185]:
model=DecisionTreeRegressor(criterion="squared_error",max_depth=6)

In [186]:
model.fit(X_train,y_train)

DecisionTreeRegressor(max_depth=6)

In [187]:
model.score(X_train,y_train)

0.6545700916004173

In [188]:
model.score(X_test,y_test)

0.6117424866389254

## Using KNN

In [189]:
from sklearn.neighbors import KNeighborsRegressor

In [190]:
knn_model = KNeighborsRegressor(n_neighbors=8)

In [191]:
knn_model.fit(X_train,y_train)

KNeighborsRegressor(n_neighbors=8)

In [192]:
knn_model.score(X_train,y_train)

0.7851806993504471

In [193]:
knn_model.score(X_test,y_test)

0.7433010292421363