In [19]:
#### Importing the required libraries Setting the working Directory and importing the Dataset ####
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
os.chdir("D:/python & R/Projects/Cab Fare Prediction")
train_set = pd.read_csv("train_cab.csv")
test_set = pd.read_csv("test.csv")

In [20]:
#### Joining the Train set and Test Set ###
Data =pd.concat([train_set,test_set], axis=0)
Data = Data.reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  


In [21]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25981 entries, 0 to 25980
Data columns (total 7 columns):
dropoff_latitude     25981 non-null float64
dropoff_longitude    25981 non-null float64
fare_amount          16043 non-null object
passenger_count      25926 non-null float64
pickup_datetime      25981 non-null object
pickup_latitude      25981 non-null float64
pickup_longitude     25981 non-null float64
dtypes: float64(5), object(2)
memory usage: 1.4+ MB


In [22]:
#### Understanding the Data ####
Data.columns

Index(['dropoff_latitude', 'dropoff_longitude', 'fare_amount',
       'passenger_count', 'pickup_datetime', 'pickup_latitude',
       'pickup_longitude'],
      dtype='object')

In [23]:
Data.corr()

Unnamed: 0,dropoff_latitude,dropoff_longitude,passenger_count,pickup_latitude,pickup_longitude
dropoff_latitude,1.0,-0.978197,-0.000508,0.883,-0.952288
dropoff_longitude,-0.978197,1.0,0.000511,-0.864221,0.964045
passenger_count,-0.000508,0.000511,1.0,-0.000491,0.0005
pickup_latitude,0.883,-0.864221,-0.000491,1.0,-0.894925
pickup_longitude,-0.952288,0.964045,0.0005,-0.894925,1.0


In [24]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25981 entries, 0 to 25980
Data columns (total 7 columns):
dropoff_latitude     25981 non-null float64
dropoff_longitude    25981 non-null float64
fare_amount          16043 non-null object
passenger_count      25926 non-null float64
pickup_datetime      25981 non-null object
pickup_latitude      25981 non-null float64
pickup_longitude     25981 non-null float64
dtypes: float64(5), object(2)
memory usage: 1.4+ MB


In [6]:
######## Removing the Pickup_datetime variable to avoid Curse of dimensionality ########

In [25]:
Data=Data.drop(columns='pickup_datetime')


In [26]:
Data.corr()

Unnamed: 0,dropoff_latitude,dropoff_longitude,passenger_count,pickup_latitude,pickup_longitude
dropoff_latitude,1.0,-0.978197,-0.000508,0.883,-0.952288
dropoff_longitude,-0.978197,1.0,0.000511,-0.864221,0.964045
passenger_count,-0.000508,0.000511,1.0,-0.000491,0.0005
pickup_latitude,0.883,-0.864221,-0.000491,1.0,-0.894925
pickup_longitude,-0.952288,0.964045,0.0005,-0.894925,1.0


In [27]:
Data.isnull().sum()

dropoff_latitude        0
dropoff_longitude       0
fare_amount          9938
passenger_count        55
pickup_latitude         0
pickup_longitude        0
dtype: int64

In [13]:
9938+55

9993

In [28]:
######## Missing value Analysis ########
### Takning all the missing values and storing it in the separate datframe ###
missing_value = pd.DataFrame(Data.isnull().sum())
missing_value=missing_value.reset_index()
missing_value = missing_value.rename(columns = {'index': 'variables', 0 : 'Missing_Percentage'})
#Calculate percentage
missing_value['Missing_Percentage'] = (missing_value['Missing_Percentage']/len(Data))*100

#descending order
missing_value = missing_value.sort_values('Missing_Percentage', ascending = False).reset_index(drop = True)
missing_value

Unnamed: 0,variables,Missing_Percentage
0,fare_amount,38.25103
1,passenger_count,0.211693
2,dropoff_latitude,0.0
3,dropoff_longitude,0.0
4,pickup_latitude,0.0
5,pickup_longitude,0.0


In [29]:
#### Missing Value Imputation ####
Data['fare_amount']=pd.to_numeric(Data['fare_amount'], errors='coerce')
#Imputation method

#train_set['fare_amount'].loc[0]= np.nan
#actual value = 4.5
#mean = 15.015
#median = 8.5
##### Imputing the missing value the median #####
Data = Data.fillna(Data.median())

In [30]:
################# outlier analysis #################
no_outlier = Data.copy()

In [31]:
no_outlier.columns

Index(['dropoff_latitude', 'dropoff_longitude', 'fare_amount',
       'passenger_count', 'pickup_latitude', 'pickup_longitude'],
      dtype='object')

In [32]:
numeric_var = ['dropoff_latitude', 'dropoff_longitude', 'fare_amount',
       'passenger_count', 'pickup_latitude',
       'pickup_longitude']

In [33]:
for i in numeric_var:
    print(i)
    q75, q25 = np.percentile(no_outlier.loc[:,i], [75 ,25])
    iqr = q75 - q25
    min = q25 - (iqr*1.5)
    max = q75 + (iqr*1.5)
    print(min)
    print(max)
    no_outlier = no_outlier.drop(no_outlier[no_outlier.loc[:,i] < min].index)
    no_outlier = no_outlier.drop(no_outlier[no_outlier.loc[:,i] > max].index)

dropoff_latitude
40.68487400000001
40.81841800000001
dropoff_longitude
-74.02844424999998
-73.93008625000002
fare_amount
4.0
12.0
passenger_count
-0.5
3.5
pickup_latitude
40.694806249999985
40.81038425000001
pickup_longitude
-74.02542637499998
-73.93778537500002


In [34]:
print(Data.shape)
print(no_outlier.shape)

(25981, 6)
(17166, 6)


In [35]:
no_outlier.corr()

Unnamed: 0,dropoff_latitude,dropoff_longitude,fare_amount,passenger_count,pickup_latitude,pickup_longitude
dropoff_latitude,1.0,0.632498,-0.026595,-0.00564,0.580633,0.405069
dropoff_longitude,0.632498,1.0,0.005359,-0.022609,0.387357,0.475608
fare_amount,-0.026595,0.005359,1.0,0.013211,-0.03423,-0.025834
passenger_count,-0.00564,-0.022609,0.013211,1.0,-0.021795,-0.023605
pickup_latitude,0.580633,0.387357,-0.03423,-0.021795,1.0,0.685471
pickup_longitude,0.405069,0.475608,-0.025834,-0.023605,0.685471,1.0


In [36]:
numeric_columns = list(no_outlier.columns[no_outlier.dtypes != 'category'])
# Checking VIF values of numeric columns 
from statsmodels.stats.outliers_influence import variance_inflation_factor as vf 
from statsmodels.tools.tools import add_constant 
numeric_df = add_constant(no_outlier[numeric_columns]) 
vif = pd.Series([vf(numeric_df.values, i) 
                 for i in range(numeric_df.shape[1])], 
                index = numeric_df.columns) 

In [37]:
vif.round(1)

const                94763918.1
dropoff_latitude            2.2
dropoff_longitude           1.9
fare_amount                 1.0
passenger_count             1.0
pickup_latitude             2.5
pickup_longitude            2.2
dtype: float64

In [38]:
################# Building the model #################

In [39]:
###Making a matrix of Features and vector of Dependent Variables##

In [40]:
no_outlier.columns

Index(['dropoff_latitude', 'dropoff_longitude', 'fare_amount',
       'passenger_count', 'pickup_latitude', 'pickup_longitude'],
      dtype='object')

In [41]:
y = no_outlier[['fare_amount']].reset_index(drop=True)
X = no_outlier[['dropoff_latitude', 'dropoff_longitude',
       'passenger_count', 'pickup_latitude',
       'pickup_longitude']].reset_index(drop=True)

In [42]:
print(X.shape)
print(y.shape)

(17166, 5)
(17166, 1)


In [43]:
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/3, random_state = 0)




In [44]:
######## Simple Linear Regression ##########
# Fitting Simple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
model_lr= regressor.fit(X_train, y_train)

In [45]:
# Predicting the Test set results
y_pred = regressor.predict(X_test)

In [47]:
####Evaluating the model ith the error metrics####
from sklearn import metrics
###Printing MAE###
print("MAE")
print(metrics.mean_absolute_error(y_test,y_pred))
###Printing MSE###
print("MSE")
print(metrics.mean_squared_error(y_test,y_pred))
###Printing RMSE###
print("RMSE")
print(np.sqrt(metrics.mean_squared_error(y_test,y_pred)))


MAE
1.327820966421147
MSE
2.8854577422488545
RMSE
1.6986635164884347


In [48]:
########Decision Tree Regression ##########
# Fitting Decision Tree Regression to the dataset
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
model_dt=regressor.fit(X_train, y_train)


In [49]:
# Predicting a new result
y_pred = regressor.predict(X_test)

In [29]:
####Evaluating the model ith the error metrics####
from sklearn import metrics
###Printing MAE###
print("MAE")
print(metrics.mean_absolute_error(y_test,y_pred))
###Printing MSE###
print("MSE")
print(metrics.mean_squared_error(y_test,y_pred))
###Printing RMSE###
print("RMSE")
print(np.sqrt(metrics.mean_squared_error(y_test,y_pred)))


MAE
1.5030234183851803
MSE
4.212084935337295
RMSE
2.052336457634882


In [40]:
y_pred

array([8.5, 8.5, 8.5, ..., 8.9, 8.5, 6.1])

In [50]:
######## Random Forest Regression ######## 
# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
model_rf=regressor.fit(X_train, y_train)

  """


In [51]:
# Predicting a new result
y_pred = regressor.predict(X_test)


In [52]:
####Evaluating the model ith the error metrics####
from sklearn import metrics
###Printing MAE###
print("MAE")
print(metrics.mean_absolute_error(y_test,y_pred))
###Printing MSE###
print("MSE")
print(metrics.mean_squared_error(y_test,y_pred))
###Printing RMSE###
print("RMSE")
print(np.sqrt(metrics.mean_squared_error(y_test,y_pred)))


MAE
1.2284603285564488
MSE
2.3693851275777704
RMSE
1.5392807176008443


In [None]:
###### FOR LINEAR REGRESSION #####
# MAE
# 1.327820966421147
# MSE
# 2.8854577422488545
# RMSE
# 1.6986635164884347
###### DECISION TREE REGRESSION #####
# MAE
# 1.5030234183851803
# MSE
# 4.212084935337295
# RMSE
# 2.052336457634882
###### RANDOM FOREST REGRESSION #####
# MAE
# 1.2284603285564488
# MSE
# 2.3693851275777704
# RMSE
# 1.5392807176008443


In [53]:
######TUNING OF THE MODEL######

In [54]:
######## Random Forest Regression ######## 
# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 200, random_state = 0, min_samples_leaf = 5, min_samples_split = 12)
model_rf=regressor.fit(X_train, y_train)

  """


In [55]:
y_pred = regressor.predict(X_test)

In [56]:
from sklearn import metrics
###Printing MAE###
print("MAE")
print(metrics.mean_absolute_error(y_test,y_pred))
###Printing MSE###
print("MSE")
print(metrics.mean_squared_error(y_test,y_pred))
###Printing RMSE###
print("RMSE")
print(np.sqrt(metrics.mean_squared_error(y_test,y_pred)))


MAE
1.1827316336281253
MSE
2.137185173791558
RMSE
1.4619114794650045


In [34]:
####### Hyperparameter tuning #######

In [45]:
# from sklearn.model_selection import GridSearchCV
# # Create the parameter grid based on the results of random search 
# param_grid = {
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [100, 200, 300, 1000]
# }
# # Create a based model
# rf = RandomForestRegressor()
# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
#                           cv = 3, n_jobs = -1, verbose = 2)

In [46]:
# Fit the grid search to the data
# grid_search.fit(X_train,y_train)
# grid_search.best_params_

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 12.8min finished
  self.best_estimator_.fit(X, y, **fit_params)


{'min_samples_leaf': 5, 'min_samples_split': 12, 'n_estimators': 200}