In [1]:
#Basic Import
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#Modelling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor        
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('uber.csv')

In [3]:
df = df.drop(['Unnamed: 0','key'], axis = 1)

In [4]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [5]:
df['datetime'] = pd.to_datetime(df['pickup_datetime'], utc=True)

# extract the components
df['weekday'] = df['datetime'].dt.weekday
df['month'] = df['datetime'].dt.month
df['year'] = df['datetime'].dt.year
df['hour'] = df['datetime'].dt.hour
df['minute'] = df['datetime'].dt.minute

In [6]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,datetime,weekday,month,year,hour,minute
0,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1,2015-05-07 19:52:06+00:00,3,5,2015,19,52
1,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1,2009-07-17 20:04:56+00:00,4,7,2009,20,4
2,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1,2009-08-24 21:45:00+00:00,0,8,2009,21,45
3,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3,2009-06-26 08:22:21+00:00,4,6,2009,8,22
4,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5,2014-08-28 17:47:00+00:00,3,8,2014,17,47


In [7]:
df = df.drop(['pickup_datetime','datetime'], axis = 1)
df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,weekday,month,year,hour,minute
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,3,5,2015,19,52
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1,4,7,2009,20,4
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1,0,8,2009,21,45
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,4,6,2009,8,22
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,3,8,2014,17,47


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   fare_amount        200000 non-null  float64
 1   pickup_longitude   200000 non-null  float64
 2   pickup_latitude    200000 non-null  float64
 3   dropoff_longitude  199999 non-null  float64
 4   dropoff_latitude   199999 non-null  float64
 5   passenger_count    200000 non-null  int64  
 6   weekday            200000 non-null  int32  
 7   month              200000 non-null  int32  
 8   year               200000 non-null  int32  
 9   hour               200000 non-null  int32  
 10  minute             200000 non-null  int32  
dtypes: float64(5), int32(5), int64(1)
memory usage: 13.0 MB


In [9]:
df.isnull().sum()

fare_amount          0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
weekday              0
month                0
year                 0
hour                 0
minute               0
dtype: int64

In [10]:
df['dropoff_longitude'].fillna(df['dropoff_longitude'].mean(), inplace=True)
df['dropoff_latitude'].fillna(df['dropoff_latitude'].mean(), inplace=True)


In [11]:
df.isnull().sum()

fare_amount          0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
weekday              0
month                0
year                 0
hour                 0
minute               0
dtype: int64

In [12]:
X = df.drop(['fare_amount'],axis=1)
y = df['fare_amount']

In [13]:
num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
ohe = OneHotEncoder()

ct = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_features),
    ('cat', ohe, cat_features)
])

X = ct.fit_transform(X)



In [14]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

In [15]:
# EVALUATION METRIC

def evaluate_model(true,predicted):

    mae = mean_absolute_error(predicted,true)
    rmse = np.sqrt(mean_squared_error(predicted,true))
    r2 = r2_score(predicted,true)

    return mae,rmse,r2

In [16]:
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Cat Pattern Regressor': CatBoostRegressor(),
    'XGBoost Regressor': XGBRegressor()
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    model_train_mae,model_train_rmse,model_train_r2= evaluate_model(y_train,pred_train)
    model_test_mae,model_test_rmse,model_test_r2= evaluate_model(y_test,pred_test)

    print(f"{list(models.keys())[i]}")
    model_list.append(list(models.keys())[i])

    print("Model performance of training dataset")
    print("Root mean Squared Error: {}".format(model_train_rmse))
    print("Mean Absolute Error: {}".format(model_train_mae))
    print("R2 Score: {}".format(model_train_r2))

    print("Model performance of testing dataset") 
    print("Root mean Squared Error: {}".format(model_test_rmse))
    print("Mean Absolute Error: {}".format(model_test_mae))
    print("R2 Score: {}".format(model_test_r2))

    r2_list.append(model_test_r2)

    print('*'*35)
  
    

Linear Regression
Model performance of training dataset
Root mean Squared Error: 9.71735680662449
Mean Absolute Error: 5.956793157717951
R2 Score: -59.99080405161832
Model performance of testing dataset
Root mean Squared Error: 10.227134721191511
Mean Absolute Error: 6.02216674623971
R2 Score: -66.0088551772262
***********************************
Random Forest Regressor
Model performance of training dataset
Root mean Squared Error: 1.6615483614258828
Mean Absolute Error: 0.7416949057291665
R2 Score: 0.9673590215829821
Model performance of testing dataset
Root mean Squared Error: 5.398350526333764
Mean Absolute Error: 2.0353868141666664
R2 Score: 0.6254499850858914
***********************************
Decision Tree Regressor
Model performance of training dataset
Root mean Squared Error: 0.010608074754638562
Mean Absolute Error: 4.93750000000557e-05
R2 Score: 0.9999988274957048
Model performance of testing dataset
Root mean Squared Error: 7.02151362154201
Mean Absolute Error: 2.88912175
R