In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Load the Zomato delivery dataset from a CSV path relative to the working directory.
df = pd.read_csv('data/zomato_dataset.csv')

In [3]:
# Remove the two identifier columns from df in place.
df.drop(columns=['ID', 'Delivery_person_ID'], inplace=True)

In [4]:
# Count the missing (NaN) values in each column.
df.isna().sum()

Delivery_person_Age            1854
Delivery_person_Ratings        1908
Restaurant_latitude               0
Restaurant_longitude              0
Delivery_location_latitude        0
Delivery_location_longitude       0
Order_Date                        0
Time_Orderd                    1731
Time_Order_picked                 0
Weather_conditions              616
Road_traffic_density            601
Vehicle_condition                 0
Type_of_order                     0
Type_of_vehicle                   0
multiple_deliveries             993
Festival                        228
City                           1200
Time_taken (min)                  0
dtype: int64

In [5]:
# Print the unique values found in each column, followed by a two-line separator.

separator = '***********************************************************************'
for column_name in df.columns:
    print(df[column_name].unique())
    print(separator, separator, sep='\n')

[36. 21. 23. 34. 24. 29. 35. 33. 25. 31. 37. 27. 32. 26. 38. 20. 22. nan
 28. 39. 30. 15. 50.]
***********************************************************************
***********************************************************************
[4.2 4.7 4.3 4.5 4.  4.9 4.1 5.  4.8 3.5 4.6 nan 4.4 3.8 3.9 3.7 2.6 2.5
 3.6 3.1 2.7 1.  3.2 3.3 6.  3.4 2.8 2.9 3. ]
***********************************************************************
***********************************************************************
[ 30.327968  10.003064  18.56245   30.899584  26.463504  19.176269
  12.311072  18.592718  17.426228  22.552672  18.563934  23.357804
  11.003669  12.986047  19.221315  13.005801  26.849596  21.160522
  12.934179  18.51421   11.022477  21.160437  15.51315   15.561295
   0.        18.55144   18.593481  21.173343  17.451976  12.972532
  13.064181  19.121999  21.149569  19.091458  22.539129  12.970324
  21.175975  23.369746  12.914264  11.003681  10.96185   27.165108
  26.88842   26.913987  12.30

In [8]:
# Drop the columns the author judged not to be correlated with the target.
# NOTE(review): the analysis behind that judgement is not visible in this part
# of the notebook; confirm it before relying on the reduced feature set.
df.drop(
    columns=[
        'Restaurant_latitude',
        'Restaurant_longitude',
        'Delivery_location_latitude',
        'Delivery_location_longitude',
        'Order_Date',
        'Time_Orderd',
        'Time_Order_picked',
        'Vehicle_condition',
        'Delivery_person_Ratings',
    ],
    inplace=True,
)

In [14]:
## Separate the dependent target (y) from the independent features (X)
y = df["Time_taken (min)"]
X = df.drop(columns=["Time_taken (min)"])

In [15]:
# Split the feature names by dtype: object columns are routed to the categorical
# pipeline (imputed, one-hot encoded, scaled); all other columns are routed to
# the numerical pipeline (imputed, scaled).
numerical_cols = X.select_dtypes(exclude=["object"]).columns
categorical_cols = X.select_dtypes(include=["object"]).columns

In [16]:
# Show the object-dtype feature names selected for the categorical pipeline.
categorical_cols

Index(['Weather_conditions', 'Road_traffic_density', 'Type_of_order',
       'Type_of_vehicle', 'Festival', 'City'],
      dtype='object')

In [17]:
# Show the non-object feature names selected for the numerical pipeline.
numerical_cols

Index(['Delivery_person_Age', 'multiple_deliveries'], dtype='object')

In [18]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OneHotEncoder # OnehotEncoder
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [19]:
# Numerical columns: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('Imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so try the current spelling first and fall back
# to the legacy one on older installations.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising when held-out data is transformed.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Categorical columns: fill missing values with the most frequent level,
# one-hot encode to a dense array, then standardise the indicator columns.
# NOTE(review): the encoder keeps every level of each feature, so the indicator
# columns of one feature are linearly dependent. That is the likely cause of the
# ~1e12 LinearRegression coefficients printed later in this notebook; consider
# drop='first' if interpretable coefficients are needed.
cat_pipeline = Pipeline(
    steps=[
        ('Imputer', SimpleImputer(strategy='most_frequent')),
        ('one_hot_encoder', one_hot_encoder),
        ('scaler', StandardScaler()),
    ]
)

# Route each group of columns through its own pipeline and concatenate the
# results (numerical columns first, then the encoded categorical columns).
preprocessor = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [20]:
# train Test Split
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split reproducible.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

In [22]:
# Fit the preprocessor on the training rows only, then apply the already-fitted
# transform to the test rows. Both results are wrapped in new DataFrames with
# default integer column labels and a fresh 0..n-1 index.
# NOTE(review): this cell overwrites X_train / X_test, so running it a second
# time passes the already-transformed frames back into the preprocessor.
X_train=pd.DataFrame(preprocessor.fit_transform(X_train))
X_test=pd.DataFrame(preprocessor.transform(X_test))
                    

In [23]:
# Display the preprocessed training features.
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1.130946,-1.318236,-0.446305,-0.470836,2.265788,-0.444638,-0.437185,-0.442715,-0.326571,1.487497,...,-0.582851,-0.03716,-0.302932,0.852586,-0.71055,0.142953,-0.142953,0.537784,-0.05935,-0.532357
1,-0.099454,3.943714,-0.446305,-0.470836,-0.441348,-0.444638,2.287363,-0.442715,-0.326571,1.487497,...,-0.582851,-0.03716,-0.302932,0.852586,-0.71055,-6.995304,6.995304,-1.859481,-0.05935,1.878437
2,0.252089,0.435747,-0.446305,-0.470836,-0.441348,-0.444638,-0.437185,2.258788,-0.326571,-0.672270,...,-0.582851,-0.03716,-0.302932,-1.172902,1.40736,0.142953,-0.142953,0.537784,-0.05935,-0.532357
3,0.603632,0.435747,-0.446305,-0.470836,-0.441348,-0.444638,-0.437185,2.258788,-0.326571,1.487497,...,-0.582851,-0.03716,-0.302932,-1.172902,1.40736,0.142953,-0.142953,0.537784,-0.05935,-0.532357
4,0.779403,0.435747,-0.446305,-0.470836,-0.441348,-0.444638,-0.437185,2.258788,-0.326571,-0.672270,...,-0.582851,-0.03716,-0.302932,0.852586,-0.71055,0.142953,-0.142953,-1.859481,-0.05935,1.878437
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31903,1.306717,-1.318236,-0.446305,-0.470836,-0.441348,-0.444638,2.287363,-0.442715,-0.326571,-0.672270,...,1.715704,-0.03716,-0.302932,0.852586,-0.71055,0.142953,-0.142953,-1.859481,-0.05935,1.878437
31904,-0.275225,-1.318236,-0.446305,2.123883,-0.441348,-0.444638,-0.437185,-0.442715,-0.326571,-0.672270,...,-0.582851,-0.03716,-0.302932,0.852586,-0.71055,0.142953,-0.142953,0.537784,-0.05935,-0.532357
31905,-1.681397,0.435747,-0.446305,2.123883,-0.441348,-0.444638,-0.437185,-0.442715,-0.326571,-0.672270,...,-0.582851,-0.03716,-0.302932,-1.172902,1.40736,0.142953,-0.142953,0.537784,-0.05935,-0.532357
31906,0.779403,0.435747,2.240620,-0.470836,-0.441348,-0.444638,-0.437185,-0.442715,-0.326571,-0.672270,...,-0.582851,-0.03716,-0.302932,0.852586,-0.71055,0.142953,-0.142953,0.537784,-0.05935,-0.532357


In [24]:
# Display the preprocessed test features.
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.076317,0.435747,-0.446305,-0.470836,2.265788,-0.444638,-0.437185,-0.442715,-0.326571,1.487497,...,1.715704,-0.03716,-0.302932,-1.172902,1.40736,0.142953,-0.142953,0.537784,-0.05935,-0.532357
1,-0.978311,0.435747,2.240620,-0.470836,-0.441348,-0.444638,-0.437185,-0.442715,-0.326571,-0.672270,...,-0.582851,-0.03716,-0.302932,0.852586,-0.71055,0.142953,-0.142953,0.537784,-0.05935,-0.532357
2,-0.275225,-1.318236,-0.446305,-0.470836,-0.441348,-0.444638,-0.437185,2.258788,-0.326571,-0.672270,...,1.715704,-0.03716,3.301074,-1.172902,-0.71055,0.142953,-0.142953,0.537784,-0.05935,-0.532357
3,-0.626768,-1.318236,-0.446305,-0.470836,-0.441348,-0.444638,2.287363,-0.442715,-0.326571,-0.672270,...,1.715704,-0.03716,-0.302932,0.852586,-0.71055,0.142953,-0.142953,0.537784,-0.05935,-0.532357
4,-0.099454,0.435747,-0.446305,2.123883,-0.441348,-0.444638,-0.437185,-0.442715,-0.326571,1.487497,...,-0.582851,-0.03716,3.301074,-1.172902,-0.71055,0.142953,-0.142953,0.537784,-0.05935,-0.532357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13671,0.252089,2.189730,-0.446305,-0.470836,2.265788,-0.444638,-0.437185,-0.442715,-0.326571,1.487497,...,1.715704,-0.03716,3.301074,-1.172902,-0.71055,0.142953,-0.142953,0.537784,-0.05935,-0.532357
13672,0.779403,-1.318236,-0.446305,2.123883,-0.441348,-0.444638,-0.437185,-0.442715,-0.326571,-0.672270,...,-0.582851,-0.03716,3.301074,-1.172902,-0.71055,0.142953,-0.142953,-1.859481,-0.05935,1.878437
13673,-0.802540,0.435747,-0.446305,-0.470836,2.265788,-0.444638,-0.437185,-0.442715,-0.326571,-0.672270,...,1.715704,-0.03716,-0.302932,-1.172902,1.40736,0.142953,-0.142953,-1.859481,-0.05935,1.878437
13674,-0.802540,0.435747,-0.446305,-0.470836,-0.441348,-0.444638,-0.437185,2.258788,-0.326571,-0.672270,...,1.715704,-0.03716,-0.302932,-1.172902,1.40736,0.142953,-0.142953,0.537784,-0.05935,-0.532357


In [25]:
# Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [26]:
# Fit an ordinary least-squares baseline on the preprocessed training features.
regression=LinearRegression()
regression.fit(X_train,y_train)

In [27]:
# Inspect the fitted coefficients, one per preprocessed feature column.
regression.coef_


array([ 2.38511452e+00,  2.14251086e+00, -5.07740492e+11, -5.25783544e+11,
       -5.03949879e+11, -5.06471100e+11, -5.00729150e+11, -5.05000541e+11,
       -4.10108128e+12, -6.43463000e+12, -6.64001217e+12, -5.93293621e+12,
       -6.30799368e+11, -6.33182687e+11, -6.34268767e+11, -6.36756829e+11,
        1.58727238e+11,  1.18683146e+12,  2.11176155e+12,  2.01960746e+12,
       -1.03071193e+13, -1.03071193e+13, -1.04371455e+13, -1.47976771e+12,
       -1.03785729e+13])

In [28]:
# Inspect the fitted intercept term.
regression.intercept_


26.323275547562673

In [29]:
import numpy as np
def evaluate_model(true , predicted):
    """Score a set of regression predictions against the true target values.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted target values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and the R2 score, in that order.
    """
    mse = mean_squared_error(true, predicted)
    mae = mean_absolute_error(true, predicted)
    # Derive the RMSE from the MSE computed above rather than computing it twice.
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [30]:
## Train multiple models and score each one on the held-out test split

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
model_list=[]
r2_list=[]

# Iterate over the dict directly instead of rebuilding key/value lists and
# indexing into them on every pass.
for model_name, model in models.items():
    model.fit(X_train,y_train)

    # Make predictions on the test features
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE(review): the heading says "Training", but the metrics printed below
    # are computed from X_test / y_test.
    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    # r2_list keeps the raw R2 fraction, in the same order as model_list.
    r2_list.append(r2_square)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 6.805779240140453
MAE: 5.3543706768778865
R2 score 46.449302185232


Lasso
Model Training Performance
RMSE: 7.312537830889024
MAE: 5.830649896378377
R2 score 38.17762704216171


Ridge
Model Training Performance
RMSE: 6.80549519808326
MAE: 5.353565464148302
R2 score 46.45377201371843


Elasticnet
Model Training Performance
RMSE: 7.229878270846078
MAE: 5.7841418591199085
R2 score 39.56738469869917


