In [18]:
import pandas as pd
from pyproj import Geod

In [19]:
# Load the training data; the relative path is resolved against the current working directory.
df=pd.read_csv('finalTrain.csv')

In [20]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,0xd987,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23
2,0x2784,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,17:30,17:40,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21
3,0xc8b6,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,13-02-2022,09:20,09:30,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20
4,0xdb64,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,14-02-2022,19:50,20:05,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41


In [21]:
##Count missing values per column (nothing is filled here; imputation is done later by the SimpleImputer pipeline steps)
df.isnull().sum()

ID                                0
Delivery_person_ID                0
Delivery_person_Age            1854
Delivery_person_Ratings        1908
Restaurant_latitude               0
Restaurant_longitude              0
Delivery_location_latitude        0
Delivery_location_longitude       0
Order_Date                        0
Time_Orderd                    1731
Time_Order_picked                 0
Weather_conditions              616
Road_traffic_density            601
Vehicle_condition                 0
Type_of_order                     0
Type_of_vehicle                   0
multiple_deliveries             993
Festival                        228
City                           1200
Time_taken (min)                  0
dtype: int64

In [22]:
##Drop the identifier columns together with the order-type, date and timestamp columns, none of which are used as features
unused_cols=['ID','Delivery_person_ID',"Type_of_order","Order_Date","Time_Order_picked","Time_Orderd"]
df=df.drop(columns=unused_cols)

In [23]:
# Summarise the remaining columns, their non-null counts and dtypes after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45584 entries, 0 to 45583
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Delivery_person_Age          43730 non-null  float64
 1   Delivery_person_Ratings      43676 non-null  float64
 2   Restaurant_latitude          45584 non-null  float64
 3   Restaurant_longitude         45584 non-null  float64
 4   Delivery_location_latitude   45584 non-null  float64
 5   Delivery_location_longitude  45584 non-null  float64
 6   Weather_conditions           44968 non-null  object 
 7   Road_traffic_density         44983 non-null  object 
 8   Vehicle_condition            45584 non-null  int64  
 9   Type_of_vehicle              45584 non-null  object 
 10  multiple_deliveries          44591 non-null  float64
 11  Festival                     45356 non-null  object 
 12  City                         44384 non-null  object 
 13  Time_taken (min)

In [24]:
##Separate the independent features (X) from the dependent target (y)
target_col='Time_taken (min)'
y=df[[target_col]]  # double brackets keep y as a single-column DataFrame
X=df.drop(columns=[target_col])

In [25]:
#Segregate numerical and categorical features: object-dtype columns go through the ordinal-encoding pipeline, all remaining columns are only imputed and scaled
categorical_cols=X.select_dtypes(include='O').columns
numerical_cols=X.select_dtypes(exclude='O').columns


In [38]:
# Inspect which columns were classified as numerical.
numerical_cols

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Vehicle_condition',
       'multiple_deliveries'],
      dtype='object')

In [27]:
#Custom rank order for each ordinal categorical feature.
#The position of a value inside its list is the integer code OrdinalEncoder assigns to it.
Weather_conditions_map=[
    'Stormy',
    'Sandstorms',
    'Windy',
    'Fog',
    'Cloudy',
    'Sunny',
]
Road_traffic_density_map=[
    'Low',
    'Medium',
    'High',
    'Jam',
]
Type_of_vehicle_map=[
    'bicycle',
    'electric_scooter',
    'scooter',
    'motorcycle',
]
Festival_map=[
    'No',
    'Yes',
]
City_map=[
    'Semi-Urban',
    'Urban',
    'Metropolitian',
]





In [28]:
from sklearn.impute import SimpleImputer #for handling missing values
from sklearn.preprocessing import StandardScaler #for handling feature scaling
from sklearn.preprocessing import OrdinalEncoder #ordinal encoder

from sklearn.pipeline import Pipeline #for combining multiple steps
from sklearn.compose import ColumnTransformer#for connecting pipelines

In [29]:
##Numerical pipeline: fill missing values with the column median, then standardise
num_pipeline=Pipeline(
    steps=[
    ('Imputer',SimpleImputer(strategy='median')),
    ('scalar',StandardScaler())
       ]
)
##Rank order of every ordinal feature, keyed by column name.
##Looking the lists up by name keeps each list attached to its own column even if
##the order of categorical_cols changes; an unexpected object column raises a KeyError
##here instead of being encoded with the wrong ranking.
ordinal_categories={
    'Weather_conditions':Weather_conditions_map,
    'Road_traffic_density':Road_traffic_density_map,
    'Type_of_vehicle':Type_of_vehicle_map,
    'Festival':Festival_map,
    'City':City_map,
}
##categorical pipeline: fill missing values with the most frequent value, ordinal-encode, then standardise
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[ordinal_categories[col] for col in categorical_cols])),
    ('scalar',StandardScaler())
        ]
)
##combining cat and num pipelines: each pipeline is applied only to its own column group
preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
])

In [30]:
##Train Test Split: hold out 30% of the rows for evaluation; random_state is fixed so the split is reproducible
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=42)

In [31]:
# Fit the preprocessor on the training split only and reuse the fitted statistics
# on the test split, so nothing learned from the test rows influences training.
X_train_arr=preprocessor.fit_transform(X_train)
# Keep the transformed feature names as column labels (instead of 0..n-1) so that
# fitted coefficients can be traced back to the features they belong to.
feature_names=preprocessor.get_feature_names_out()
X_train=pd.DataFrame(X_train_arr,columns=feature_names)
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=feature_names)

In [32]:
##Model Training
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [33]:
import numpy as np
def evaluate_model(true,predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Predicted target values for the same samples.

    Returns
    -------
    tuple
        ``(mse, mae, rmse, rsq)``: mean squared error, mean absolute error,
        root mean squared error and R-squared score, in that order.
    """
    mean_sq_err=mean_squared_error(true,predicted)
    metrics=(
        mean_sq_err,
        mean_absolute_error(true,predicted),
        np.sqrt(mean_sq_err),  # RMSE is derived from the MSE computed above
        r2_score(true,predicted),
    )
    return metrics

In [34]:
# Candidate linear regressors, all with their default hyper-parameters.
models={
    'Linear Regression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elastic Net':ElasticNet()
}

model_list=[]  # model names, in evaluation order
rsq_list=[]    # matching R-squared scores on the test split, in percent
for model_name,model in models.items():
    model.fit(X_train,y_train)

    # Metrics are computed on the held-out test split, not on the data the model was fitted on.
    y_pred=model.predict(X_test)
    mse,mae,rmse,rsq=evaluate_model(y_test,y_pred)

    print(model_name)
    print("-"*20)
    model_list.append(model_name)
    rsq_list.append(rsq*100)
    print('model test performance :')
    print('MSE :',mse)
    print('MAE :',mae)
    print('RMSE:',rmse)
    print('R-Sq:',rsq*100)
    print("="*30,'\n')

Linear Regression
--------------------
model training performance :
MSE : 46.01170513893845
MAE : 5.356662298080637
RMSE: 6.783192842529132
R-Sq: 48.49337643174339

Lasso
--------------------
model training performance :
MSE : 51.45134486915368
MAE : 5.7416924671985745
RMSE: 7.172959282552333
R-Sq: 42.40411120922892

Ridge
--------------------
model training performance :
MSE : 46.011725071052005
MAE : 5.356667664586797
RMSE: 6.783194311756962
R-Sq: 48.49335411924994

Elastic Net
--------------------
model training performance :
MSE : 52.84023199106177
MAE : 5.841730509139369
RMSE: 7.269128695453244
R-Sq: 40.849357131958705



In [35]:
# Names of the evaluated models, in the order they were fitted.
model_list

['Linear Regression', 'Lasso', 'Ridge', 'Elastic Net']

In [36]:
# R-squared scores (in percent), positionally aligned with model_list.
rsq_list

[48.49337643174339, 42.40411120922892, 48.49335411924994, 40.849357131958705]

In [37]:
# Report the model with the highest R-squared score; on ties the first one in model_list wins.
best_rsq=max(rsq_list)
best_model=model_list[rsq_list.index(best_rsq)]
print(f'The accuracy of the model {best_model} is higher with R-squared value {best_rsq}')

The accuracy of the model Linear Regression is higher with R-squared value 48.49337643174339
