In [62]:
import numpy as np
import pandas as pd
# Read the CSV into the working DataFrame; the path is resolved relative to the current working directory.
TRAIN_CSV_PATH = "data/finalTrain.csv"
df = pd.read_csv(TRAIN_CSV_PATH)

In [63]:
# Preview the first rows to check that the file was parsed into the expected columns.
df.head()

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,0xd987,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23
2,0x2784,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,17:30,17:40,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21
3,0xc8b6,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,13-02-2022,09:20,09:30,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20
4,0xdb64,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,14-02-2022,19:50,20:05,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41


In [64]:
# Sphere radius that the haversine result is scaled by (see distcalculate).
# NOTE(review): 6371 is presumably Earth's mean radius in kilometres, which would
# make the computed distances kilometres — confirm the intended unit.
R = 6371

def deg_to_rad(degrees):
    """Convert an angle from degrees to radians.

    The conversion is a plain multiplication, so it works for scalars and,
    element-wise, for NumPy arrays or pandas Series.
    """
    radians_per_degree = np.pi / 180
    return degrees * radians_per_degree

# Great-circle distance between two coordinates via the haversine formula
def distcalculate(lat1, lon1, lat2, lon2):
    """Return the haversine distance between (lat1, lon1) and (lat2, lon2).

    All four arguments are angles in degrees; they are converted with
    ``deg_to_rad`` before use. The central angle is multiplied by the
    module-level radius ``R``, so the result is in the same unit as ``R``.
    Only element-wise NumPy operations are used.
    """
    lat_diff = deg_to_rad(lat2 - lat1)
    lon_diff = deg_to_rad(lon2 - lon1)

    # Haversine of the central angle, split into its latitude and longitude parts.
    lat_term = np.sin(lat_diff / 2) ** 2
    lon_term = np.cos(deg_to_rad(lat1)) * np.cos(deg_to_rad(lat2)) * np.sin(lon_diff / 2) ** 2
    a = lat_term + lon_term

    central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * central_angle

# Calculate the restaurant-to-delivery distance for every row in one vectorised call.
# distcalculate is built purely from element-wise NumPy operations, so whole
# columns can be passed at once. This replaces a Python-level loop that wrote one
# cell at a time through df.loc (very slow on large frames) and that silently
# assumed the index labels were exactly 0..len(df)-1.
df['distance'] = distcalculate(
    df['Restaurant_latitude'],
    df['Restaurant_longitude'],
    df['Delivery_location_latitude'],
    df['Delivery_location_longitude'],
)

In [65]:
# Drop the columns that are not used as model inputs; the four coordinate columns
# have already been condensed into the 'distance' feature.
columns_to_drop = [
    "ID",
    "Delivery_person_ID",
    "Type_of_order",
    "Time_Order_picked",
    "Order_Date",
    "Time_Orderd",
    "Restaurant_latitude",
    "Restaurant_longitude",
    "Delivery_location_latitude",
    "Delivery_location_longitude",
]
df = df.drop(columns=columns_to_drop)

In [66]:
## Separate the dependent feature (target) from the independent features.
target_col = "Time_taken (min)"
Y = df[target_col]                  # dependent feature
X = df.drop(columns=[target_col])   # independent features

In [67]:
# Partition the feature names by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
numerical_cols = X.select_dtypes(exclude=['object']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

In [68]:
# Show which columns were classified as categorical.
categorical_cols

Index(['Weather_conditions', 'Road_traffic_density', 'Type_of_vehicle',
       'Festival', 'City'],
      dtype='object')

In [69]:
# Show which columns were classified as numerical.
numerical_cols

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Vehicle_condition',
       'multiple_deliveries', 'distance'],
      dtype='object')

In [70]:
from sklearn.impute import SimpleImputer ##Handling Missing Values
from sklearn.preprocessing import StandardScaler ##Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder ##Handling categorical data doing Ordinal Encoding
from sklearn.pipeline import Pipeline ## To create a pipeline so things happen in sequence
from sklearn.compose import ColumnTransformer ## To combine num and cat pipelines

In [71]:
# Explicit category orderings for the ordinal encoder: a value's position in its
# list is the integer it is encoded as. Spellings (e.g. "Metropolitian") must match
# the raw data exactly.
Weather_conditions_categories = [
    "Sunny",
    "Stormy",
    "Sandstorms",
    "Windy",
    "Fog",
    "Cloudy",
]
Road_Traffic_categories = [
    "Low",
    "Medium",
    "High",
    "Jam",
]
Type_of_vehicle_categories = [
    "bicycle",
    "electric_scooter",
    "scooter",
    "motorcycle",
]
Festival_categories = [
    "No",
    "Yes",
]
City_categories = [
    "Urban",
    "Metropolitian",
    "Semi-Urban",
]

In [72]:
## Numerical Pipeline: fill missing values with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scalar', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

## Categorical Pipeline: fill missing values with the most frequent category,
## map categories to their ordinal position, then standardise the codes.
# One ordering per categorical column; the list order must line up with the
# order of the columns in categorical_cols.
ordinal_categories = [
    Weather_conditions_categories,
    Road_Traffic_categories,
    Type_of_vehicle_categories,
    Festival_categories,
    City_categories,
]
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(categories=ordinal_categories)),
    ('scalar', StandardScaler()),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each group of columns through its own pipeline; the transformer names
# become the prefixes of the output feature names.
pre_processor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [73]:
# Train-Test Split
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=30,
)

In [74]:

# Fit the preprocessor on the training rows only, then apply the same fitted
# transformation to the test rows.
train_array = pre_processor.fit_transform(X_train)
test_array = pre_processor.transform(X_test)

# Wrap the transformed arrays back into DataFrames labelled with the generated feature names.
feature_names = pre_processor.get_feature_names_out()
X_train = pd.DataFrame(train_array, columns=feature_names)
X_test = pd.DataFrame(test_array, columns=feature_names)

In [75]:

# Inspect the transformed training features; column names carry the pipeline prefix.
X_train.head()

Unnamed: 0,num_pipeline__Delivery_person_Age,num_pipeline__Delivery_person_Ratings,num_pipeline__Vehicle_condition,num_pipeline__multiple_deliveries,num_pipeline__distance,cat_pipeline__Weather_conditions,cat_pipeline__Road_traffic_density,cat_pipeline__Type_of_vehicle,cat_pipeline__Festival,cat_pipeline__City
0,1.124025,-1.62344,-1.223528,-1.318236,-0.079476,-0.316528,1.308057,0.77763,-0.142953,0.516375
1,-0.106505,-3.143059,-1.223528,3.943714,-0.080867,-1.490785,1.308057,0.77763,6.995304,-1.859815
2,0.245075,0.504025,1.160323,0.435747,-0.086475,0.2706,-1.093916,-0.753919,-0.142953,0.516375
3,0.596655,-1.319517,1.160323,0.435747,-0.070761,0.2706,1.308057,-0.753919,-0.142953,0.516375
4,0.772445,-1.62344,-1.223528,0.435747,-0.086406,0.2706,-1.093916,0.77763,-0.142953,-1.859815


In [78]:
# Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [79]:
# Baseline: fit a plain linear regression on the preprocessed training features.
regression = LinearRegression()
regression.fit(X_train, y_train)

In [80]:
# Fitted coefficients, one per feature column of X_train.
regression.coef_

array([ 2.26245595, -2.38990887, -1.75705322,  1.9325933 ,  0.18603274,
        1.9465388 ,  3.18339408,  0.02829092,  1.4964173 ,  0.98974967])

In [81]:
# Fitted intercept of the linear model.
regression.intercept_

26.317694622038353

In [82]:
import numpy as np
def evaluate_model(true,predicted):
    """Score a set of predictions against the ground-truth values.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions, aligned position-by-position with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R² score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it, rather than scoring twice.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [83]:
## Train multiple Models

# Candidate regressors, all with their default hyper-parameters.
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
}
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Make Predictions
    # Note: the metrics below are computed on the held-out test split,
    # even though the printed header says "training performance".
    y_pred = model.predict(X_test)
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model training performance')
    print("RMSE",rmse)
    print("MAE",mae)
    print("R2 Score",r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')

LinearRegression
Model training performance
RMSE 6.45238463511149
MAE 5.128529167856683
R2 Score 51.8662258675579


Lasso
Model training performance
RMSE 6.8586110494624215
MAE 5.4754587999888695
R2 Score 45.614669936212294


Ridge
Model training performance
RMSE 6.4523822898103305
MAE 5.128527684034433
R2 Score 51.866260858706816


ElasticNet
Model training performance
RMSE 6.950146109200683
MAE 5.568245998943202
R2 Score 44.15332923480052


