In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from geopy.distance import geodesic  # For calculating distance
import os,sys

Load your dataset

In [4]:
scripts_path = os.path.abspath(os.path.join(os.getcwd(), '../Scripts'))
sys.path.append(scripts_path)

In [5]:
from paths import PATH_CLEAN_CSV

In [6]:
data= pd.read_csv(PATH_CLEAN_CSV) 

In [8]:
data

Unnamed: 0.1,Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category
0,0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,120,Clothing
1,1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,165,Electronics
2,2,njpu434582536,23,4.4,12.914264,77.678400,12.924264,77.688400,2022-03-19,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,130,Sports
3,3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics
4,4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,150,Toys
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40104,43733,gcxb277393192,35,4.2,23.371292,85.327872,23.481292,85.437872,2022-03-08,21:45:00,21:55:00,Windy,Jam,motorcycle,Metropolitian,165,Sports
40105,43734,jlxf819993117,30,4.8,26.902328,75.794257,26.912328,75.804257,2022-03-24,11:35:00,11:45:00,Windy,High,motorcycle,Metropolitian,160,Home
40106,43736,xnek760674819,30,4.9,13.022394,80.242439,13.052394,80.272439,2022-03-11,23:50:00,00:05:00,Cloudy,Low,scooter,Metropolitian,80,Home
40107,43737,cynl434665991,20,4.7,11.001753,76.986241,11.041753,77.026241,2022-03-07,13:35:00,13:40:00,Cloudy,High,motorcycle,Metropolitian,130,Kitchen


### 1. Data Preprocessing

In [17]:
data=data.drop('Unnamed: 0',axis=1)

In [18]:
data

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category
0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,120,Clothing
1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,165,Electronics
2,njpu434582536,23,4.4,12.914264,77.678400,12.924264,77.688400,2022-03-19,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,130,Sports
3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics
4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,150,Toys
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40104,gcxb277393192,35,4.2,23.371292,85.327872,23.481292,85.437872,2022-03-08,21:45:00,21:55:00,Windy,Jam,motorcycle,Metropolitian,165,Sports
40105,jlxf819993117,30,4.8,26.902328,75.794257,26.912328,75.804257,2022-03-24,11:35:00,11:45:00,Windy,High,motorcycle,Metropolitian,160,Home
40106,xnek760674819,30,4.9,13.022394,80.242439,13.052394,80.272439,2022-03-11,23:50:00,00:05:00,Cloudy,Low,scooter,Metropolitian,80,Home
40107,cynl434665991,20,4.7,11.001753,76.986241,11.041753,77.026241,2022-03-07,13:35:00,13:40:00,Cloudy,High,motorcycle,Metropolitian,130,Kitchen


Convert date and time columns to datetime

In [19]:
data['Order_Date'] = pd.to_datetime(data['Order_Date'])
data['Order_Time'] = pd.to_datetime(data['Order_Time'], format='%H:%M:%S').dt.time
data['Pickup_Time'] = pd.to_datetime(data['Pickup_Time'], format='%H:%M:%S').dt.time

Calculate distance between store and drop location

In [22]:
data['Distance'] = data.apply(lambda row: geodesic((row['Store_Latitude'], row['Store_Longitude']), 
                                              (row['Drop_Latitude'], row['Drop_Longitude'])).km, axis=1)

# Drop unnecessary columns

In [None]:

data = data.drop(['Order_ID', 'Store_Latitude', 'Store_Longitude', 'Drop_Latitude', 'Drop_Longitude', 
              'Order_Date', 'Order_Time', 'Pickup_Time'], axis=1)

In [29]:
data=data.drop(['Category'],axis=1)

In [30]:
data.head()

Unnamed: 0,Agent_Age,Agent_Rating,Weather,Traffic,Vehicle,Area,Delivery_Time,Distance
0,37,4.9,Sunny,High,motorcycle,Urban,120,3.020737
1,34,4.5,Stormy,Jam,scooter,Metropolitian,165,20.143737
2,23,4.4,Sandstorms,Low,motorcycle,Urban,130,1.549693
3,38,4.7,Sunny,Medium,motorcycle,Metropolitian,105,7.774497
4,32,4.6,Cloudy,High,scooter,Metropolitian,150,6.197898


One-hot encode categorical variables

In [34]:

data = pd.get_dummies(data, columns=['Weather', 'Traffic', 'Vehicle', 'Area'], drop_first=True)

In [35]:
data

Unnamed: 0,Agent_Age,Agent_Rating,Delivery_Time,Distance,Weather_Fog,Weather_Sandstorms,Weather_Stormy,Weather_Sunny,Weather_Windy,Traffic_Jam,Traffic_Low,Traffic_Medium,Vehicle_scooter,Vehicle_van,Area_Other,Area_Semi-Urban,Area_Urban
0,37,4.9,120,3.020737,False,False,False,True,False,False,False,False,False,False,False,False,True
1,34,4.5,165,20.143737,False,False,True,False,False,True,False,False,True,False,False,False,False
2,23,4.4,130,1.549693,False,True,False,False,False,False,True,False,False,False,False,False,True
3,38,4.7,105,7.774497,False,False,False,True,False,False,False,True,False,False,False,False,False
4,32,4.6,150,6.197898,False,False,False,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40104,35,4.2,165,16.576800,False,False,False,False,True,True,False,False,False,False,False,False,False
40105,30,4.8,160,1.488112,False,False,False,False,True,False,False,False,False,False,False,False,False
40106,30,4.9,80,4.648024,False,False,False,False,False,False,True,False,True,False,False,False,False
40107,20,4.7,130,6.219668,False,False,False,False,False,False,False,False,False,False,False,False,False


In [37]:
X = data.drop('Delivery_Time', axis=1)
y = data['Delivery_Time']

split data

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

In [68]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [70]:
model = LinearRegression()
model.fit(X_train, y_train)

In [71]:
# Predict on test data
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Absolute Error: 31.320381919969122
Mean Squared Error: 1757.672073480632
R-squared: 0.34863379854246734


In [56]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print(f"Cross-Validation R²: {scores.mean()}")

Cross-Validation R²: 0.3508979752998462


In [57]:
from sklearn.preprocessing import PolynomialFeatures

# Create polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Split and train the model
X_train_poly, X_test_poly, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)
model.fit(X_train_poly, y_train)

In [58]:
# Create polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)  # Transform the entire feature set

# Split the data into training and testing sets
X_train_poly, X_test_poly, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Train the model on polynomial features
model = LinearRegression()
model.fit(X_train_poly, y_train)

# Predict on the polynomial-transformed test data
y_pred = model.predict(X_test_poly)  # Use X_test_poly, not X_test

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Absolute Error: 30.424831224914808
Mean Squared Error: 1693.7906619202836
R-squared: 0.36931626884961244
