## Importing the essential libraries over here


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns',None)
%matplotlib inline

In [2]:
# Read the delivery records from a CSV file expected in the working directory.
data=pd.read_csv(filepath_or_buffer="amazon_delivery.csv")

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category
0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,120,Clothing
1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,165,Electronics
2,njpu434582536,23,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,130,Sports
3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics
4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,150,Toys


## Checking for duplicate observations and taking care of missing values over here

In [4]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
data.duplicated(keep='first').sum()

0

In [5]:
# Number of missing entries in each column.
data.isna().sum()

Order_ID            0
Agent_Age           0
Agent_Rating       54
Store_Latitude      0
Store_Longitude     0
Drop_Latitude       0
Drop_Longitude      0
Order_Date          0
Order_Time          0
Pickup_Time         0
Weather            91
Traffic             0
Vehicle             0
Area                0
Delivery_Time       0
Category            0
dtype: int64

In [6]:
# Collect every column that contains at least one missing value.
# The check is "any missing entry" rather than "more than one", so a column
# with exactly one missing value is reported as well.
missing_values=[feature for feature in data.columns if data[feature].isnull().any()]
# Print the affected column names, one per line.
for feature in missing_values:
  print(feature)

Agent_Rating
Weather


In [7]:
# Display only the columns collected in missing_values.
data.loc[:,missing_values]

Unnamed: 0,Agent_Rating,Weather
0,4.9,Sunny
1,4.5,Stormy
2,4.4,Sandstorms
3,4.7,Sunny
4,4.6,Cloudy
...,...,...
43734,4.8,Windy
43735,4.6,Windy
43736,4.9,Cloudy
43737,4.7,Cloudy


In [8]:
# Remove every row that has a missing value in any column; `data` itself is modified.
data.dropna(axis=0,how='any',inplace=True)

## Filtering all the numerical features over here

In [9]:
# Numerical features are the columns whose dtype is anything other than object ('O').
numerical_features=[name for name,kind in data.dtypes.items() if kind!='O']
# Print the selected column names, one per line.
for feature in numerical_features:
  print(feature)

Agent_Age
Agent_Rating
Store_Latitude
Store_Longitude
Drop_Latitude
Drop_Longitude
Delivery_Time


In [10]:
# Display only the columns listed in numerical_features.
data.loc[:,numerical_features]

Unnamed: 0,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Delivery_Time
0,37,4.9,22.745049,75.892471,22.765049,75.912471,120
1,34,4.5,12.913041,77.683237,13.043041,77.813237,165
2,23,4.4,12.914264,77.678400,12.924264,77.688400,130
3,38,4.7,11.003669,76.976494,11.053669,77.026494,105
4,32,4.6,12.972793,80.249982,13.012793,80.289982,150
...,...,...,...,...,...,...,...
43734,30,4.8,26.902328,75.794257,26.912328,75.804257,160
43735,21,4.6,0.000000,0.000000,0.070000,0.070000,180
43736,30,4.9,13.022394,80.242439,13.052394,80.272439,80
43737,20,4.7,11.001753,76.986241,11.041753,77.026241,130


## Filtering all the categorical features over here


In [11]:
# Categorical features are the columns stored with the object ('O') dtype.
cat_features=[name for name,kind in data.dtypes.items() if kind=='O']
# Print the selected column names, one per line.
for feature in cat_features:
  print(feature)

Order_ID
Order_Date
Order_Time
Pickup_Time
Weather
Traffic
Vehicle
Area
Category


In [12]:
# Display only the columns listed in cat_features.
data.loc[:,cat_features]

Unnamed: 0,Order_ID,Order_Date,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Category
0,ialx566343618,2022-03-19,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,Clothing
1,akqg208421122,2022-03-25,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,Electronics
2,njpu434582536,2022-03-19,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,Sports
3,rjto796129700,2022-04-05,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,Cosmetics
4,zguw716275638,2022-03-26,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,Toys
...,...,...,...,...,...,...,...,...,...
43734,jlxf819993117,2022-03-24,11:35:00,11:45:00,Windy,High,motorcycle,Metropolitian,Home
43735,aevx342135787,2022-02-16,19:55:00,20:10:00,Windy,Jam,motorcycle,Metropolitian,Jewelry
43736,xnek760674819,2022-03-11,23:50:00,00:05:00,Cloudy,Low,scooter,Metropolitian,Home
43737,cynl434665991,2022-03-07,13:35:00,13:40:00,Cloudy,High,motorcycle,Metropolitian,Kitchen


## Encoding the categorical features into numerical features over here

In [13]:
# Replace each category with an integer code: within a column, the first distinct
# value encountered becomes 0, the next one 1, and so on.
# NOTE(review): cat_features also contains Order_ID and the date/time columns, so they
# receive the same arbitrary first-appearance codes - confirm this is intended.
for feature in cat_features:
  distinct_values=data[feature].unique()
  feature_mapping=dict(zip(distinct_values,range(len(distinct_values))))
  data[feature]=data[feature].map(feature_mapping)

In [14]:
data

Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category
0,0,37,4.9,22.745049,75.892471,22.765049,75.912471,0,0,0,0,0,0,0,120,0
1,1,34,4.5,12.913041,77.683237,13.043041,77.813237,1,1,1,1,1,1,1,165,1
2,2,23,4.4,12.914264,77.678400,12.924264,77.688400,0,2,2,2,2,0,0,130,2
3,3,38,4.7,11.003669,76.976494,11.053669,77.026494,2,3,3,0,3,0,1,105,3
4,4,32,4.6,12.972793,80.249982,13.012793,80.289982,3,4,4,3,0,1,1,150,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43734,43589,30,4.8,26.902328,75.794257,26.912328,75.804257,27,136,0,5,0,0,1,160,13
43735,43590,21,4.6,0.000000,0.000000,0.070000,0.070000,36,81,103,5,1,0,1,180,8
43736,43591,30,4.9,13.022394,80.242439,13.052394,80.272439,4,52,65,3,2,1,1,80,13
43737,43592,20,4.7,11.001753,76.986241,11.041753,77.026241,34,29,30,3,0,0,1,130,12


In [15]:
# Move the target to the last position: pop() removes 'Delivery_Time' from the frame
# and returns it, and assigning it to the new name 'DeliveryTime' appends it as the
# final column.
data['DeliveryTime']=data.pop('Delivery_Time')

## Creating the features and labels over here

In [16]:
# Every column except the last one is a feature; the last column is the label.
feature_columns=data.columns[:-1]
label_column=data.columns[-1]
X=data[feature_columns].values
y=data[label_column].values

## Splitting the dataset into training set and testing set so that overfitting can be detected on unseen data over here

In [17]:
# Hold out 20% of the rows for testing; random_state=0 makes the shuffle repeatable.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=0,
  shuffle=True,
)

## Training the model on the training set over here

In [21]:
# Estimator and metric imports for the modelling section.
# RandomForestRegressor is imported once here (together with AdaBoostRegressor)
# instead of on two separate lines.
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Fit an XGBoost regressor on the training split.
# The seed is pinned explicitly, matching random_state=0 used for the train/test
# split, so the run does not rely on an implicit library default.
regressor=XGBRegressor(random_state=0)
regressor.fit(X_train,y_train)

# regressor=RandomForestRegressor(n_estimators=10,random_state=0)
# regressor.fit(X_train,y_train)

## Evaluating the performance of the model on the testing dataset over here

In [22]:
# Predict on the held-out rows, then print predictions (left column) next to the
# true values (right column) using two decimals of display precision.
y_pred=regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred,y_test)))

[[112.99 110.  ]
 [139.29 125.  ]
 [180.34 190.  ]
 ...
 [199.89 190.  ]
 [ 70.36  31.  ]
 [103.09  80.  ]]


## Checking metrics like the R-squared (R²) score over here

In [23]:
# R-squared (coefficient of determination) of the predictions on the test split.
from sklearn.metrics import r2_score
r2_score(y_true=y_test,y_pred=y_pred)

0.7874158090390567