In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Let's read the data
def load_file(path):
    data = pd.read_csv(path)
    return data
path = 'Food_Delivery_Times.csv'
df = load_file(path)
df.head()

Unnamed: 0,Order_ID,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs,Delivery_Time_min
0,522,7.93,Windy,Low,Afternoon,Scooter,12,1.0,43
1,738,16.42,Clear,Medium,Evening,Bike,20,2.0,84
2,741,9.52,Foggy,Low,Night,Scooter,28,1.0,59
3,661,7.44,Rainy,Medium,Afternoon,Scooter,5,1.0,37
4,412,19.03,Clear,Low,Morning,Bike,16,5.0,68


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Order_ID                1000 non-null   int64  
 1   Distance_km             1000 non-null   float64
 2   Weather                 970 non-null    object 
 3   Traffic_Level           970 non-null    object 
 4   Time_of_Day             970 non-null    object 
 5   Vehicle_Type            1000 non-null   object 
 6   Preparation_Time_min    1000 non-null   int64  
 7   Courier_Experience_yrs  970 non-null    float64
 8   Delivery_Time_min       1000 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 70.4+ KB


In [7]:
# Let's fill in the missing values
def fill_missing(data):
    for column in data.columns:
        if data[column].dtype in ['int64', 'float64']:
            # fill numeric columns with mean
            data[column].fillna(data[column].mean(), inplace=True)
        elif data[column].dtype == 'object':
            # fill categorical columns with mode
            data[column].fillna(data[column].mode()[0], inplace=True)
    return data
df = fill_missing(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Order_ID                1000 non-null   int64  
 1   Distance_km             1000 non-null   float64
 2   Weather                 1000 non-null   object 
 3   Traffic_Level           1000 non-null   object 
 4   Time_of_Day             1000 non-null   object 
 5   Vehicle_Type            1000 non-null   object 
 6   Preparation_Time_min    1000 non-null   int64  
 7   Courier_Experience_yrs  1000 non-null   float64
 8   Delivery_Time_min       1000 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 70.4+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)


In [10]:
X = df.drop(['Order_ID', 'Delivery_Time_min'], axis=1)
y = df['Delivery_Time_min']

In [14]:
# Function to encode all categorical data
def encode_all(df):
    df = df.copy()
    cat_columns = df.select_dtypes(include=['object']).columns
    encoded_df = pd.get_dummies(df, columns=cat_columns, drop_first=False)
    return encoded_df

X = encode_all(X)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Distance_km             1000 non-null   float64
 1   Preparation_Time_min    1000 non-null   int64  
 2   Courier_Experience_yrs  1000 non-null   float64
 3   Weather_Clear           1000 non-null   bool   
 4   Weather_Foggy           1000 non-null   bool   
 5   Weather_Rainy           1000 non-null   bool   
 6   Weather_Snowy           1000 non-null   bool   
 7   Weather_Windy           1000 non-null   bool   
 8   Traffic_Level_High      1000 non-null   bool   
 9   Traffic_Level_Low       1000 non-null   bool   
 10  Traffic_Level_Medium    1000 non-null   bool   
 11  Time_of_Day_Afternoon   1000 non-null   bool   
 12  Time_of_Day_Evening     1000 non-null   bool   
 13  Time_of_Day_Morning     1000 non-null   bool   
 14  Time_of_Day_Night       1000 non-null   b

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=21)
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=2, random_state=21)

model.fit(X_train, y_train)
y_hat = model.predict(X_test)

r2 = model.score(X_test, y_test)
print(f"R2: {r2:.2f}")

R2: 0.78
