In [1]:
import pandas as pd
import numpy as np
import os

-----------------------------------------
LOADING THE DOORDASH ETA DATASET
------------------------

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("dharun4772/doordash-eta-prediction")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/doordash-eta-prediction


In [3]:

files = os.listdir(path)

# 'files' contain ['historical_data.csv'] to load that csv

for file in files:
    if file.endswith(".csv"):
        dataset_path = os.path.join(path, file)
        df = pd.read_csv(dataset_path)
        print(f"Loaded data from {file}")
        break

Loaded data from historical_data.csv


----------------------------------------------
LOADED TIME SERIES DATAFRAME
-------------------------------

In [4]:
df

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
2,3.0,2015-01-22 20:39:28,2015-01-22 21:09:09,5477,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0
3,3.0,2015-02-03 21:21:45,2015-02-03 22:13:00,5477,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0
4,3.0,2015-02-15 02:40:36,2015-02-15 03:20:26,5477,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197423,1.0,2015-02-17 00:19:41,2015-02-17 01:24:48,2956,fast,4.0,3,1389,3,345,649,17.0,17.0,23.0,251,331.0
197424,1.0,2015-02-13 00:01:59,2015-02-13 00:58:22,2956,fast,4.0,6,3010,4,405,825,12.0,11.0,14.0,251,915.0
197425,1.0,2015-01-24 04:46:08,2015-01-24 05:36:16,2956,fast,4.0,5,1836,3,300,399,39.0,41.0,40.0,251,795.0
197426,1.0,2015-02-01 18:18:15,2015-02-01 19:23:22,3630,sandwich,1.0,1,1175,1,535,535,7.0,7.0,12.0,446,384.0


---------------------------------------------------------
Data Processing (Handling Null Values and Removing Outliers) & Feature Engineering
-----------------------------------------

----------------------------------
Checking for Null Values in Various Columns of Dataframe 'df'

In [5]:
df.isna().sum()

market_id                                         987
created_at                                          0
actual_delivery_time                                7
store_id                                            0
store_primary_category                           4760
order_protocol                                    995
total_items                                         0
subtotal                                            0
num_distinct_items                                  0
min_item_price                                      0
max_item_price                                      0
total_onshift_dashers                           16262
total_busy_dashers                              16262
total_outstanding_orders                        16262
estimated_order_place_duration                      0
estimated_store_to_consumer_driving_duration      526
dtype: int64

In [6]:
# Market_ID is checked for null values and these rows are removed

critical_columns = [
    'market_id',
    'actual_delivery_time'
]

# Removing rows with null values in the above mentioned attributes or columns
df = df.dropna(subset=critical_columns)

In [7]:
df.isna().sum()

market_id                                           0
created_at                                          0
actual_delivery_time                                0
store_id                                            0
store_primary_category                           4268
order_protocol                                    508
total_items                                         0
subtotal                                            0
num_distinct_items                                  0
min_item_price                                      0
max_item_price                                      0
total_onshift_dashers                           16194
total_busy_dashers                              16194
total_outstanding_orders                        16194
estimated_order_place_duration                      0
estimated_store_to_consumer_driving_duration      525
dtype: int64

In [8]:

def handle_null_values(df):
        
    # Store Primary Category is filled with a new category and filled as 'Uncategorized'
    df['store_primary_category'].fillna('Uncategorized', inplace=True)
    
    # Order Protocol doesn't have too many null values so I just filled with mostly adopted value (mode value)
    df['order_protocol'].fillna(df['order_protocol'].mode()[0], inplace=True)
    
    # Estimated Driving Duration is filled using the Median value throught our dataset and there are only 525 null values, so it will not
    # hamper the performance of the model
    df['estimated_store_to_consumer_driving_duration'].fillna(
        df['estimated_store_to_consumer_driving_duration'].median(), 
        inplace=True
    )
    
    return df

# Apply the function
df = handle_null_values(df)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['store_primary_category'].fillna('Uncategorized', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['store_primary_category'].fillna('Uncategorized', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=Tr

In [9]:
df.isna().sum()

market_id                                           0
created_at                                          0
actual_delivery_time                                0
store_id                                            0
store_primary_category                              0
order_protocol                                      0
total_items                                         0
subtotal                                            0
num_distinct_items                                  0
min_item_price                                      0
max_item_price                                      0
total_onshift_dashers                           16194
total_busy_dashers                              16194
total_outstanding_orders                        16194
estimated_order_place_duration                      0
estimated_store_to_consumer_driving_duration        0
dtype: int64

----------------------------------------------------
Price Range obtained instead of Min or Max prices

In [10]:

df['price_range'] = df['max_item_price'] - df['min_item_price']

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df['max_item_price'] - df['min_item_price']


Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration,price_range
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0,682
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0,0
2,3.0,2015-01-22 20:39:28,2015-01-22 21:09:09,5477,Uncategorized,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0,0
3,3.0,2015-02-03 21:21:45,2015-02-03 22:13:00,5477,Uncategorized,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0,1200
4,3.0,2015-02-15 02:40:36,2015-02-15 03:20:26,5477,Uncategorized,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0,500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197423,1.0,2015-02-17 00:19:41,2015-02-17 01:24:48,2956,fast,4.0,3,1389,3,345,649,17.0,17.0,23.0,251,331.0,304
197424,1.0,2015-02-13 00:01:59,2015-02-13 00:58:22,2956,fast,4.0,6,3010,4,405,825,12.0,11.0,14.0,251,915.0,420
197425,1.0,2015-01-24 04:46:08,2015-01-24 05:36:16,2956,fast,4.0,5,1836,3,300,399,39.0,41.0,40.0,251,795.0,99
197426,1.0,2015-02-01 18:18:15,2015-02-01 19:23:22,3630,sandwich,1.0,1,1175,1,535,535,7.0,7.0,12.0,446,384.0,0


In [11]:
# Mean value is used to fill the null values in these three attributes or columns

df['total_onshift_dashers'].fillna(df['total_onshift_dashers'].mean(), inplace=True)
df['total_busy_dashers'].fillna(df['total_busy_dashers'].mean(), inplace=True)
df['total_outstanding_orders'].fillna(df['total_outstanding_orders'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['total_onshift_dashers'].fillna(df['total_onshift_dashers'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_onshift_dashers'].fillna(df['total_onshift_dashers'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using

-------------------------------------------------------
Preprocessing the date, time values present in 'created_at' and 'actual_delivery_time' attributes

In [12]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

data = df.copy()

# Convert timestamps
data['created_at'] = pd.to_datetime(data['created_at'])
data['actual_delivery_time'] = pd.to_datetime(data['actual_delivery_time'])

# Extract time-based features
data['hour_of_day'] = data['created_at'].dt.hour
data['day_of_week'] = data['created_at'].dt.dayofweek
data['month'] = data['created_at'].dt.month

# Compute the delivery duration in minutes
data['delivery_duration_minutes'] = (
    (data['actual_delivery_time'] - data['created_at']).dt.total_seconds() / 60
)

# Encode categorical variables
le = LabelEncoder()
data['store_primary_category_encoded'] = le.fit_transform(data['store_primary_category'])


In [13]:
data.columns

Index(['market_id', 'created_at', 'actual_delivery_time', 'store_id',
       'store_primary_category', 'order_protocol', 'total_items', 'subtotal',
       'num_distinct_items', 'min_item_price', 'max_item_price',
       'total_onshift_dashers', 'total_busy_dashers',
       'total_outstanding_orders', 'estimated_order_place_duration',
       'estimated_store_to_consumer_driving_duration', 'price_range',
       'hour_of_day', 'day_of_week', 'month', 'delivery_duration_minutes',
       'store_primary_category_encoded'],
      dtype='object')

In [14]:

# Dashers per order available

data['dashers_per_order'] = data['total_onshift_dashers'] / (data['total_outstanding_orders'] + 1e-5)

In [15]:

# Total time from placing the order from Doordash to the point where it reaches to the consumer

data['orderPlaced_to_home_location_time'] = data['estimated_order_place_duration'] + data['estimated_store_to_consumer_driving_duration']

data.head(10)

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,...,estimated_order_place_duration,estimated_store_to_consumer_driving_duration,price_range,hour_of_day,day_of_week,month,delivery_duration_minutes,store_primary_category_encoded,dashers_per_order,orderPlaced_to_home_location_time
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,american,1.0,4,3441,4,557,...,446,861.0,682,22,4,2,62.983333,5,1.571428,1307.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,mexican,2.0,1,1900,1,1400,...,446,690.0,0,21,1,2,67.066667,48,0.499998,1136.0
2,3.0,2015-01-22 20:39:28,2015-01-22 21:09:09,5477,Uncategorized,1.0,1,1900,1,1900,...,446,690.0,0,20,3,1,29.683333,0,100000.0,1136.0
3,3.0,2015-02-03 21:21:45,2015-02-03 22:13:00,5477,Uncategorized,1.0,6,6900,5,600,...,446,289.0,1200,21,1,2,51.25,0,0.499998,735.0
4,3.0,2015-02-15 02:40:36,2015-02-15 03:20:26,5477,Uncategorized,1.0,3,3900,3,1100,...,446,650.0,500,2,6,2,39.833333,0,0.666666,1096.0
5,3.0,2015-01-28 20:30:38,2015-01-28 21:08:58,5477,Uncategorized,1.0,3,5000,3,1500,...,446,338.0,400,20,2,1,38.333333,0,0.999995,784.0
6,3.0,2015-01-31 02:16:36,2015-01-31 02:43:00,5477,Uncategorized,1.0,2,3900,2,1200,...,446,638.0,1500,2,5,1,26.4,0,1.11111,1084.0
7,3.0,2015-02-12 03:03:35,2015-02-12 03:36:20,5477,Uncategorized,1.0,4,4850,4,750,...,446,626.0,1050,3,3,2,32.75,0,0.999999,1072.0
8,2.0,2015-02-16 00:11:35,2015-02-16 00:38:01,5477,indian,3.0,4,4771,3,820,...,446,289.0,784,0,0,2,26.433333,37,0.444444,735.0
9,3.0,2015-02-18 01:15:45,2015-02-18 02:08:57,5477,Uncategorized,1.0,2,2100,2,700,...,446,715.0,500,1,2,2,53.2,0,0.999995,1161.0


In [16]:
# Estimating the Delivery Speed and Average Delivery Time

data['Avg_delivery_time'] = data.groupby(['store_id', 'hour_of_day'])['delivery_duration_minutes'].transform('mean')

data['Delivery_Speed'] = data['Avg_delivery_time'] / (data['estimated_store_to_consumer_driving_duration'] / 60 + 1e-5)



In [17]:
data['Avg_price'] = (data['min_item_price'] + data['max_item_price'])/2
data.head()

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,...,hour_of_day,day_of_week,month,delivery_duration_minutes,store_primary_category_encoded,dashers_per_order,orderPlaced_to_home_location_time,Avg_delivery_time,Delivery_Speed,Avg_price
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,american,1.0,4,3441,4,557,...,22,4,2,62.983333,5,1.571428,1307.0,62.983333,4.389079,898.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,mexican,2.0,1,1900,1,1400,...,21,1,2,67.066667,48,0.499998,1136.0,59.158333,5.144198,1400.0
2,3.0,2015-01-22 20:39:28,2015-01-22 21:09:09,5477,Uncategorized,1.0,1,1900,1,1900,...,20,3,1,29.683333,0,100000.0,1136.0,34.008333,2.957244,1900.0
3,3.0,2015-02-03 21:21:45,2015-02-03 22:13:00,5477,Uncategorized,1.0,6,6900,5,600,...,21,1,2,51.25,0,0.499998,735.0,59.158333,12.281981,1200.0
4,3.0,2015-02-15 02:40:36,2015-02-15 03:20:26,5477,Uncategorized,1.0,3,3900,3,1100,...,2,6,2,39.833333,0,0.666666,1096.0,33.116667,3.05692,1350.0


-----------------------------------------------------
Removing Outliers from the Dataset
---------------------------------------------------

In [18]:
def remove_outliers(df, outlier_columns):
    # Iterate over each column to remove outliers based on IQR
    for col in outlier_columns:
        # Calculate the Q1 (25th percentile) and Q3 (75th percentile)
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        
        # Calculate the IQR (Interquartile Range)
        IQR = Q3 - Q1
        
        # Define the lower and upper bounds for outliers
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Remove rows where the column values are outside the bounds
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    
    return df

outlier_columns = [
    'subtotal', 'delivery_duration_minutes', 'num_distinct_items', 'price_range','max_item_price', 'Delivery_Speed','Avg_price'
]

data_cleaned = remove_outliers(data, outlier_columns)

# Length of the cleaned dataset after removing outliers
print(f"Original dataset size: {len(data)}")
print(f"Cleaned dataset size: {len(data_cleaned)}")


Original dataset size: 196434
Cleaned dataset size: 156055


In [19]:
data_cleaned.columns

Index(['market_id', 'created_at', 'actual_delivery_time', 'store_id',
       'store_primary_category', 'order_protocol', 'total_items', 'subtotal',
       'num_distinct_items', 'min_item_price', 'max_item_price',
       'total_onshift_dashers', 'total_busy_dashers',
       'total_outstanding_orders', 'estimated_order_place_duration',
       'estimated_store_to_consumer_driving_duration', 'price_range',
       'hour_of_day', 'day_of_week', 'month', 'delivery_duration_minutes',
       'store_primary_category_encoded', 'dashers_per_order',
       'orderPlaced_to_home_location_time', 'Avg_delivery_time',
       'Delivery_Speed', 'Avg_price'],
      dtype='object')

In [20]:
# We have the date time in numerical format and also actual delivery time in minutes
# Dropping these three columns 'created_at', 'actual_delivery_time', 'store_primary_category'

data_cleaned = data_cleaned.drop(columns=['created_at', 'actual_delivery_time', 'store_primary_category'],axis=1)

------------------------------------------------------
Model Training using XGBoost model
-----------------------------------

In [21]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tqdm import tqdm
import time

# Dataloader for our training dataset
class DeliveryDataset(Dataset):
    def __init__(self, features, target):
        self.features = features
        self.target = target
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return torch.tensor(self.features[idx], dtype=torch.float32), torch.tensor(self.target[idx], dtype=torch.float32)

# Function is used to define the features which will be used for our model training
def select_features(data):
    features = [
        'market_id',
        'store_id',
        'order_protocol',
        'total_items',
        'subtotal',
        'num_distinct_items',
        'min_item_price',
        'max_item_price',
        'total_onshift_dashers',
        'total_busy_dashers',
        'total_outstanding_orders',
        'estimated_order_place_duration',
        'estimated_store_to_consumer_driving_duration',
        'price_range',
        'hour_of_day',
        'day_of_week',
        'month',
        'store_primary_category_encoded',
        'dashers_per_order',
        'orderPlaced_to_home_location_time',
        'Avg_delivery_time',
        'Delivery_Speed',
        'Avg_price'
    ]
    
    X = data[features]
    y = data['delivery_duration_minutes']
    
    return X, y


def train_and_evaluate_model(X, y):

    # Data is split into training and testing parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Scaling the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
   # Using the Dataloader for using small batches to run our code
    train_dataset = DeliveryDataset(X_train_scaled, y_train)
    test_dataset = DeliveryDataset(X_test_scaled, y_test)
    
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    # Checking whether GPU is avaliable or not
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {device}')
    
    # XGBoost Model (GPU Acceleration)
    model = xgb.XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42,
        tree_method='gpu_hist'
    )
    
    # Model training 
    start_time = time.time()
    for i in tqdm(range(100), desc="Training Model", unit="round"):
        model.fit(X_train_scaled, y_train, verbose=False)
        
        elapsed_time = time.time() - start_time
        estimated_total_time = elapsed_time * 100 / (i + 1)
        remaining_time = estimated_total_time - elapsed_time
        tqdm.write(f"Round {i+1} - Time elapsed: {elapsed_time:.2f}s - Estimated time left: {remaining_time:.2f}s")

    
    y_pred = model.predict(X_test_scaled)
    
    
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    

    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    return {
        'model': model,
        'mae': mae,
        'mse': mse,
        'rmse': rmse,
        'r2': r2,
        'feature_importance': feature_importance
    }



def main(df):

    # Selecting Features and Target label for our data
    X, y = select_features(df)

    # Using the function to train the model
    results = train_and_evaluate_model(X, y)
    
    # Print Results
    print("\nModel Performance Metrics:")
    print(f"Mean Absolute Error: {results['mae']:.2f}")
    print(f"Mean Squared Error: {results['mse']:.2f}")
    print(f"Root Mean Squared Error: {results['rmse']:.2f}")
    print(f"R-squared Score: {results['r2']:.2f}")
    
    # Display Feature Importance
    print("\nTop 10 Most Important Features:")
    print(results['feature_importance'].head(10))
    
    return results


results = main(data_cleaned) 


Using device: cuda



    E.g. tree_method = "hist", device = "cuda"

Training Model:   1%|          | 1/100 [00:00<01:20,  1.23round/s]

Round 1 - Time elapsed: 0.81s - Estimated time left: 80.42s


Training Model:   2%|▏         | 2/100 [00:01<01:04,  1.51round/s]

Round 2 - Time elapsed: 1.37s - Estimated time left: 66.97s



    E.g. tree_method = "hist", device = "cuda"

Training Model:   3%|▎         | 3/100 [00:01<00:59,  1.64round/s]

Round 3 - Time elapsed: 1.92s - Estimated time left: 61.93s


Training Model:   4%|▍         | 4/100 [00:02<00:56,  1.71round/s]

Round 4 - Time elapsed: 2.47s - Estimated time left: 59.20s



    E.g. tree_method = "hist", device = "cuda"

Training Model:   5%|▌         | 5/100 [00:03<00:54,  1.74round/s]

Round 5 - Time elapsed: 3.02s - Estimated time left: 57.47s



    E.g. tree_method = "hist", device = "cuda"

Training Model:   6%|▌         | 6/100 [00:03<00:53,  1.76round/s]

Round 6 - Time elapsed: 3.58s - Estimated time left: 56.05s


Training Model:   7%|▋         | 7/100 [00:04<00:52,  1.78round/s]

Round 7 - Time elapsed: 4.13s - Estimated time left: 54.82s



    E.g. tree_method = "hist", device = "cuda"

Training Model:   8%|▊         | 8/100 [00:04<00:51,  1.79round/s]

Round 8 - Time elapsed: 4.68s - Estimated time left: 53.81s


Training Model:   9%|▉         | 9/100 [00:05<00:50,  1.80round/s]

Round 9 - Time elapsed: 5.23s - Estimated time left: 52.85s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  10%|█         | 10/100 [00:05<00:49,  1.80round/s]

Round 10 - Time elapsed: 5.78s - Estimated time left: 52.02s


Training Model:  11%|█         | 11/100 [00:06<00:49,  1.81round/s]

Round 11 - Time elapsed: 6.33s - Estimated time left: 51.19s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  12%|█▏        | 12/100 [00:06<00:49,  1.79round/s]

Round 12 - Time elapsed: 6.90s - Estimated time left: 50.61s


Training Model:  13%|█▎        | 13/100 [00:07<00:48,  1.79round/s]

Round 13 - Time elapsed: 7.46s - Estimated time left: 49.94s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  14%|█▍        | 14/100 [00:08<00:47,  1.80round/s]

Round 14 - Time elapsed: 8.01s - Estimated time left: 49.22s


Training Model:  15%|█▌        | 15/100 [00:08<00:47,  1.80round/s]

Round 15 - Time elapsed: 8.56s - Estimated time left: 48.53s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  16%|█▌        | 16/100 [00:09<00:46,  1.80round/s]

Round 16 - Time elapsed: 9.12s - Estimated time left: 47.86s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  17%|█▋        | 17/100 [00:09<00:45,  1.81round/s]

Round 17 - Time elapsed: 9.67s - Estimated time left: 47.19s


Training Model:  18%|█▊        | 18/100 [00:10<00:45,  1.82round/s]

Round 18 - Time elapsed: 10.21s - Estimated time left: 46.52s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  19%|█▉        | 19/100 [00:10<00:44,  1.82round/s]

Round 19 - Time elapsed: 10.76s - Estimated time left: 45.87s


Training Model:  20%|██        | 20/100 [00:11<00:44,  1.82round/s]

Round 20 - Time elapsed: 11.31s - Estimated time left: 45.25s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  21%|██        | 21/100 [00:11<00:43,  1.81round/s]

Round 21 - Time elapsed: 11.86s - Estimated time left: 44.63s


Training Model:  22%|██▏       | 22/100 [00:12<00:42,  1.82round/s]

Round 22 - Time elapsed: 12.41s - Estimated time left: 44.00s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  23%|██▎       | 23/100 [00:12<00:42,  1.82round/s]

Round 23 - Time elapsed: 12.96s - Estimated time left: 43.37s


Training Model:  24%|██▍       | 24/100 [00:13<00:41,  1.82round/s]

Round 24 - Time elapsed: 13.50s - Estimated time left: 42.76s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  25%|██▌       | 25/100 [00:14<00:41,  1.83round/s]

Round 25 - Time elapsed: 14.05s - Estimated time left: 42.15s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  26%|██▌       | 26/100 [00:14<00:40,  1.82round/s]

Round 26 - Time elapsed: 14.60s - Estimated time left: 41.55s


Training Model:  27%|██▋       | 27/100 [00:15<00:40,  1.82round/s]

Round 27 - Time elapsed: 15.15s - Estimated time left: 40.96s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  28%|██▊       | 28/100 [00:15<00:39,  1.82round/s]

Round 28 - Time elapsed: 15.70s - Estimated time left: 40.36s


Training Model:  29%|██▉       | 29/100 [00:16<00:38,  1.83round/s]

Round 29 - Time elapsed: 16.24s - Estimated time left: 39.76s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  30%|███       | 30/100 [00:16<00:38,  1.83round/s]

Round 30 - Time elapsed: 16.79s - Estimated time left: 39.17s


Training Model:  31%|███       | 31/100 [00:17<00:37,  1.82round/s]

Round 31 - Time elapsed: 17.35s - Estimated time left: 38.61s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  32%|███▏      | 32/100 [00:17<00:37,  1.82round/s]

Round 32 - Time elapsed: 17.89s - Estimated time left: 38.02s


Training Model:  33%|███▎      | 33/100 [00:18<00:36,  1.82round/s]

Round 33 - Time elapsed: 18.44s - Estimated time left: 37.44s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  34%|███▍      | 34/100 [00:18<00:36,  1.83round/s]

Round 34 - Time elapsed: 18.99s - Estimated time left: 36.85s


Training Model:  35%|███▌      | 35/100 [00:19<00:35,  1.83round/s]

Round 35 - Time elapsed: 19.53s - Estimated time left: 36.27s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  36%|███▌      | 36/100 [00:20<00:35,  1.83round/s]

Round 36 - Time elapsed: 20.08s - Estimated time left: 35.69s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  37%|███▋      | 37/100 [00:20<00:34,  1.82round/s]

Round 37 - Time elapsed: 20.63s - Estimated time left: 35.13s


Training Model:  38%|███▊      | 38/100 [00:21<00:33,  1.83round/s]

Round 38 - Time elapsed: 21.17s - Estimated time left: 34.55s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  39%|███▉      | 39/100 [00:21<00:35,  1.71round/s]

Round 39 - Time elapsed: 21.84s - Estimated time left: 34.17s


Training Model:  40%|████      | 40/100 [00:22<00:35,  1.67round/s]

Round 40 - Time elapsed: 22.48s - Estimated time left: 33.72s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  41%|████      | 41/100 [00:23<00:35,  1.64round/s]

Round 41 - Time elapsed: 23.11s - Estimated time left: 33.26s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  42%|████▏     | 42/100 [00:23<00:34,  1.70round/s]

Round 42 - Time elapsed: 23.66s - Estimated time left: 32.67s


Training Model:  43%|████▎     | 43/100 [00:24<00:32,  1.74round/s]

Round 43 - Time elapsed: 24.20s - Estimated time left: 32.07s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  44%|████▍     | 44/100 [00:24<00:31,  1.76round/s]

Round 44 - Time elapsed: 24.75s - Estimated time left: 31.50s


Training Model:  45%|████▌     | 45/100 [00:25<00:30,  1.78round/s]

Round 45 - Time elapsed: 25.30s - Estimated time left: 30.92s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  46%|████▌     | 46/100 [00:25<00:30,  1.79round/s]

Round 46 - Time elapsed: 25.85s - Estimated time left: 30.34s


Training Model:  47%|████▋     | 47/100 [00:26<00:29,  1.81round/s]

Round 47 - Time elapsed: 26.39s - Estimated time left: 29.76s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  48%|████▊     | 48/100 [00:26<00:28,  1.80round/s]

Round 48 - Time elapsed: 26.95s - Estimated time left: 29.19s


Training Model:  49%|████▉     | 49/100 [00:27<00:28,  1.80round/s]

Round 49 - Time elapsed: 27.51s - Estimated time left: 28.63s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  50%|█████     | 50/100 [00:28<00:27,  1.81round/s]

Round 50 - Time elapsed: 28.05s - Estimated time left: 28.05s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  51%|█████     | 51/100 [00:28<00:27,  1.81round/s]

Round 51 - Time elapsed: 28.60s - Estimated time left: 27.48s


Training Model:  52%|█████▏    | 52/100 [00:29<00:26,  1.82round/s]

Round 52 - Time elapsed: 29.15s - Estimated time left: 26.91s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  53%|█████▎    | 53/100 [00:29<00:25,  1.82round/s]

Round 53 - Time elapsed: 29.69s - Estimated time left: 26.33s


Training Model:  54%|█████▍    | 54/100 [00:30<00:25,  1.82round/s]

Round 54 - Time elapsed: 30.24s - Estimated time left: 25.76s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  55%|█████▌    | 55/100 [00:30<00:24,  1.83round/s]

Round 55 - Time elapsed: 30.79s - Estimated time left: 25.19s


Training Model:  56%|█████▌    | 56/100 [00:31<00:24,  1.83round/s]

Round 56 - Time elapsed: 31.33s - Estimated time left: 24.62s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  57%|█████▋    | 57/100 [00:31<00:23,  1.83round/s]

Round 57 - Time elapsed: 31.88s - Estimated time left: 24.05s


Training Model:  58%|█████▊    | 58/100 [00:32<00:22,  1.83round/s]

Round 58 - Time elapsed: 32.43s - Estimated time left: 23.48s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  59%|█████▉    | 59/100 [00:32<00:22,  1.83round/s]

Round 59 - Time elapsed: 32.97s - Estimated time left: 22.91s


Training Model:  60%|██████    | 60/100 [00:33<00:21,  1.83round/s]

Round 60 - Time elapsed: 33.52s - Estimated time left: 22.34s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  61%|██████    | 61/100 [00:34<00:21,  1.84round/s]

Round 61 - Time elapsed: 34.06s - Estimated time left: 21.78s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  62%|██████▏   | 62/100 [00:34<00:20,  1.84round/s]

Round 62 - Time elapsed: 34.60s - Estimated time left: 21.21s


Training Model:  63%|██████▎   | 63/100 [00:35<00:20,  1.83round/s]

Round 63 - Time elapsed: 35.15s - Estimated time left: 20.64s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  64%|██████▍   | 64/100 [00:35<00:19,  1.83round/s]

Round 64 - Time elapsed: 35.70s - Estimated time left: 20.08s


Training Model:  65%|██████▌   | 65/100 [00:36<00:19,  1.82round/s]

Round 65 - Time elapsed: 36.26s - Estimated time left: 19.52s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  66%|██████▌   | 66/100 [00:36<00:18,  1.82round/s]

Round 66 - Time elapsed: 36.80s - Estimated time left: 18.96s


Training Model:  67%|██████▋   | 67/100 [00:37<00:18,  1.82round/s]

Round 67 - Time elapsed: 37.36s - Estimated time left: 18.40s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  68%|██████▊   | 68/100 [00:37<00:17,  1.82round/s]

Round 68 - Time elapsed: 37.90s - Estimated time left: 17.84s


Training Model:  69%|██████▉   | 69/100 [00:38<00:17,  1.82round/s]

Round 69 - Time elapsed: 38.45s - Estimated time left: 17.27s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  70%|███████   | 70/100 [00:38<00:16,  1.82round/s]

Round 70 - Time elapsed: 39.00s - Estimated time left: 16.71s


Training Model:  71%|███████   | 71/100 [00:39<00:15,  1.83round/s]

Round 71 - Time elapsed: 39.54s - Estimated time left: 16.15s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  72%|███████▏  | 72/100 [00:40<00:15,  1.83round/s]

Round 72 - Time elapsed: 40.09s - Estimated time left: 15.59s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  73%|███████▎  | 73/100 [00:40<00:14,  1.83round/s]

Round 73 - Time elapsed: 40.63s - Estimated time left: 15.03s


Training Model:  74%|███████▍  | 74/100 [00:41<00:14,  1.83round/s]

Round 74 - Time elapsed: 41.18s - Estimated time left: 14.47s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  75%|███████▌  | 75/100 [00:41<00:13,  1.83round/s]

Round 75 - Time elapsed: 41.72s - Estimated time left: 13.91s


Training Model:  76%|███████▌  | 76/100 [00:42<00:13,  1.83round/s]

Round 76 - Time elapsed: 42.27s - Estimated time left: 13.35s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  77%|███████▋  | 77/100 [00:42<00:12,  1.84round/s]

Round 77 - Time elapsed: 42.81s - Estimated time left: 12.79s


Training Model:  78%|███████▊  | 78/100 [00:43<00:11,  1.84round/s]

Round 78 - Time elapsed: 43.35s - Estimated time left: 12.23s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  79%|███████▉  | 79/100 [00:43<00:11,  1.83round/s]

Round 79 - Time elapsed: 43.90s - Estimated time left: 11.67s


Training Model:  80%|████████  | 80/100 [00:44<00:10,  1.83round/s]

Round 80 - Time elapsed: 44.45s - Estimated time left: 11.11s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  81%|████████  | 81/100 [00:44<00:10,  1.83round/s]

Round 81 - Time elapsed: 44.99s - Estimated time left: 10.55s


Training Model:  82%|████████▏ | 82/100 [00:45<00:09,  1.83round/s]

Round 82 - Time elapsed: 45.54s - Estimated time left: 10.00s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  83%|████████▎ | 83/100 [00:46<00:09,  1.83round/s]

Round 83 - Time elapsed: 46.09s - Estimated time left: 9.44s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  84%|████████▍ | 84/100 [00:46<00:08,  1.83round/s]

Round 84 - Time elapsed: 46.63s - Estimated time left: 8.88s


Training Model:  85%|████████▌ | 85/100 [00:47<00:08,  1.83round/s]

Round 85 - Time elapsed: 47.18s - Estimated time left: 8.33s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  86%|████████▌ | 86/100 [00:47<00:07,  1.81round/s]

Round 86 - Time elapsed: 47.75s - Estimated time left: 7.77s


Training Model:  87%|████████▋ | 87/100 [00:48<00:07,  1.81round/s]

Round 87 - Time elapsed: 48.30s - Estimated time left: 7.22s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  88%|████████▊ | 88/100 [00:48<00:06,  1.82round/s]

Round 88 - Time elapsed: 48.84s - Estimated time left: 6.66s


Training Model:  89%|████████▉ | 89/100 [00:49<00:06,  1.83round/s]

Round 89 - Time elapsed: 49.39s - Estimated time left: 6.10s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  90%|█████████ | 90/100 [00:49<00:05,  1.83round/s]

Round 90 - Time elapsed: 49.93s - Estimated time left: 5.55s


Training Model:  91%|█████████ | 91/100 [00:50<00:04,  1.83round/s]

Round 91 - Time elapsed: 50.48s - Estimated time left: 4.99s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  92%|█████████▏| 92/100 [00:51<00:04,  1.83round/s]

Round 92 - Time elapsed: 51.02s - Estimated time left: 4.44s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  93%|█████████▎| 93/100 [00:51<00:03,  1.83round/s]

Round 93 - Time elapsed: 51.57s - Estimated time left: 3.88s


Training Model:  94%|█████████▍| 94/100 [00:52<00:03,  1.83round/s]

Round 94 - Time elapsed: 52.12s - Estimated time left: 3.33s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  95%|█████████▌| 95/100 [00:52<00:02,  1.83round/s]

Round 95 - Time elapsed: 52.66s - Estimated time left: 2.77s


Training Model:  96%|█████████▌| 96/100 [00:53<00:02,  1.83round/s]

Round 96 - Time elapsed: 53.21s - Estimated time left: 2.22s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  97%|█████████▋| 97/100 [00:53<00:01,  1.77round/s]

Round 97 - Time elapsed: 53.81s - Estimated time left: 1.66s


Training Model:  98%|█████████▊| 98/100 [00:54<00:01,  1.67round/s]

Round 98 - Time elapsed: 54.49s - Estimated time left: 1.11s



    E.g. tree_method = "hist", device = "cuda"

Training Model:  99%|█████████▉| 99/100 [00:55<00:00,  1.67round/s]

Round 99 - Time elapsed: 55.09s - Estimated time left: 0.56s



    E.g. tree_method = "hist", device = "cuda"

Training Model: 100%|██████████| 100/100 [00:55<00:00,  1.80round/s]

Round 100 - Time elapsed: 55.64s - Estimated time left: 0.00s

Model Performance Metrics:
Mean Absolute Error: 7.80
Mean Squared Error: 103.25
Root Mean Squared Error: 10.16
R-squared Score: 0.50

Top 10 Most Important Features:
                                         feature  importance
20                             Avg_delivery_time    0.577487
18                             dashers_per_order    0.081316
19             orderPlaced_to_home_location_time    0.053756
12  estimated_store_to_consumer_driving_duration    0.048899
15                                   day_of_week    0.025624
16                                         month    0.024606
14                                   hour_of_day    0.019779
5                             num_distinct_items    0.018859
3                                    total_items    0.017044
8                          total_onshift_dashers    0.016808



Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [22]:
print("\nModel Performance Metrics:")
print(f"Mean Absolute Error: {results['mae']:.2f}")
print(f"Mean Squared Error: {results['mse']:.2f}")
print(f"Root Mean Squared Error: {results['rmse']:.2f}")
print(f"R-squared Score: {results['r2']:.2f}")

# Display Feature Importance
print("\nTop 10 Most Important Features:")
print(results['feature_importance'].head(10))


Model Performance Metrics:
Mean Absolute Error: 7.80
Mean Squared Error: 103.25
Root Mean Squared Error: 10.16
R-squared Score: 0.50

Top 10 Most Important Features:
                                         feature  importance
20                             Avg_delivery_time    0.577487
18                             dashers_per_order    0.081316
19             orderPlaced_to_home_location_time    0.053756
12  estimated_store_to_consumer_driving_duration    0.048899
15                                   day_of_week    0.025624
16                                         month    0.024606
14                                   hour_of_day    0.019779
5                             num_distinct_items    0.018859
3                                    total_items    0.017044
8                          total_onshift_dashers    0.016808


---------------------------------------------------
Model Inference
--------------------------

In [23]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from datetime import datetime, timedelta


# Function to handle the prediction code and return the results in minutes format in which the model has been trained for delivery time
# Finally chaning it to date and time format

def predict_delivery_duration(model, X_test, y_test):
    
    # Scaling the features
    scaler = StandardScaler()
    X_test_scaled = scaler.fit_transform(X_test)
    
    # Selecting a random row for inference each time from the test data
    random_index = np.random.randint(0, len(X_test_scaled))
    sample_feature = X_test_scaled[random_index].reshape(1, -1)
    
    # Predicting delivery duration
    
    predicted_duration = float(model.predict(sample_feature)[0])
    
    # order datetime
    order_datetime = datetime.now()
    
    # Calculating the delivery datetime
    delivery_datetime = order_datetime + timedelta(minutes=predicted_duration)
    
    return {
        'selected_row_index': random_index,
        'order_datetime': order_datetime,
        'actual_delivery_duration': y_test.iloc[random_index],
        'predicted_delivery_duration': predicted_duration,
        'estimated_delivery_datetime': delivery_datetime
    }

# Main inference function
def main():
    # Select features and target from the original dataset
    features = [
        'market_id', 'store_id', 'order_protocol', 'total_items', 'subtotal', 'num_distinct_items', 'min_item_price', 'max_item_price',
        'total_onshift_dashers', 'total_busy_dashers', 'total_outstanding_orders', 'estimated_order_place_duration', 
        'estimated_store_to_consumer_driving_duration','price_range', 'hour_of_day', 'day_of_week', 'month', 
        'store_primary_category_encoded', 'dashers_per_order', 'orderPlaced_to_home_location_time', 'Avg_delivery_time', 
        'Delivery_Speed', 'Avg_price'
    ]
    
    X = data_cleaned[features]
    y = data_cleaned['delivery_duration_minutes']
    
    # Data is split into training and testing parts where 20% is for test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Using our XGBoost trained model 
    model = results['model']
    
    inference_results = predict_delivery_duration(model, X_test, y_test)
    
    print("Selected Row Index:", inference_results['selected_row_index'])
    print("Order Datetime:", inference_results['order_datetime'].strftime('%Y-%m-%d %H:%M:%S'))
    print("Actual Delivery Duration (minutes):", inference_results['actual_delivery_duration'])
    print("Predicted Delivery Duration (minutes):", inference_results['predicted_delivery_duration'])
    print("Estimated Delivery Datetime:", inference_results['estimated_delivery_datetime'].strftime('%Y-%m-%d %H:%M:%S'))

main()

Selected Row Index: 28436
Order Datetime: 2024-12-11 16:58:16
Actual Delivery Duration (minutes): 31.966666666666665
Predicted Delivery Duration (minutes): 37.953495025634766
Estimated Delivery Datetime: 2024-12-11 17:36:13


-------------------------------------
Model Selection
------------------------

In [24]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tqdm import tqdm
import time

# Dataloader for our training dataset
class DeliveryDataset(Dataset):
    def __init__(self, features, target):
        self.features = features
        self.target = target
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return torch.tensor(self.features[idx], dtype=torch.float32), torch.tensor(self.target[idx], dtype=torch.float32)

# Function to select features
def select_features(data):
    features = [
        'market_id', 'store_id', 'order_protocol', 'total_items', 'subtotal',
        'num_distinct_items', 'min_item_price', 'max_item_price', 'total_onshift_dashers',
        'total_busy_dashers', 'total_outstanding_orders', 'estimated_order_place_duration',
        'estimated_store_to_consumer_driving_duration', 'price_range', 'hour_of_day', 
        'day_of_week', 'month', 'store_primary_category_encoded', 'dashers_per_order', 
        'orderPlaced_to_home_location_time', 'Avg_delivery_time', 'Delivery_Speed', 'Avg_price'
    ]
    
    X = data[features]
    y = data['delivery_duration_minutes']
    
    return X, y

# Function to train and evaluate different models
def train_and_evaluate_model(X, y, model_type):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Feature Scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Model selection
    if model_type == "xgboost":
        model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42, tree_method='gpu_hist')
    elif model_type == "lightgbm":
        model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    elif model_type == "linear":
        model = LinearRegression()
    elif model_type == "random_forest":
        model = RandomForestRegressor(n_estimators=100, random_state=42)
    elif model_type == "gradient_boosting":
        model = GradientBoostingRegressor(n_estimators=100, random_state=42)
    
    # Model training
    start_time = time.time()
    model.fit(X_train_scaled, y_train)
    
    elapsed_time = time.time() - start_time
    tqdm.write(f"Training Time for {model_type}: {elapsed_time:.2f}s")
    
    y_pred = model.predict(X_test_scaled)
    
    # Evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    return {
        'model': model,
        'mae': mae,
        'mse': mse,
        'rmse': rmse,
        'r2': r2
    }


def main(df):
    
    # Selecting Features and Target label for our data
    X, y = select_features(df)

    # XGBoost Model
    print("Evaluating XGBoost Model...")
    results_xgboost = train_and_evaluate_model(X, y, "xgboost")
    print_results("XGBoost", results_xgboost)

    # LightGBM Model
    print("\nEvaluating LightGBM Model...")
    results_lgb = train_and_evaluate_model(X, y, "lightgbm")
    print_results("LightGBM", results_lgb)

    # Linear Regression Model
    print("\nEvaluating Linear Regression Model...")
    results_lr = train_and_evaluate_model(X, y, "linear")
    print_results("Linear Regression", results_lr)

    # Random Forest Regressor Model
    print("\nEvaluating Random Forest Regressor Model...")
    results_rf = train_and_evaluate_model(X, y, "random_forest")
    print_results("Random Forest", results_rf)

    # Gradient Boosting Regressor Model
    print("\nEvaluating Gradient Boosting Regressor Model...")
    results_gb = train_and_evaluate_model(X, y, "gradient_boosting")
    print_results("Gradient Boosting", results_gb)

    return results_xgboost, results_lgb, results_lr, results_rf, results_gb


def print_results(model_name, results):
    print(f"\n{model_name} Performance Metrics:")
    print(f"Mean Absolute Error: {results['mae']:.2f}")
    print(f"Mean Squared Error: {results['mse']:.2f}")
    print(f"Root Mean Squared Error: {results['rmse']:.2f}")
    print(f"R-squared Score: {results['r2']:.2f}")


results = main(data_cleaned)


Evaluating XGBoost Model...



    E.g. tree_method = "hist", device = "cuda"



Training Time for xgboost: 0.56s

XGBoost Performance Metrics:
Mean Absolute Error: 7.80
Mean Squared Error: 103.25
Root Mean Squared Error: 10.16
R-squared Score: 0.50

Evaluating LightGBM Model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004986 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3562
[LightGBM] [Info] Number of data points in the train set: 124844, number of used features: 23
[LightGBM] [Info] Start training from score 44.961240
Training Time for lightgbm: 1.08s

LightGBM Performance Metrics:
Mean Absolute Error: 7.80
Mean Squared Error: 103.10
Root Mean Squared Error: 10.15
R-squared Score: 0.50

Evaluating Linear Regression Model...
Training Time for linear: 0.14s

Linear Regression Performance Metrics:
Mean Absolute Error: 8.10
Mean Squared Error: 110.70
Root Mean Squared Error: 10.52
R-squared Score: 0.47

Eval

-------------------------------------
XGBoost and LightGBM have almost identical performance but XGBoost Performed much better among all the models in terms of training time and evaluation metrics
----------------------

--------------------------
Model Inference through XGBoost is presented above the Model Selection code
---------------------------