In [1]:
import copy
import time

import numpy as np
import pandas as pd

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor

from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import make_scorer, root_mean_squared_error


In [2]:
# Read the feature-engineered dataset and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../data/processed/df_after_feature_engineering.csv')
df.head(n=5)

Unnamed: 0,cuisine_category_alcohol-plus-food,cuisine_category_chocolate,cuisine_category_belgian,cuisine_category_lebanese,cuisine_category_russian,cuisine_category_african,cuisine_category_european,cuisine_category_gluten-free,cuisine_category_comfort-food,cuisine_category_moroccan,...,cuisine_category_japanese,total_items,cuisine_category_mexican,cuisine_category_pizza,cuisine_category_american,avg_price_per_item,busy_dashers_ratio,estimated_store_to_consumer_driving_duration,estimated_order_place_duration,total_delivery_duration
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,1.0,860.25,0.424242,861.0,446.0,3779.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1900.0,2.0,690.0,446.0,4024.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,1192.75,0.75,289.0,446.0,1586.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1525.0,1.2,795.0,446.0,2273.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,1810.0,1.0,205.0,446.0,2988.0


In [3]:
# Separate the prediction target from the remaining feature columns.
y = df['total_delivery_duration']
X = df.drop(columns=['total_delivery_duration'])

In [4]:
# RMSE function for inverse scaling
def rmse_inverse(y_true, y_pred, y_scaler):
    """
    Computes RMSE after mapping scaled targets back to their original units.

    Args:
    - y_true: Scaled ground-truth target values (array-like, 1-D or single column)
    - y_pred: Scaled predicted target values (array-like, 1-D or single column)
    - y_scaler: Fitted target scaler exposing `inverse_transform`

    Returns:
    - RMSE between the inverse-transformed true and predicted values
    """
    # np.asarray accepts pandas Series and lists as well as ndarrays; calling
    # .reshape directly on the input only works for ndarrays.
    y_true_orig = y_scaler.inverse_transform(np.asarray(y_true).reshape(-1, 1))
    y_pred_orig = y_scaler.inverse_transform(np.asarray(y_pred).reshape(-1, 1))
    return root_mean_squared_error(y_true_orig, y_pred_orig)

In [5]:
# Regression function
def create_regression(X, y, model, model_name, scaler_name, y_scaler=None, verbose=True):
    """
    Evaluates a regression model with 5-fold cross-validation and reports RMSE.

    When a target scaler is supplied (and `scaler_name` is not 'Without Scale'),
    each fold's RMSE is computed after inverse-transforming the targets, so the
    score is expressed in the target's original units.

    Args:
    - X: Features (scaled or unscaled)
    - y: Target variable (scaled or unscaled)
    - model: Regression model to use
    - model_name: Name of the model (kept for interface compatibility; not used here)
    - scaler_name: Name of the scaler used; 'Without Scale' selects plain RMSE
    - y_scaler: Target scaler for inverse scaling RMSE (if applicable)
    - verbose: Whether to print the average RMSE and elapsed time

    Returns:
    - avg_rmse: Mean RMSE across the folds (positive value)
    - elapsed_time: Seconds taken for cross-validation, rounded to 4 decimals
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    if scaler_name != 'Without Scale' and y_scaler is not None:
        # Score in original units by undoing the target scaling first.
        def rmse_metric(y_true, y_pred):
            return rmse_inverse(y_true, y_pred, y_scaler)
    else:
        # Data is already in original units; use RMSE directly.
        rmse_metric = root_mean_squared_error

    # greater_is_better=False makes the scorer return negated RMSE values.
    rmse_scorer = make_scorer(rmse_metric, greater_is_better=False)

    # Perform cross-validation
    neg_rmse_scores = cross_val_score(model, X, y, cv=kf, scoring=rmse_scorer)
    elapsed_time = time.perf_counter() - start_time

    # Flip the sign back so the reported RMSE is positive.
    avg_rmse = -np.mean(neg_rmse_scores)

    if verbose:
        print(f'Average RMSE : {avg_rmse:.4f} - Time taken: {elapsed_time:.4f} seconds')

    return avg_rmse, round(elapsed_time, 4)

In [6]:
# Scaling function
def scale(scaler, X, y):
    """
    Scales features and target variable using independent copies of the given scaler.

    The scaler passed in is treated as a template and is never fitted itself:
    one copy is fitted on X and a separate copy on y. Sharing a single instance
    would let the fit on y overwrite the fit on X and would mutate the caller's
    scaler object.

    Args:
    - scaler: Instance of sklearn scaler (e.g., MinMaxScaler, StandardScaler)
    - X: Features
    - y: Target variable (pandas Series or 1-D array-like)

    Returns:
    - X_scaled: Scaled features
    - y_scaled: Scaled target variable as a single-column 2-D array
    - y_scaler: Fitted scaler instance for the target variable
    """
    X_scaler = copy.deepcopy(scaler)
    y_scaler = copy.deepcopy(scaler)

    X_scaled = X_scaler.fit_transform(X)
    # Scalers expect 2-D input; np.asarray handles both Series and ndarrays.
    y_scaled = y_scaler.fit_transform(np.asarray(y).reshape(-1, 1))

    return X_scaled, y_scaled, y_scaler

In [7]:
# Data structures to store results (one list entry per evaluated combination)
pred_dict = {
    "regression_model": [],
    "feature_set": [],
    "scaler_name": [],
    "RMSE": [],
    "time_taken": []
}

# Regression models
# Every estimator is seeded so that repeated runs give the same scores; the
# value matches the seed used for the KFold splitter in create_regression.
regression_models = {
    "AdaBoost": AdaBoostRegressor(random_state=42),
    "DecisionTree": DecisionTreeRegressor(max_depth=6, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
    "LGBM": LGBMRegressor(random_state=42)
}

# Feature sets (name -> list of feature columns to train on)
feature_sets = {
    "All Features": X.columns.to_list()
}

# Scalers (None means the data is used without scaling)
scalers = {
    "Standard Scaler": StandardScaler(),
    "Min-Max Scaler": MinMaxScaler(),
    "Without Scale": None
}

In [8]:
# Main loop for feature sets, scalers, and regression models

# Start from empty result lists so re-running this cell does not append
# duplicate rows to pred_dict.
for stored_values in pred_dict.values():
    stored_values.clear()

for feature_set_name, features in feature_sets.items():
    # Select features and target variable (invariant across scalers and models)
    X = df[features]
    y = df['total_delivery_duration']

    for scaler_name, scaler in scalers.items():
        # Prepare the inputs once per scaler instead of once per model.
        if scaler_name == 'Without Scale':
            # Unscaled data
            X_input, y_input, y_scaler = X, y, None
        else:
            # Scaled data
            # NOTE: the scalers are fitted on the full dataset before the
            # cross-validation split happens inside create_regression, so the
            # validation folds influence the scaling statistics.
            X_input, y_scaled, y_scaler = scale(scaler, X, y)
            y_input = y_scaled[:, 0]

        for model_name, model in regression_models.items():
            print(f"Included Columns: {feature_set_name} | Scaling Method: {scaler_name} | Algorithm Used: {model_name}")

            # Model-specific adjustments
            if model_name == "LGBM":
                model.set_params(force_col_wise=True)

            avg_rmse_error, time_taken = create_regression(
                X_input, y_input, model, model_name, scaler_name,
                y_scaler=y_scaler, verbose=True
            )

            print('-' * 100)

            # Store results in pred_dict
            pred_dict['regression_model'].append(model_name)
            pred_dict['feature_set'].append(feature_set_name)
            pred_dict['scaler_name'].append(scaler_name)
            pred_dict['RMSE'].append(avg_rmse_error)
            pred_dict['time_taken'].append(time_taken)

Included Columns: All Features | Scaling Method: Standard Scaler | Algorithm Used: AdaBoost
Average RMSE : 3454.5769 - Time taken: 75.7516 seconds
----------------------------------------------------------------------------------------------------
Included Columns: All Features | Scaling Method: Standard Scaler | Algorithm Used: DecisionTree
Average RMSE : 1076.4030 - Time taken: 3.5434 seconds
----------------------------------------------------------------------------------------------------
Included Columns: All Features | Scaling Method: Standard Scaler | Algorithm Used: GradientBoosting
Average RMSE : 1043.9634 - Time taken: 189.9971 seconds
----------------------------------------------------------------------------------------------------
Included Columns: All Features | Scaling Method: Standard Scaler | Algorithm Used: XGBoost
Average RMSE : 1044.2271 - Time taken: 6.9361 seconds
---------------------------------------------------------------------------------------------------

In [13]:
# Rank every evaluated combination from lowest to highest cross-validated RMSE.
pd.DataFrame(pred_dict).sort_values(by='RMSE', ascending=True)

Unnamed: 0,regression_model,feature_set,scaler_name,RMSE,time_taken
4,LGBM,All Features,Standard Scaler,1037.222363,5.8213
14,LGBM,All Features,Without Scale,1037.628511,5.0804
9,LGBM,All Features,Min-Max Scaler,1037.755774,3.6045
2,GradientBoosting,All Features,Standard Scaler,1043.963364,189.9971
7,GradientBoosting,All Features,Min-Max Scaler,1043.969762,182.9733
12,GradientBoosting,All Features,Without Scale,1043.971064,101.4466
13,XGBoost,All Features,Without Scale,1044.22714,6.3346
3,XGBoost,All Features,Standard Scaler,1044.227141,6.9361
8,XGBoost,All Features,Min-Max Scaler,1044.481077,6.3458
11,DecisionTree,All Features,Without Scale,1076.40298,2.9496


In [None]:
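# Disabled experiment: 5-fold CV of a default LGBMRegressor on the dataset
# saved before feature engineering. Uncomment the lines below to run it.
# df_before_feature_eng = pd.read_csv('../data/processed/df_before_feature_eng.csv')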
# df_before_feature_eng.head()
# X = df_before_feature_eng.drop(columns=['total_delivery_duration'])
# y = df_before_feature_eng['total_delivery_duration']

# start_time = time.time()

# kf = KFold(n_splits=5, shuffle=True, random_state=42)
# rmse_scorer = make_scorer(root_mean_squared_error, greater_is_better=False)
# rmse_scores = cross_val_score(LGBMRegressor(), X, y, cv=kf, scoring=rmse_scorer)

# end_time = time.time()
# elapsed_time = end_time - start_time

# print(f'Average RMSE : {-np.mean(rmse_scores):.4f} - Time taken: {elapsed_time:.4f} seconds')