In [106]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder
from sklearn.model_selection import train_test_split

In [107]:
pip install dagshub mlflow




In [108]:
import dagshub

# Initialise DagsHub for this repository with MLflow integration enabled;
# the runs logged later in this notebook appear under this repo on dagshub.com.
dagshub.init(
    repo_owner="speedyskill",
    repo_name="swiggy-delivery-time-prediction",
    mlflow=True,
)


In [109]:
import mlflow

In [113]:
# Select the MLflow experiment that all runs below are logged to
# (MLflow creates it on first use if it does not exist yet).
mlflow.set_experiment("Exp 5 - LGBM HP Tuning")

2025/04/28 19:41:38 INFO mlflow.tracking.fluent: Experiment with name 'Exp 5 - LGBM HP Tuning' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/8244edd0dae747d88da6dadff64b8cc0', creation_time=1745869298081, experiment_id='4', last_update_time=1745869298081, lifecycle_stage='active', name='Exp 5 - LGBM HP Tuning', tags={}>

In [114]:
# Load the cleaned dataset.
# NOTE(review): hard-coded absolute path (looks like a Colab working directory);
# consider a configurable data directory if this runs anywhere else.
df = pd.read_csv("/content/cleaned_data.csv")

In [115]:
# drop columns not required for model input
columns_to_drop = [
    "rider_id",
    "restaurant_latitude",
    "restaurant_longitude",
    "delivery_latitude",
    "delivery_longitude",
    "order_date",
    "order_time_hour",
    "order_day",
    "city_name",
    "order_day_of_week",
    "order_month",
]

# Rebind instead of dropping in place, and tolerate already-removed columns,
# so that re-running this cell does not raise KeyError on the second execution.
df = df.drop(columns=columns_to_drop, errors="ignore")

df

Unnamed: 0,age,ratings,weather,traffic,vehicle_condition,type_of_order,type_of_vehicle,multiple_deliveries,festival,city_type,time_taken,is_weekend,order_time_of_day,pickup_time,distance,distance_type
0,37.0,4.9,sunny,high,2,snack,motorcycle,0.0,no,urban,24,1,morning,15.0,3.025149,short
1,34.0,4.5,stormy,jam,2,snack,scooter,1.0,no,metropolitian,33,0,evening,5.0,20.183530,very_long
2,23.0,4.4,sandstorms,low,0,drinks,motorcycle,1.0,no,urban,26,1,morning,15.0,1.552758,short
3,38.0,4.7,sunny,medium,0,buffet,motorcycle,1.0,no,metropolitian,21,0,evening,10.0,7.790401,medium
4,32.0,4.6,cloudy,high,1,snack,scooter,1.0,no,metropolitian,30,1,afternoon,15.0,6.210138,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45497,30.0,4.8,windy,high,1,meal,motorcycle,0.0,no,metropolitian,32,0,morning,10.0,1.489846,short
45498,21.0,4.6,windy,jam,0,buffet,motorcycle,1.0,no,metropolitian,36,0,evening,15.0,,
45499,30.0,4.9,cloudy,low,1,drinks,scooter,0.0,no,metropolitian,16,0,night,15.0,4.657195,short
45500,20.0,4.7,cloudy,high,0,snack,motorcycle,1.0,no,metropolitian,26,0,afternoon,5.0,6.232393,medium


In [116]:
# Keep only fully populated rows; work on an independent copy of the result.
temp_df = df.dropna(axis=0, how="any").copy()

In [117]:
# Numeric features that are rescaled by the preprocessor.
num_cols = [
    "age",
    "ratings",
    "pickup_time",
    "distance",
]

# Unordered categorical features that are one-hot encoded.
nominal_cat_cols = [
    "weather",
    "type_of_order",
    "type_of_vehicle",
    "festival",
    "city_type",
    "is_weekend",
    "order_time_of_day",
]

# Ordered categorical features that are ordinal encoded.
ordinal_cat_cols = [
    "traffic",
    "distance_type",
]

In [118]:
# Category orderings (lowest to highest) passed to the ordinal encoder.
traffic_order = [
    "low",
    "medium",
    "high",
    "jam",
]

distance_type_order = [
    "short",
    "medium",
    "long",
    "very_long",
]

In [119]:
# Separate the target column from the feature matrix.
y = temp_df["time_taken"]
X = temp_df.drop(columns=["time_taken"])

In [120]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [121]:
# build a preprocessor
# - numeric columns are min-max scaled
# - nominal columns are one-hot encoded (first level dropped, unseen levels ignored)
# - ordinal columns are integer encoded using the explicit category orders;
#   unknown categories map to -1 and missing values to -999
# - every other column is passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        (
            "scale",
            MinMaxScaler(),
            num_cols,
        ),
        (
            "nominal_encode",
            OneHotEncoder(
                drop="first",
                handle_unknown="ignore",
                sparse_output=False,
            ),
            nominal_cat_cols,
        ),
        (
            "ordinal_encode",
            OrdinalEncoder(
                categories=[traffic_order, distance_type_order],
                encoded_missing_value=-999,
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
            ordinal_cat_cols,
        ),
    ],
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
    verbose_feature_names_out=False,
)


In [122]:
# Have the preprocessor return pandas DataFrames instead of NumPy arrays.
preprocessor.set_output(transform="pandas")

In [123]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformer to the test split.
X_train_trans = preprocessor.fit_transform(X_train)
X_test_trans = preprocessor.transform(X_test)

In [124]:
# Yeo-Johnson power transform applied to the target.
pt = PowerTransformer(method="yeo-johnson")

In [125]:
# The transformer expects 2-D input, so the target is reshaped to one column.
# It is fitted on the training target and then reused for the test target.
y_train_pt = pt.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_test_pt = pt.transform(y_test.to_numpy().reshape(-1, 1))

In [126]:
pip install optuna



In [127]:
from lightgbm import LGBMRegressor
import optuna

In [128]:
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor

In [129]:
def objective(trial):
    """Optuna objective: 5-fold cross-validated MAE of one LightGBM configuration.

    Samples a hyperparameter set from ``trial``, logs it to a nested MLflow
    run, and scores a ``TransformedTargetRegressor`` (LightGBM wrapped with the
    notebook-level power transformer ``pt``) via ``cross_val_score`` on the
    notebook-level ``X_train_trans`` / ``y_train``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean absolute error averaged over the 5 folds (lower is better).

    Notes:
        * No separate fit / train / test prediction is done here:
          ``cross_val_score`` fits its own clones of the estimator, so an extra
          fit would only cost time, and the test split is kept out of tuning.
        * NOTE(review): per the LightGBM docs, ``subsample`` only takes effect
          when ``subsample_freq`` > 0 (default 0), so this search dimension is
          probably inert as written. Enabling it would also require passing
          ``subsample_freq`` when the final model is rebuilt from
          ``study.best_params``.
        * NOTE(review): the study runs trials with ``n_jobs=-1`` (threads). If
          MLflow tracks the active run per thread, ``nested=True`` may not
          attach these runs to the parent run. Verify in the tracking UI.
    """
    with mlflow.start_run(nested=True):
        params = {
            "n_estimators": trial.suggest_int("n_estimators",10,200),
            "max_depth": trial.suggest_int("max_depth",1,40),
            "learning_rate": trial.suggest_float("learning_rate",0.1,0.8),
            "subsample": trial.suggest_float("subsample",0.5,1),
            "min_child_weight": trial.suggest_int("min_child_weight",5,20),
            "min_split_gain": trial.suggest_float("min_split_gain",0,10),
            "reg_lambda": trial.suggest_float("reg_lambda",0,100),
            "random_state": 42,
            "n_jobs": -1,
        }

        # log model parameters
        mlflow.log_params(params)

        lgbm_reg = LGBMRegressor(**params)
        # The wrapper fits on the transformed target and inverts predictions,
        # so the MAE below is measured on the original target scale.
        model = TransformedTargetRegressor(regressor=lgbm_reg,transformer=pt)

        # perform cross validation
        cv_score = cross_val_score(model,
                                X_train_trans,
                                y_train,
                                cv=5,
                                scoring="neg_mean_absolute_error",
                                n_jobs=-1)

        # the scorer returns negated MAE; flip the sign to get the mean error
        mean_score = -(cv_score.mean())
        # log avg cross val error
        mlflow.log_metric("cross_val_error",mean_score)

        return mean_score

In [130]:
# create optuna study
study = optuna.create_study(direction="minimize")

with mlflow.start_run(run_name="best_model"):
    # optimize the objective function
    study.optimize(objective,n_trials=50,n_jobs=-1,show_progress_bar=True)

    # log the best parameters
    mlflow.log_params(study.best_params)

    # log the best score
    mlflow.log_metric("best_score",study.best_value)

    # train the model on best parameters
    best_lgbm = LGBMRegressor(**study.best_params)

    best_lgbm.fit(X_train_trans,y_train_pt.ravel())

    # get the predictions
    y_pred_train = best_lgbm.predict(X_train_trans)
    y_pred_test = best_lgbm.predict(X_test_trans)

    # get the actual predictions values
    y_pred_train_actual = pt.inverse_transform(y_pred_train.reshape(-1,1))
    y_pred_test_actual = pt.inverse_transform(y_pred_test.reshape(-1,1))


    # perform cross validation
    model = TransformedTargetRegressor(regressor=best_lgbm,
                                        transformer=pt)


    scores = cross_val_score(model,
                         X_train_trans,
                         y_train,
                         scoring="neg_mean_absolute_error",
                         cv=5,n_jobs=-1)

    # log metrics
    mlflow.log_metric("training_error",mean_absolute_error(y_train,y_pred_train_actual))
    mlflow.log_metric("test_error",mean_absolute_error(y_test,y_pred_test_actual))
    mlflow.log_metric("training_r2",r2_score(y_train,y_pred_train_actual))
    mlflow.log_metric("test_r2",r2_score(y_test,y_pred_test_actual))
    mlflow.log_metric("cross_val",- scores.mean())

    # log the best model
    mlflow.sklearn.log_model(best_lgbm,artifact_path="model")

[I 2025-04-28 19:42:15,250] A new study created in memory with name: no-name-26ed3e99-ffdf-439d-8225-970ec96d52ea


  0%|          | 0/50 [00:00<?, ?it/s]



🏃 View run illustrious-moth-737 at: https://dagshub.com/speedyskill/swiggy-delivery-time-prediction.mlflow/#/experiments/4/runs/a2ba9c92a4cf48b593eca0498e2b6e93
🧪 View experiment at: https://dagshub.com/speedyskill/swiggy-delivery-time-prediction.mlflow/#/experiments/4
🏃 View run gifted-gnu-464 at: https://dagshub.com/speedyskill/swiggy-delivery-time-prediction.mlflow/#/experiments/4/runs/f387890427354b35907d4d9f713b23b0
🧪 View experiment at: https://dagshub.com/speedyskill/swiggy-delivery-time-prediction.mlflow/#/experiments/4
[I 2025-04-28 19:42:32,400] Trial 0 finished with value: 3.602799018434375 and parameters: {'n_estimators': 45, 'max_depth': 23, 'learning_rate': 0.270332469532703, 'subsample': 0.9539095490660229, 'min_child_weight': 19, 'min_split_gain': 4.398207580864261, 'reg_lambda': 89.70579134009331}. Best is trial 0 with value: 3.602799018434375.
[I 2025-04-28 19:42:32,431] Trial 1 finished with value: 3.955350733706273 and parameters: {'n_estimators': 22, 'max_depth': 4



🏃 View run best_model at: https://dagshub.com/speedyskill/swiggy-delivery-time-prediction.mlflow/#/experiments/4/runs/d28c55f70d944d6280538fba15a79da1
🧪 View experiment at: https://dagshub.com/speedyskill/swiggy-delivery-time-prediction.mlflow/#/experiments/4
