In [2]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder
from sklearn.model_selection import train_test_split

In [3]:
pip install dagshub mlflow

Collecting dagshub
  Downloading dagshub-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1.6.0 (from dagshub)
  Downloading dacite-1.6.0-py3-none-any.whl.metadata (14 kB)
Collecting gql[requests] (from dagshub)
  Downloading gql-3.5.2-py2.py3-none-any.whl.metadata (9.4 kB)
Collecting dataclasses-json (from dagshub)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting treelib>=1.6.4 (from dagshub)
  Downloading treelib-1.7.1-py3-none-any.whl.metadata (1.4 kB)
Collecting pathvalidate>=3.0.0 (from dagshub)
  Downloading pathvalidate-3.2.3-py3-none-any.whl.metadata (12 kB)
Collecting boto3 (from dagshub)
  Downloading boto3-1.38.4-py3-none-any.whl.metadata (6.6 kB)
Collecting semver (from dagshub)
  Downloading semver-3.0.4-py3-none-any.whl.metadata (6.8 kB)
Collect

In [4]:
import dagshub
dagshub.init(repo_owner='speedyskill', repo_name='swiggy-delivery-time-prediction', mlflow=True)

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=98e700b3-bbca-47a8-b307-f4c2ab658953&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=53d4d996fad4b9c58d90c8c3e5d688ed20c2b88b63d174e90bf7a9a94ad179ce




In [5]:
import mlflow
mlflow.set_tracking_uri('https://dagshub.com/speedyskill/swiggy-delivery-time-prediction.mlflow')

In [46]:
mlflow.set_experiment('Exp 7 - Stacking Regressor')



<Experiment: artifact_location='mlflow-artifacts:/122a8a9972304b0d9b2ee7307ea888b2', creation_time=1745931689349, experiment_id='7', last_update_time=1745939475802, lifecycle_stage='active', name='Exp 7 - Stacking Regressor', tags={}>

In [8]:
pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.3.0


In [9]:
df=pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,rider_id,age,ratings,restaurant_latitude,restaurant_longitude,delivery_latitude,delivery_longitude,order_date,weather,traffic,...,city_name,order_day,order_month,order_day_of_week,is_weekend,order_time_hour,order_time_of_day,pickup_time,distance,distance_type
0,INDORES13DEL02,37.0,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,sunny,high,...,INDO,19,3,saturday,1,11.0,morning,15.0,3.025149,short
1,BANGRES18DEL02,34.0,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,stormy,jam,...,BANG,25,3,friday,0,19.0,evening,5.0,20.18353,very_long
2,BANGRES19DEL01,23.0,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,sandstorms,low,...,BANG,19,3,saturday,1,8.0,morning,15.0,1.552758,short
3,COIMBRES13DEL02,38.0,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,sunny,medium,...,COIMB,5,4,tuesday,0,18.0,evening,10.0,7.790401,medium
4,CHENRES12DEL01,32.0,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,cloudy,high,...,CHEN,26,3,saturday,1,13.0,afternoon,15.0,6.210138,medium


In [10]:
# drop columns not required for model input

columns_to_drop =  ['rider_id',
                    'restaurant_latitude',
                    'restaurant_longitude',
                    'delivery_latitude',
                    'delivery_longitude',
                    'order_date',
                    "order_time_hour",
                    "order_day",
                    "city_name",
                    "order_day_of_week",
                    "order_month"]

df.drop(columns=columns_to_drop, inplace=True)

df

Unnamed: 0,age,ratings,weather,traffic,vehicle_condition,type_of_order,type_of_vehicle,multiple_deliveries,festival,city_type,time_taken,is_weekend,order_time_of_day,pickup_time,distance,distance_type
0,37.0,4.9,sunny,high,2,snack,motorcycle,0.0,no,urban,24,1,morning,15.0,3.025149,short
1,34.0,4.5,stormy,jam,2,snack,scooter,1.0,no,metropolitian,33,0,evening,5.0,20.183530,very_long
2,23.0,4.4,sandstorms,low,0,drinks,motorcycle,1.0,no,urban,26,1,morning,15.0,1.552758,short
3,38.0,4.7,sunny,medium,0,buffet,motorcycle,1.0,no,metropolitian,21,0,evening,10.0,7.790401,medium
4,32.0,4.6,cloudy,high,1,snack,scooter,1.0,no,metropolitian,30,1,afternoon,15.0,6.210138,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45497,30.0,4.8,windy,high,1,meal,motorcycle,0.0,no,metropolitian,32,0,morning,10.0,1.489846,short
45498,21.0,4.6,windy,jam,0,buffet,motorcycle,1.0,no,metropolitian,36,0,evening,15.0,,
45499,30.0,4.9,cloudy,low,1,drinks,scooter,0.0,no,metropolitian,16,0,night,15.0,4.657195,short
45500,20.0,4.7,cloudy,high,0,snack,motorcycle,1.0,no,metropolitian,26,0,afternoon,5.0,6.232393,medium


In [11]:
temp_df=df.dropna().copy()

In [12]:
X=temp_df.drop(columns='time_taken')
y=temp_df['time_taken']

In [13]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [18]:
num_cols = ["age","ratings","pickup_time","distance"]

nominal_cat_cols = ['weather',
                    'type_of_order',
                    'type_of_vehicle',
                    "festival",
                    "city_type",
                    "is_weekend",
                    "order_time_of_day"]

ordinal_cat_cols = ["traffic","distance_type"]

In [19]:
# generate order for ordinal encoding

traffic_order = ["low","medium","high","jam"]

distance_type_order = ["short","medium","long","very_long"]

In [20]:
preprocessor=ColumnTransformer(transformers=[
    ('scale',MinMaxScaler(),num_cols),
    ('nominal_encode',OneHotEncoder(drop='first',handle_unknown='ignore',sparse_output=False),nominal_cat_cols),
    ('ordinal_encode',OrdinalEncoder(categories=[traffic_order,distance_type_order]),ordinal_cat_cols)
],remainder='passthrough',force_int_remainder_cols=False,n_jobs=-1,verbose_feature_names_out=False)

preprocessor.set_output(transform='pandas')

In [21]:
X_train_trans=preprocessor.fit_transform(X_train)
X_test_trans=preprocessor.transform(X_test)

In [23]:
pt=PowerTransformer(method='yeo-johnson')
y_train_pt=pt.fit_transform(y_train.values.reshape(-1,1))
y_test_pt=pt.transform(y_test.values.reshape(-1,1))

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import StackingRegressor

In [28]:
import optuna

In [25]:
# build the best model

best_lgbm_params= {
    'n_estimators' : 105,
    'max_depth' : 39,
    'learning_rate' : 0.23688131888261493,
    'subsample' : 0.5173176053180517,
    'min_child_weight' : 12,
    'min_split_gain' : 0.0007351518634197696,
    'reg_lambda' : 1.012414361562786
}



best_rf_params= {
    'n_estimators' : 342,
    'max_depth' : 15,
    'max_features' : None,
    'min_samples_split' : 7,
    'min_samples_leaf' : 6,
    'max_samples' : 0.70279720384227
}


best_lgbm=LGBMRegressor(**best_lgbm_params)
best_rf=RandomForestRegressor(**best_rf_params)

In [30]:
def objective(trial):

  with mlflow.start_run(nested=True):


    meta_model_name = trial.suggest_categorical('model',['LR','KNN','DT'])

    if meta_model_name == 'LR':
      meta = LinearRegression()

    elif meta_model_name == 'KNN':
      n_neighbours_knn = trial.suggest_int('n_neighbours_knn',1,15)
      weights_knn = trial.suggest_categorical('weights_knn',['uniform','distance'])
      meta = KNeighborsRegressor(n_neighbors=n_neighbours_knn,weights=weights_knn,n_jobs=-1)

    elif meta_model_name == 'DT':
      max_depth_dt = trial.suggest_int('max_depth_dt',1,10)
      min_samples_split_dt = trial.suggest_int('min_samples_split_dt',2,10)
      min_samples_leaf_dt = trial.suggest_int('min_samples_leaf_dt',1,10)
      meta = DecisionTreeRegressor(max_depth=max_depth_dt,min_samples_split=min_samples_split_dt,
                                 min_samples_leaf=min_samples_leaf_dt,random_state=42)

    # log model name
    mlflow.log_param('meta_model_name',meta_model_name)

    # stacking Regressor
    stacking_reg=StackingRegressor(estimators=[('best_lgbm',best_lgbm),
                                             ('best_rf',best_rf)],
                                   final_estimator=meta,n_jobs=-1,cv=5)


    model=TransformedTargetRegressor(regressor=stacking_reg,transformer=pt)

    model.fit(X_train_trans,y_train)

    y_pred_test=model.predict(X_test_trans)

    error=mean_absolute_error(y_test,y_pred_test)

    # log error
    mlflow.log_metric('MAE',error)

    return error



In [31]:
# create optuna study

study=optuna.create_study(direction='minimize')

with mlflow.start_run(run_name='best_model'):
  # optimize the objective function
  study.optimize(objective,n_trials=20,n_jobs=-1,show_progress_bar=True)

  # log the best parameter
  mlflow.log_params(study.best_params)

  # log the best score
  mlflow.log_metric('best_score',study.best_value)


[I 2025-04-29 14:15:33,209] A new study created in memory with name: no-name-267e1a5a-e563-4698-a36a-adb2f807b3ea


  0%|          | 0/20 [00:00<?, ?it/s]

🏃 View run dashing-worm-988 at: https://dagshub.com/speedyskill/swiggy-delivery-time-prediction.mlflow/#/experiments/7/runs/dcee189ad272403785bc95ab12fa87e3
🧪 View experiment at: https://dagshub.com/speedyskill/swiggy-delivery-time-prediction.mlflow/#/experiments/7
[I 2025-04-29 14:19:52,036] Trial 1 finished with value: 3.0642595352825905 and parameters: {'model': 'DT', 'max_depth_dt': 4, 'min_samples_split_dt': 6, 'min_samples_leaf_dt': 1}. Best is trial 1 with value: 3.0642595352825905.
🏃 View run carefree-squid-309 at: https://dagshub.com/speedyskill/swiggy-delivery-time-prediction.mlflow/#/experiments/7/runs/a42f417dc8604259ace29075bb955f92
🧪 View experiment at: https://dagshub.com/speedyskill/swiggy-delivery-time-prediction.mlflow/#/experiments/7
[I 2025-04-29 14:19:54,910] Trial 0 finished with value: 3.5965978163712786 and parameters: {'model': 'KNN', 'n_neighbours_knn': 2, 'weights_knn': 'distance'}. Best is trial 1 with value: 3.0642595352825905.
🏃 View run painted-hog-821 at

In [32]:
study.best_params

{'model': 'LR'}

In [34]:
study.trials_dataframe().columns

Index(['number', 'value', 'datetime_start', 'datetime_complete', 'duration',
       'params_max_depth_dt', 'params_min_samples_leaf_dt',
       'params_min_samples_split_dt', 'params_model',
       'params_n_neighbours_knn', 'params_weights_knn', 'state'],
      dtype='object')

In [38]:
study.trials_dataframe()['params_model'].value_counts()

Unnamed: 0_level_0,count
params_model,Unnamed: 1_level_1
LR,12
KNN,5
DT,3


In [41]:
study.trials_dataframe().groupby('params_model')['value'].mean().sort_values()

Unnamed: 0_level_0,value
params_model,Unnamed: 1_level_1
LR,3.011885
DT,3.056011
KNN,3.227478


In [42]:
optuna.visualization.plot_optimization_history(study)

In [45]:
optuna.visualization.plot_parallel_coordinate(study,params=['model'])