In [60]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, cross_val_predict, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, make_scorer, f1_score, classification_report, confusion_matrix, precision_recall_fscore_support
import tempfile, shutil, joblib

In [45]:
df = pd.read_csv("../data/rides_data_delhi_clean.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,month,day,weekday,hour,booking_id,booking_status,customer_id,vehicle_type,pickup_location,drop_location,avg_vtat,reason_for_cancelling_by_customer,driver_cancellation_reason,incomplete_rides_reason,booking_value,ride_distance,driver_rating,customer_rating,payment_method
0,0,3,23,5,12,"""CNR5884300""",No Driver Found,"""CID1982111""",eBike,Palam Vihar,Jhilmil,0.0,,,,0.0,0.0,-1.0,-1.0,No payment
1,1,11,29,4,18,"""CNR1326809""",Incomplete,"""CID4604802""",Go Sedan,Shastri Nagar,Gurgaon Sector 56,4.9,,,Vehicle Breakdown,237.0,5.73,-1.0,-1.0,UPI
2,2,8,23,4,8,"""CNR8494506""",Completed,"""CID9202816""",Auto,Khandsa,Malviya Nagar,13.4,,,,627.0,13.58,4.9,4.9,Debit Card
3,3,10,21,0,17,"""CNR8906825""",Completed,"""CID2610914""",Premier Sedan,Central Secretariat,Inderlok,13.1,,,,416.0,34.02,4.6,5.0,UPI
4,4,9,16,0,22,"""CNR1950162""",Completed,"""CID9933542""",Bike,Ghitorni Village,Khan Market,5.3,,,,737.0,48.21,4.1,4.3,UPI


In [5]:
# We convert to string type the following columns

df["reason_for_cancelling_by_customer"] = df["reason_for_cancelling_by_customer"].astype("str")
df["driver_cancellation_reason"] = df["driver_cancellation_reason"].astype("str")
df["incomplete_rides_reason"] = df["incomplete_rides_reason"].astype("str")

# Prediction of the booking value

### Is it possible to predict accurately the booking value of a trip ?


In [33]:
# We remove the useless columns for modeling

df = df.drop(["customer_id", "booking_id", "pickup_location", "drop_location"], axis=1)

In [35]:
# To get efficient ML models we first need to scale / normalize numerical data and encode categorical ones

standard_features = ["avg_vtat", "pickup_latitude", "pickup_longitude", "drop_latitude", "drop_longitude"]
robustscale_features = ["ride_distance"]
minmax_features = ["driver_rating", "customer_rating"]
onehot_features = ["month", "weekday", "day","hour", "booking_status", "vehicle_type", "reason_for_cancelling_by_customer",
                  "driver_cancellation_reason", "incomplete_rides_reason", "payment_method"]

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), standard_features),
    ("num2", RobustScaler(), robustscale_features),
    ("num3", MinMaxScaler(), minmax_features),
    ("cat", OneHotEncoder(), onehot_features)])

In [36]:
# Split the data between the target and other variables

X = df.drop("booking_value", axis=1)  
y = df["booking_value"]               


In [37]:
# Split the data between a test and train data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Pipeline creation

model = Pipeline(steps=[
    ("preprocessor", preprocessor),   
    ("regressor", LinearRegression())])

In [39]:
# Model training

model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('num2', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [40]:
# Model predictions

y_pred = model.predict(X_test)

In [41]:
# Model's performance assessment

mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
r2 = r2_score(y_test, y_pred)

print("RMSE: ", rmse, "\nR²:", r2)

print("""\nThe regression models show limited predictive power for booking value (low R², relatively high RMSE). 
This indicates that current variables (distance, ratings, time, vehicle type, etc.) are not sufficient to accurately explain or predict pricing. 
Booking value is likely driven by external factors such as surge pricing, demand fluctuations, or market conditions, 
which are not captured in the dataset.""")

RMSE:  4.256130731227488 
R²: 0.057710450143792835

The regression models show limited predictive power for booking value (low R², relatively high RMSE). 
This indicates that current variables (distance, ratings, time, vehicle type, etc.) are not sufficient to accurately explain or predict pricing. 
Booking value is likely driven by external factors such as surge pricing, demand fluctuations, or market conditions, 
which are not captured in the dataset.


In [42]:
# Let's try to use various other ML models to predict an accurate booking value

# Split target / features
y = df["booking_value"]
X = df.drop(columns=["booking_value"]).copy()

# Scaling / encoding data 
standard_features    = [c for c in ["avg_vtat","pickup_latitude","pickup_longitude","drop_latitude","drop_longitude"] if c in X.columns]
robustscale_features = [c for c in ["ride_distance"] if c in X.columns]
minmax_features      = [c for c in ["driver_rating","customer_rating"] if c in X.columns]
onehot_features      = [c for c in ["month","weekday","day","hour","booking_status","vehicle_type",
                                    "reason_for_cancelling_by_customer","driver_cancellation_reason",
                                    "incomplete_rides_reason","payment_method"] if c in X.columns]

preproc_sparse = ColumnTransformer(transformers=[
        ("num_std",  StandardScaler(),   standard_features),
        ("num_rob",  RobustScaler(),     robustscale_features),
        ("num_min",  MinMaxScaler(),     minmax_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", min_frequency=0.01,sparse_output=True), onehot_features),],remainder="drop",sparse_threshold=1.0)
preproc_dense = ColumnTransformer(transformers=[
        ("num_std",  StandardScaler(),   standard_features),
        ("num_rob",  RobustScaler(),     robustscale_features),
        ("num_min",  MinMaxScaler(),     minmax_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", min_frequency=0.01,sparse_output=False), onehot_features),remainder="drop")

cache_dir = tempfile.mkdtemp()

def make_pipe(model, dense=False):
    pre = preproc_dense if dense else preproc_sparse
    return Pipeline([("pre", pre), ("model", model)], memory=cache_dir)

# ML models
pipelines = {
    "Linear": make_pipe(LinearRegression()),
    "Ridge":  make_pipe(Ridge(alpha=1.0)),
    "Lasso":  make_pipe(Lasso(alpha=0.001, max_iter=5000)),
    "RF":     make_pipe(RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=20)),
    "HGB":    make_pipe(HistGradientBoostingRegressor(
                max_depth=None, learning_rate=0.05, max_bins=255, random_state=20), dense=True),
}

# Assessing the models
rows = []
for name, pipe in pipelines.items():
    cv = cross_validate(
        pipe, X, y,
        cv=3,
        scoring={"rmse": "neg_root_mean_squared_error", "r2": "r2"},
        n_jobs=-1, error_score="raise", return_train_score=False, verbose=0)
    rmse = -cv["test_rmse"].mean()
    r2   =  cv["test_r2"].mean()
    rows.append({"model": name, "RMSE": rmse, "R2": r2})

results = pd.DataFrame(rows).sort_values("RMSE")
print(results)

# Keeping the most accurate model
best_name = results.iloc[0]["model"]
best_pipe = pipelines[best_name].fit(X, y)

print("Best model fitted:", best_name)
joblib.dump(best_pipe, "booking_value_ml_model.joblib", compress=3)
print("Best model saved as booking_value_ml_model.joblib")
shutil.rmtree(cache_dir, ignore_errors=True)

    model      RMSE        R2
2   Lasso  4.218915  0.057138
4     HGB  4.219547  0.056858
1   Ridge  4.219945  0.056677
0  Linear  4.219948  0.056676
3      RF  4.269439  0.034327
Best model fitted: Lasso
Best model saved as booking_value_ml_model.joblib


### **Conclusion**

The models tested all show limited predictive power on booking value, with an RMSE of ~4.2 and R² around 0.05–0.06. This means that less than 10% of the variance in booking value can be explained by the available features.
In practice, booking value appears to be only weakly related to operational variables such as distance, ratings, or booking status. External factors not captured in the dataset (traffic, surge pricing, demand peaks, city events, etc.) are likely stronger drivers of fare variation.

**From a business perspective, this highlights that the current dataset is insufficient for accurate fare prediction. To improve, additional contextual data (traffic, time-based demand, geographic patterns) would need to be integrated.**

# Prediction of the booking status

### Is it possible to predict if a trip will be completed or not based on the current variables ?

In [7]:
df_2 = pd.read_csv("../data/rides_data_clean.csv")
df_2.head(5)

Unnamed: 0.1,Unnamed: 0,month,day,weekday,hour,booking_id,booking_status,customer_id,vehicle_type,pickup_location,drop_location,avg_vtat,reason_for_cancelling_by_customer,driver_cancellation_reason,incomplete_rides_reason,booking_value,ride_distance,driver_rating,customer_rating,payment_method
0,0,3,23,5,12,"""CNR5884300""",No Driver Found,"""CID1982111""",eBike,Palam Vihar,Jhilmil,0.0,,,,0.0,0.0,-1.0,-1.0,No payment
1,1,11,29,4,18,"""CNR1326809""",Incomplete,"""CID4604802""",Go Sedan,Shastri Nagar,Gurgaon Sector 56,4.9,,,Vehicle Breakdown,237.0,5.73,-1.0,-1.0,UPI
2,2,8,23,4,8,"""CNR8494506""",Completed,"""CID9202816""",Auto,Khandsa,Malviya Nagar,13.4,,,,627.0,13.58,4.9,4.9,Debit Card
3,3,10,21,0,17,"""CNR8906825""",Completed,"""CID2610914""",Premier Sedan,Central Secretariat,Inderlok,13.1,,,,416.0,34.02,4.6,5.0,UPI
4,4,9,16,0,22,"""CNR1950162""",Completed,"""CID9933542""",Bike,Ghitorni Village,Khan Market,5.3,,,,737.0,48.21,4.1,4.3,UPI


In [11]:
# We remove the useless columns for modeling (or the data that may leak)

df_2 = df_2.drop(["driver_cancellation_reason", "reason_for_cancelling_by_customer", 
                  "incomplete_rides_reason", "ride_distance", "avg_vtat", "booking_value", "driver_rating", "customer_rating"], axis=1)

In [25]:
# Let's try to use logistic regression to predict the booking status

# 1) Target / features
y = df_2["booking_status"]
X = df_2.drop(columns=["booking_status"])
num_cols = [c for c in ["pickup_latitude","pickup_longitude","drop_latitude","drop_longitude"] if c in X.columns]
cat_cols = [c for c in ["hour","weekday","day","month","vehicle_type","payment_method"] if c in X.columns]

# 3) Preprocessing + Modelling
preprocessor = ColumnTransformer(transformers=[
        *([("num", StandardScaler(), num_cols)] if num_cols else []),
        *([("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)] if cat_cols else []),],remainder="drop")
pipe = Pipeline([("pre", preprocessor),("model", LogisticRegression(max_iter=1000, class_weight="balanced"))])
n_splits = max(2, min(5, int(min_per_class)))
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

scoring = {"accuracy": make_scorer(accuracy_score),"f1_macro": make_scorer(f1_score, average="macro", zero_division=0),}
cv_res = cross_validate(pipe, X, y, cv=cv, scoring=scoring, n_jobs=-1)
print(f"Accuracy : {cv_res['test_accuracy'].mean():.3f} ± {cv_res['test_accuracy'].std():.3f}")
print(f"F1-macro : {cv_res['test_f1_macro'].mean():.3f} ± {cv_res['test_f1_macro'].std():.3f}")

# 5) Performance + confusion matrix
y_pred_oof = cross_val_predict(pipe, X, y, cv=cv, n_jobs=-1, method="predict")
print("\n=== Classification report (OOF) ===")
print(classification_report(y, y_pred_oof, zero_division=0))
labels = list(np.unique(y))
cm = confusion_matrix(y, y_pred_oof, labels=labels)

# Model fitting
import joblib
pipe.fit(X, y)
joblib.dump(pipe, "booking_status_logreg_baseline.joblib", compress=3)
print("Model saved as: booking_status_logreg.joblib")



Accuracy : 0.439 ± 0.004
F1-macro : 0.346 ± 0.002

=== Classification report (OOF) ===
                       precision    recall  f1-score   support

Cancelled by Customer       0.22      0.36      0.28     10500
  Cancelled by Driver       0.56      0.31      0.40     27000
            Completed       0.91      0.49      0.64     93000
           Incomplete       0.09      0.51      0.15      9000
      No Driver Found       0.22      0.33      0.27     10500

             accuracy                           0.44    150000
            macro avg       0.40      0.40      0.35    150000
         weighted avg       0.70      0.44      0.51    150000

model saved as: booking_status_logreg_baseline.joblib


### **Conclusion**

With today’s booking-time data, the model can’t reliably predict ride outcomes (around 44% accuracy with low balanced performance). It frequently confuses Completed vs Incomplete and the different cancellation types, which would create too many false alarms for automation. The likely missing signals are real-time operational and contextual factors (driver ETA/proximity, supply–demand, traffic, weather, local events). Business takeaway: don’t use this for automated actions; at best, treat it as a rough risk flag for manual review. To improve, add those real-time signals at booking and re-evaluate.

**From a business perspective, we should not use this model for automated actions; at best, treat it as a rough risk flag for manual review. To improve it, we could add those real-time signals at booking and re-evaluate.**

# Prediction of the waiting time

### Is it possible to predict how many time a customer will wait (on average) before starting his trip ?

In [62]:
df_3 = pd.read_csv("../data/rides_data_clean.csv")
df_3.head(5)

Unnamed: 0.1,Unnamed: 0,month,day,weekday,hour,booking_id,booking_status,customer_id,vehicle_type,pickup_location,drop_location,avg_vtat,reason_for_cancelling_by_customer,driver_cancellation_reason,incomplete_rides_reason,booking_value,ride_distance,driver_rating,customer_rating,payment_method
0,0,3,23,5,12,"""CNR5884300""",No Driver Found,"""CID1982111""",eBike,Palam Vihar,Jhilmil,0.0,,,,0.0,0.0,-1.0,-1.0,No payment
1,1,11,29,4,18,"""CNR1326809""",Incomplete,"""CID4604802""",Go Sedan,Shastri Nagar,Gurgaon Sector 56,4.9,,,Vehicle Breakdown,237.0,5.73,-1.0,-1.0,UPI
2,2,8,23,4,8,"""CNR8494506""",Completed,"""CID9202816""",Auto,Khandsa,Malviya Nagar,13.4,,,,627.0,13.58,4.9,4.9,Debit Card
3,3,10,21,0,17,"""CNR8906825""",Completed,"""CID2610914""",Premier Sedan,Central Secretariat,Inderlok,13.1,,,,416.0,34.02,4.6,5.0,UPI
4,4,9,16,0,22,"""CNR1950162""",Completed,"""CID9933542""",Bike,Ghitorni Village,Khan Market,5.3,,,,737.0,48.21,4.1,4.3,UPI


In [64]:
# We remove the useless columns for modeling (or the data that may leak)

df_3 = df_3.drop(["booking_status","reason_for_cancelling_by_customer","driver_cancellation_reason","incomplete_rides_reason",
                "booking_value","ride_distance","Unnamed: 0", "booking_id", "customer_id","driver_rating", 
                "driver_rating", "customer_rating"], axis=1)

In [74]:
# 1) Target / features
y = df_3["avg_vtat"]
X = df_3.drop(columns=["avg_vtat"])

# 2) Types
num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X.select_dtypes(exclude=["number"]).columns.tolist()

preprocess = ColumnTransformer(transformers=[
        ("num", Pipeline([("imputer", SimpleImputer(strategy="median")),]), num_cols) if num_cols else ("num_drop","drop",[]),
        ("cat", Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", min_frequency=50))]), cat_cols) if cat_cols else ("cat_drop","drop",[]),],remainder="drop")

# 4) RF Modelling
rf_fast = RandomForestRegressor(n_estimators=200,max_depth=16,min_samples_leaf=10,max_features=0.5,bootstrap=True,max_samples=0.7,
                                n_jobs=-1, random_state=42)

pipe = Pipeline([("pre", preprocess),("model", rf_fast)])

# 5) Model assessment
cv = KFold(n_splits=3, shuffle=True, random_state=42)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
mae_cv = -cross_val_score(pipe, X, y, cv=cv, scoring=mae_scorer, n_jobs=-1)
print(f"CV (3-fold) | MAE: {mae_cv.mean():.3f} ± {mae_cv.std():.3f}")

# Model fitting
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
pipe.fit(X_tr, y_tr)
pred = pipe.predict(X_te)
mae  = mean_absolute_error(y_te, pred)
rmse = np.sqrt(mean_squared_error(y_te, pred))
r2   = r2_score(y_te, pred)
print(f"Hold-out | MAE: {mae:.3f} | RMSE: {rmse:.3f} | R²: {r2:.3f}")
pipe.fit(X, y)
import joblib
joblib.dump(pipe, "avg_vtat_rf_fast.joblib", compress=3)
print("Model saved as: avg_vtat_rf.joblib")

CV (3-fold) | MAE: 3.479 ± 0.007
Hold-out | MAE: 3.486 | RMSE: 4.198 | R²: 0.022
Model saved as: avg_vtat_rf_fast.joblib


### **Conclusion**

The model estimates customer wait time before a trip starts, but current results show an average error of about 3–4 minutes and very low explanatory power—only slightly better than a simple historical average. This means the predictions aren’t reliable enough to display precise ETAs or to trigger costly actions (re-dispatch, proactive credits). 

**At best, the output can help loosely prioritize manual checks in certain zones or hours. The main reason for the limitation is missing real-time information (driver proximity/ETA, supply–demand balance, traffic, weather, local events).**