In [39]:
import pandas as pd

# Load the cleaned CSV file
df = pd.read_csv("ncr_ride_bookings_with_weather_filled_scaled_short.csv")

# Display the first 5 rows to preview the data structure
df.head()


Unnamed: 0,Date,Time,booking_datetime,booking_hour,Booking ID,Booking Status,Customer ID,Vehicle Type,Pickup Location,Drop Location,...,wind_speed_10m_dropoff_scaled.1,precipitation_log_scaled,precipitation_log_scaled.1,rain_log_scaled,rain_log_scaled.1,snowfall_log_scaled,snowfall_log_scaled.1,precipitation_dropoff_log_scaled,rain_dropoff_log_scaled,snowfall_dropoff_log_scaled
0,2024-03-23,12:29:38,2024-03-23 12:29:38,2024-03-23 12:00:00,"""CNR5884300""",No Driver Found,"""CID1982111""",eBike,Palam Vihar,Jhilmil,...,1.392857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2024-11-29,18:01:39,2024-11-29 18:01:39,2024-11-29 18:00:00,"""CNR1326809""",Incomplete,"""CID4604802""",Go Sedan,Shastri Nagar,Gurgaon Sector 56,...,-1.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2024-08-23,08:56:10,2024-08-23 08:56:10,2024-08-23 08:00:00,"""CNR8494506""",Completed,"""CID9202816""",Auto,Khandsa,Malviya Nagar,...,-0.464286,0.182322,0.182322,0.182322,0.182322,0.0,0.0,0.0,0.0,0.0
3,2024-10-21,17:17:25,2024-10-21 17:17:25,2024-10-21 17:00:00,"""CNR8906825""",Completed,"""CID2610914""",Premier Sedan,Central Secretariat,Inderlok,...,-0.214286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2024-09-16,22:08:00,2024-09-16 22:08:00,2024-09-16 22:00:00,"""CNR1950162""",Completed,"""CID9933542""",Bike,Ghitorni Village,Khan Market,...,0.017857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# -----------------------------
# Cell 1) Imports & folders
# -----------------------------
import os
import uuid
import time
from pathlib import Path
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Make sure artifact folders exist
Path("artifacts/metrics").mkdir(parents=True, exist_ok=True)
Path("artifacts/plots").mkdir(parents=True, exist_ok=True)
Path("artifacts/models").mkdir(parents=True, exist_ok=True)

print("Folders ready.")


Folders ready.


In [41]:
# -----------------------------
# Cell 2) Feature inference helpers
# -----------------------------
from typing import Tuple

def infer_feature_lists(df: pd.DataFrame, target_col: str) -> Tuple[List[str], List[str], List[str]]:
    """
    Infer feature columns based on naming patterns created by preprocessing.
    Returns (numeric_scaled, categorical_fill, missing_flags).
    """
    cols = [c for c in df.columns if c != target_col]

    numeric_scaled = [c for c in cols if c.endswith("_scaled") or c.endswith("_log_scaled")]
    categorical_fill = [c for c in cols if c.endswith("_fill") or c == "Vehicle Type"]
    missing_flags = [c for c in cols if c.endswith("_missing_flag")]

    # De-duplicate while preserving order
    used = set()
    ordered_groups = []
    for group in (numeric_scaled, categorical_fill, missing_flags):
        tmp = []
        for c in group:
            if c not in used:
                tmp.append(c)
                used.add(c)
        ordered_groups.append(tmp)

    numeric_scaled, categorical_fill, missing_flags = ordered_groups
    print(f"#numeric_scaled={len(numeric_scaled)}, #categorical_fill={len(categorical_fill)}, #missing_flags={len(missing_flags)}")
    return numeric_scaled, categorical_fill, missing_flags


In [42]:
# -----------------------------
# Cell 3) Build preprocessor
# -----------------------------
def build_preprocessor(
    numeric_scaled: List[str],
    categorical_fill: List[str],
    missing_flags: List[str]
) -> ColumnTransformer:
    """
    One-hot encode categorical; passthrough numeric+flags (already scaled / binary).
    """
    num_and_flags = numeric_scaled + missing_flags
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_fill),
            ("num", "passthrough", num_and_flags),
        ],
        remainder="drop",
    )
    return preprocessor


In [43]:
# -----------------------------
# Cell 4) Model registry
# -----------------------------
def get_models(random_state: int = 42) -> Dict[str, object]:
    """
    Baselines for logistic regression, decision tree, and ensembles.
    """
    models = {
        "logreg_l2": LogisticRegression(
            max_iter=1000,
            class_weight="balanced",
            random_state=random_state
        ),
        "dtree": DecisionTreeClassifier(
            max_depth=None,
            min_samples_split=5,
            class_weight="balanced",
            random_state=random_state
        ),
        "rf_300": RandomForestClassifier(
            n_estimators=300,
            max_depth=None,
            n_jobs=-1,
            class_weight="balanced",
            random_state=random_state
        ),
        "gbdt": GradientBoostingClassifier(
            random_state=random_state
        ),
    }
    return models


In [44]:
# -----------------------------
# Cell 5) Exact-label mapping & safer CV
# -----------------------------
from typing import Optional, Iterable
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd

def map_status_to_binary_exact_labels(s: str, positive_label_set: Iterable[str]) -> int:
    """
    Map Booking Status to binary using an explicit label set.
    Returns 1 if the normalized label is in positive_label_set, else 0.
    """
    if pd.isna(s):
        return 0
    s_norm = str(s).strip()
    return int(s_norm in set(positive_label_set))

def safe_stratified_cv(y_bin: pd.Series, n_splits: int = 5, random_state: int = 42):
    """
    Yield StratifiedKFold splits ensuring at least 2 classes per fold.
    Reduce n_splits if the minority class is too small; skip degenerate folds.
    """
    vc = y_bin.value_counts()
    if vc.nunique() == 1:
        raise ValueError(
            "Only one class present overall after mapping. Adjust your positive_label_set or labels."
        )
    minority = vc.min()
    n_splits = min(n_splits, minority)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for fold_id, (tr_idx, va_idx) in enumerate(skf.split(np.zeros(len(y_bin)), y_bin), 1):
        y_tr, y_va = y_bin.iloc[tr_idx], y_bin.iloc[va_idx]
        if y_tr.nunique() < 2 or y_va.nunique() < 2:
            print(f"⚠️ Skipping fold {fold_id}: single class in train/valid.")
            continue
        yield fold_id, tr_idx, va_idx


In [45]:
# -----------------------------
# Cell 6) Evaluate models with CV (exact label set)
# -----------------------------
def evaluate_models_cv(
    df: pd.DataFrame,
    target_col: str,
    positive_label_set: Iterable[str],
    n_splits: int = 5,
) -> pd.DataFrame:
    """
    Build preprocessor, train multiple models with StratifiedKFold CV,
    and return a tidy metrics dataframe:
      ['run_id','timestamp','model_name','fold','metric','value','params']
    """
    # 1) Normalize target text and map to binary via exact label set
    y_text = df[target_col].astype(str).str.strip()
    y_bin = y_text.apply(lambda s: map_status_to_binary_exact_labels(s, positive_label_set)).astype(int)
    print("Label distribution:", y_bin.value_counts().to_dict())
    if y_bin.value_counts().min() < 2:
        raise ValueError("Not enough minority samples. Adjust positive_label_set or filter data.")

    # 2) Infer features & build X
    numeric_scaled, categorical_fill, missing_flags = infer_feature_lists(df, target_col)
    X = df[numeric_scaled + categorical_fill + missing_flags].copy()

    # 3) Preprocessor & models
    pre = build_preprocessor(numeric_scaled, categorical_fill, missing_flags)
    models = get_models()

    # 4) CV loop
    rows = []
    run_id = str(uuid.uuid4())
    ts = int(time.time())

    for model_name, base_model in models.items():
        pipe = Pipeline(steps=[("preprocess", pre), ("model", base_model)])
        for fold_id, tr_idx, va_idx in safe_stratified_cv(y_bin, n_splits=n_splits):
            X_tr, X_va = X.iloc[tr_idx], X_]()


SyntaxError: unmatched ']' (1946838344.py, line 38)

In [None]:
# -----------------------------
# Cell 7) Run: load data, set positive labels, evaluate, save, leaderboard
# -----------------------------
# 7.1 Load cleaned CSV from teammate
csv_path = "ncr_ride_bookings_with_weather_filled_scaled_short.csv"
df = pd.read_csv(csv_path)

# (optional) quick peek
print(df.shape)
print(df.columns[:10])

# 7.2 Choose target & define exact positive label set based on your dataset
target_col = "Booking Status"

# From your screenshot / dataset (adjust if you find more):
# { "Cancelled by Customer", "Cancelled by Driver", "Completed", "Incomplete", "No Driver Found" }
POSITIVE_LABELS = {
    "Cancelled by Customer",
    "Cancelled by Driver",
    "Incomplete",
    "No Driver Found"
}
NEGATIVE_LABELS = {"Completed"}  # optional for sanity check

# Sanity: list unknown labels not covered by the sets
df[target_col] = df[target_col].astype(str).str.strip()
known = POSITIVE_LABELS | NEGATIVE_LABELS
unknown = sorted(set(df[target_col].unique()) - known)
print("Unknown labels (not in POSITIVE or NEGATIVE sets):", unknown)

# 7.3 Evaluate with CV using the exact label set
metrics_df = evaluate_models_cv(
    df=df,
    target_col=target_col,
    positive_label_set=POSITIVE_LABELS,
    n_splits=5
)

# 7.4 Save metrics CSV for visualization teammate
from pathlib import Path
Path("artifacts/metrics").mkdir(parents=True, exist_ok=True)
out_csv = "artifacts/metrics/metrics.csv"
metrics_df.to_csv(out_csv, index=False)
print(f"Saved metrics to {out_csv}")

# 7.5 Quick leaderboard by F1
leaderboard = (
    metrics_df[metrics]()_


(146614, 68)
Index(['Date', 'Time', 'booking_datetime', 'booking_hour', 'Booking ID',
       'Booking Status', 'Customer ID', 'Vehicle Type', 'Pickup Location',
       'Drop Location'],
      dtype='object')
#numeric_scaled=20, #categorical_fill=9, #missing_flags=3
Label distribution: {0: 101146, 1: 45468}
Saved metrics to artifacts/metrics/metrics.csv


Unnamed: 0,model_name,mean,std
0,dtree,1.0,0.0
1,gbdt,1.0,0.0
2,logreg_l2,1.0,0.0
3,rf_300,1.0,0.0


In [None]:
# -----------------------------
# (Optional) Cell 8) Peek metrics
# -----------------------------
pd.read_csv("artifacts/metrics/metrics.csv").head(10)


Unnamed: 0,run_id,timestamp,model_name,fold,metric,value,params
0,6142a5b1-cd00-4028-b027-8ddf3db0f934,1758155153,logreg_l2,1,f1,1.0,"{'C': 1.0, 'class_weight': 'balanced', 'dual':..."
1,6142a5b1-cd00-4028-b027-8ddf3db0f934,1758155153,logreg_l2,1,precision,1.0,"{'C': 1.0, 'class_weight': 'balanced', 'dual':..."
2,6142a5b1-cd00-4028-b027-8ddf3db0f934,1758155153,logreg_l2,1,recall,1.0,"{'C': 1.0, 'class_weight': 'balanced', 'dual':..."
3,6142a5b1-cd00-4028-b027-8ddf3db0f934,1758155153,logreg_l2,1,accuracy,1.0,"{'C': 1.0, 'class_weight': 'balanced', 'dual':..."
4,6142a5b1-cd00-4028-b027-8ddf3db0f934,1758155153,logreg_l2,1,roc_auc,1.0,"{'C': 1.0, 'class_weight': 'balanced', 'dual':..."
5,6142a5b1-cd00-4028-b027-8ddf3db0f934,1758155153,logreg_l2,2,f1,1.0,"{'C': 1.0, 'class_weight': 'balanced', 'dual':..."
6,6142a5b1-cd00-4028-b027-8ddf3db0f934,1758155153,logreg_l2,2,precision,1.0,"{'C': 1.0, 'class_weight': 'balanced', 'dual':..."
7,6142a5b1-cd00-4028-b027-8ddf3db0f934,1758155153,logreg_l2,2,recall,1.0,"{'C': 1.0, 'class_weight': 'balanced', 'dual':..."
8,6142a5b1-cd00-4028-b027-8ddf3db0f934,1758155153,logreg_l2,2,accuracy,1.0,"{'C': 1.0, 'class_weight': 'balanced', 'dual':..."
9,6142a5b1-cd00-4028-b027-8ddf3db0f934,1758155153,logreg_l2,2,roc_auc,1.0,"{'C': 1.0, 'class_weight': 'balanced', 'dual':..."


In [None]:
df.columns

Index(['Date', 'Time', 'booking_datetime', 'booking_hour', 'Booking ID',
       'Booking Status', 'Customer ID', 'Vehicle Type', 'Pickup Location',
       'Drop Location', 'Cancelled Rides by Customer_fill',
       'Reason for cancelling by Customer_fill',
       'Cancelled Rides by Driver_fill', 'Driver Cancellation Reason_fill',
       'Incomplete Rides_fill', 'Incomplete Rides Reason_fill',
       'Avg VTAT_fill_scaled', 'VTAT_missing_flag', 'CTAT_missing_flag',
       'Avg CTAT_fill_scaled', 'BookingValue_missing_flag',
       'Booking Value_fill_scaled', 'Ride Distance_fill_scaled',
       'Driver Ratings_fill', 'Customer Rating_fill', 'pick_longitude',
       'pick_latitude', 'pick_address', 'pick_region', 'pick_locality',
       'drop_longitude', 'drop_latitude', 'drop_address', 'drop_region',
       'drop_locality', 'pick_station_latitude', 'pick_station_longitude',
       'drop_station_latitude', 'drop_station_longitude',
       'temperature_2m_scaled', 'temperature_2m_scaled.