In [None]:
%pip install pandas numpy scikit-learn matplotlib seaborn imbalanced-learn xgboost joblib streamlit




In [None]:
## Problem Definition
#- **Problem:** High ride cancellation rates in NCR impact revenue and customer satisfaction.
#- **Objective:** Build a machine learning model to predict cancellations and create a Streamlit dashboard for insights.
#- **Dataset:** ncr_ride_bookings.csv (21 columns, ~148,770 rows).

In [None]:
#Data Understanding
import pandas as pd

# Read the raw bookings export into memory
df = pd.read_csv('../data/ncr_ride_bookings.csv')  # Replace with your file path

# One (heading, value) pair per inspection, printed in order:
#   shape (project notes expect ~148770 rows, 21 columns), dtypes,
#   per-column null counts (notes expect many in Avg VTAT, Avg CTAT, etc.),
#   the first five rows, describe() for the numeric columns, and the
#   normalised class balance of 'Booking Status' (imbalance check).
inspection_report = (
    ("Dataset Shape:", df.shape),
    ("\nData Types:\n", df.dtypes),
    ("\nMissing Values:\n", df.isnull().sum()),
    ("\nSample Data:\n", df.head(5)),
    ("\nNumerical Summary:\n", df.describe()),
    ("\nBooking Status Distribution:\n", df['Booking Status'].value_counts(normalize=True)),
)
for heading, content in inspection_report:
    print(heading, content)

In [None]:
#Data Cleaning and Preparation
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Column groups used by the cleaning step
numerical_cols = ['Avg VTAT', 'Avg CTAT', 'Booking Value', 'Ride Distance', 'Driver Ratings', 'Customer Rating']
categorical_cols = ['Vehicle Type', 'Pickup Location', 'Drop Location', 'Payment Method', 'Reason for cancelling by Customer', 'Driver Cancellation Reason', 'Incomplete Rides Reason']
flag_cols = ['Cancelled Rides by Customer', 'Cancelled Rides by Driver', 'Incomplete Rides']
# NOTE(review): the flag and "reason" columns look like they describe the outcome of the
# same booking that the target is derived from. If so, they leak the label and should be
# excluded from X before modelling - confirm against the data dictionary.

# --- Row-wise cleaning: uses no dataset-level statistics, so it is safe before the split ---

# Missing categories become an explicit 'Unknown' level
df[categorical_cols] = df[categorical_cols].fillna('Unknown')

# Flags are binary-like; a missing flag is treated as 0
df[flag_cols] = df[flag_cols].fillna(0).astype(int)

# Merge Date and Time into one timestamp (Date is expected as 'YYYY-MM-DD')
df['Datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%Y-%m-%d %H:%M:%S')

# Drop the raw date/time parts and the identifier columns (IDs are not predictive)
df = df.drop(columns=['Date', 'Time', 'Booking ID', 'Customer ID'])

# Binary target: True for anything other than a completed ride
df['Cancelled'] = df['Booking Status'] != 'Completed'
df = df.drop(columns='Booking Status')

# --- Decide the 80/10/10 split FIRST, so imputation and outlier statistics can be
# --- learned from training rows only (computing them on all rows leaks val/test).
# Splitting row positions with the same seed and stratification selects exactly the
# rows that splitting the frame itself would select.
y = df['Cancelled']
row_positions = np.arange(len(df))
train_pos, temp_pos = train_test_split(row_positions, test_size=0.2, random_state=42, stratify=y)
val_pos, test_pos = train_test_split(temp_pos, test_size=0.5, random_state=42, stratify=y.iloc[temp_pos])

# Median imputation: medians come from the training rows, then fill every row
train_numeric = df.iloc[train_pos][numerical_cols]
train_medians = train_numeric.median()
train_numeric = train_numeric.fillna(train_medians)
df[numerical_cols] = df[numerical_cols].fillna(train_medians)

# Outlier capping: 1.5 * IQR fences from the (imputed) training rows, applied to every row
q1 = train_numeric.quantile(0.25)
q3 = train_numeric.quantile(0.75)
iqr = q3 - q1
df[numerical_cols] = df[numerical_cols].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr, axis=1)

# Materialise the splits (explicit copies so later column assignments never touch df)
X = df.drop('Cancelled', axis=1)
y = df['Cancelled']
X_train, y_train = X.iloc[train_pos].copy(), y.iloc[train_pos]
X_temp, y_temp = X.iloc[temp_pos], y.iloc[temp_pos]
X_val, y_val = X.iloc[val_pos].copy(), y.iloc[val_pos]
X_test, y_test = X.iloc[test_pos].copy(), y.iloc[test_pos]

# Scaling: fit on train only, then apply the same transform to val and test
scaler = StandardScaler()
numerical_to_scale = ['Avg VTAT', 'Avg CTAT', 'Booking Value', 'Ride Distance']  # Select continuous
X_train[numerical_to_scale] = scaler.fit_transform(X_train[numerical_to_scale])
X_val[numerical_to_scale] = scaler.transform(X_val[numerical_to_scale])
X_test[numerical_to_scale] = scaler.transform(X_test[numerical_to_scale])

print("Data Cleaned and Split. Train Shape:", X_train.shape)

In [None]:
#Feature Engineering
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Temporal features: derive hour, weekday index and a weekend indicator from the
# 'Datetime' column, identically for each of the three splits.
for split in (X_train, X_val, X_test):
    timestamps = split['Datetime']
    split['hour_of_day'] = timestamps.dt.hour
    split['day_of_week'] = timestamps.dt.dayofweek
    # dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday
    split['is_weekend'] = (split['day_of_week'] >= 5).astype(int)

# Peak/Off-peak flag (e.g., peak: 7-10 AM, 5-8 PM)
def is_peak(hour):
    """Return 1 when `hour` falls in a peak window, otherwise 0.

    Both windows are inclusive on the hour value: 7..10 and 17..20, so any
    time from 10:00 to 10:59 or 20:00 to 20:59 still counts as peak.
    """
    in_morning_window = 7 <= hour <= 10
    in_evening_window = 17 <= hour <= 20
    return int(in_morning_window or in_evening_window)
# Tag every row of each split with the peak indicator computed from its hour
for split in (X_train, X_val, X_test):
    split['peak_flag'] = split['hour_of_day'].map(is_peak)

# VTAT buckets (low <5, medium 5-10, high >10)
def vtat_bucket(vtat):
    """Map a VTAT value to the label 'Low', 'Medium' or 'High'.

    Values below 5 are 'Low'; values from 5 up to and including 10 are
    'Medium'; everything else is 'High'. NaN fails both comparisons and
    therefore also lands in 'High'.
    """
    if vtat < 5:
        return 'Low'
    if vtat <= 10:
        return 'Medium'
    return 'High'
# 'Avg VTAT' was standardised by `scaler` in the cleaning cell, so it now holds z-scores.
# The bucket thresholds (5 and 10) are expressed in the original units, so undo the
# scaling before bucketing; otherwise almost every row would be labelled 'Low'.
_vtat_pos = numerical_to_scale.index('Avg VTAT')
for split in (X_train, X_val, X_test):
    raw_vtat = split['Avg VTAT'] * scaler.scale_[_vtat_pos] + scaler.mean_[_vtat_pos]
    # Rounding removes float noise from the scale/unscale round-trip so values that
    # were exactly 5 or 10 stay on the intended side of the thresholds.
    split['vtat_bucket'] = raw_vtat.round(6).apply(vtat_bucket)

# High/Low fare flag: 1 when 'Booking Value' is strictly above the TRAIN median, else 0.
# The threshold is taken from the training split only and reused for val and test.
median_value = X_train['Booking Value'].median()
for split in (X_train, X_val, X_test):
    split['high_fare_flag'] = split['Booking Value'].gt(median_value).astype(int)

# Reliability scores: with the IDs already dropped there is no per-customer or per-driver
# history to aggregate, so each score is simply the complement of the matching
# cancellation flag (1 when the flag is 0, 0 when the flag is 1).
# NOTE(review): these flags look like they record the outcome of the same booking that
# `Cancelled` is derived from; if so, both features leak the target - confirm.
for split in (X_train, X_val, X_test):
    split['customer_reliability'] = 1 - split['Cancelled Rides by Customer']
    split['driver_reliability'] = 1 - split['Cancelled Rides by Driver']

# Ride speed = 'Ride Distance' / 'Avg CTAT', computed in the original units.
# Both columns were standardised by `scaler` in the cleaning cell; a ratio of z-scores
# has no physical meaning (and the zero-guard would test a z-score, not a real zero),
# so the scaling is undone before dividing.
_distance_pos = numerical_to_scale.index('Ride Distance')
_ctat_pos = numerical_to_scale.index('Avg CTAT')
for split in (X_train, X_val, X_test):
    raw_distance = split['Ride Distance'] * scaler.scale_[_distance_pos] + scaler.mean_[_distance_pos]
    raw_ctat = split['Avg CTAT'] * scaler.scale_[_ctat_pos] + scaler.mean_[_ctat_pos]
    # Round away float noise from the round-trip so a true zero is detected, then
    # divide by 1 instead of 0 (or a missing value) to avoid inf/NaN speeds.
    safe_ctat = raw_ctat.round(6).replace(0, np.nan).fillna(1)
    split['ride_speed'] = raw_distance / safe_ctat

# The timestamp has served its purpose once the temporal features exist; remove it
X_train = X_train.drop(columns=['Datetime'])
X_val = X_val.drop(columns=['Datetime'])
X_test = X_test.drop(columns=['Datetime'])

# One-hot encoding for the listed columns.
# The encoder learns its categories from the training split; categories that only
# appear in val/test are encoded as all-zero rows (handle_unknown='ignore').
ohe_cols = ['Vehicle Type', 'Payment Method', 'vtat_bucket']  # One-hot
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(X_train[ohe_cols])
ohe_feature_names = ohe.get_feature_names_out()

# Wrap each dense array in a DataFrame that keeps the split's own index, so it can
# be concatenated back onto the split row-for-row.
X_train_ohe, X_val_ohe, X_test_ohe = (
    pd.DataFrame(ohe.transform(split[ohe_cols]), columns=ohe_feature_names, index=split.index)
    for split in (X_train, X_val, X_test)
)

# Label encoding for locations and reasons (high cardinality, so integer codes
# instead of one-hot). The fitted encoders are kept in `le_dict`, keyed by column.
# NOTE(review): integer codes impose an arbitrary order on nominal categories;
# acceptable for tree models, questionable for the linear / neural models.
le_cols = ['Pickup Location', 'Drop Location', 'Reason for cancelling by Customer', 'Driver Cancellation Reason', 'Incomplete Rides Reason']
le_dict = {}
for col in le_cols:
    le = LabelEncoder()
    # Register the '<unknown>' placeholder at FIT time. Previously it was appended to
    # `classes_` only after val/test had been transformed, so any label unseen in
    # training raised ValueError inside `transform`.
    le.fit(list(X_train[col].unique()) + ['<unknown>'])
    known_labels = set(le.classes_)  # set membership instead of a scan per row

    X_train[col] = le.transform(X_train[col])
    # Labels never seen in training are routed to the placeholder class
    X_val[col] = le.transform(X_val[col].where(X_val[col].isin(known_labels), '<unknown>'))
    X_test[col] = le.transform(X_test[col].where(X_test[col].isin(known_labels), '<unknown>'))
    le_dict[col] = le

# Swap the raw one-hot source columns for their encoded counterparts in every split
X_train, X_val, X_test = (
    pd.concat([split.drop(columns=ohe_cols), encoded], axis=1)
    for split, encoded in (
        (X_train, X_train_ohe),
        (X_val, X_val_ohe),
        (X_test, X_test_ohe),
    )
)

print("Features Engineered. Train Shape:", X_train.shape)

In [None]:
#Exploratory Data Analysis (EDA)
import matplotlib.pyplot as plt
import seaborn as sns

# EDA works on a copy of the full (unsplit) frame as it stands after the cleaning
# cell, so the plots below never modify `df`.
df_eda = df.copy()

# Cancellation counts per vehicle type, split by the target
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df_eda, x='Vehicle Type', hue='Cancelled', ax=ax)
ax.set_title('Cancellations by Vehicle Type')
fig.savefig('vehicle_cancellations.png')  # Save for report
plt.show()

# Booking volume per hour of day, stacked by the target
df_eda['hour'] = df_eda['Datetime'].dt.hour
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df_eda, x='hour', hue='Cancelled', multiple='stack', ax=ax)
ax.set_title('Cancellations by Hour')
fig.savefig('hour_cancellations.png')
plt.show()

# Correlation heatmap (numerical columns plus the target).
# `Cancelled` is boolean, and select_dtypes(include=np.number) silently drops booleans,
# so the target is cast to int first; otherwise the heatmap cannot show how any
# feature relates to cancellation.
num_df = (
    df_eda
    .assign(Cancelled=df_eda['Cancelled'].astype(int))
    .select_dtypes(include=np.number)
)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(num_df.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
fig.savefig('correlation_heatmap.png')
plt.show()

# Cancellation reasons (top 5).
# Missing reasons were imputed with the placeholder 'Unknown' during cleaning; leaving
# it in would make the placeholder the dominant "reason", so it is excluded here.
customer_reasons = df_eda['Reason for cancelling by Customer']
top_reasons = customer_reasons[customer_reasons != 'Unknown'].value_counts().head(5)

if top_reasons.empty:
    # Plotting an empty Series raises, so report the situation instead
    print("No customer cancellation reasons recorded; skipping reasons plot.")
else:
    fig, ax = plt.subplots(figsize=(8, 5))
    top_reasons.plot(kind='bar', ax=ax)
    ax.set_title('Top Customer Cancellation Reasons')
    fig.savefig('reasons.png')
    plt.show()

# Print insights
# NOTE(review): this summary is a fixed string, not computed from the data above -
# confirm it against the plots before it goes into the report.
print("Key Insights: High cancellations in peak hours, certain vehicles, low ratings correlate with cancels.")

In [None]:
#Model Development
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# imblearn's Pipeline applies samplers during fit only, which lets SMOTE run inside
# each cross-validation fold instead of before the folds are cut.
from imblearn.pipeline import Pipeline as ImbPipeline

# Handle imbalance with SMOTE (on train only). The resampled set is used directly by
# the models that are not tuned below.
# NOTE(review): SMOTE interpolates between rows, which is not meaningful for the
# integer-coded categorical columns; SMOTENC may be a better fit - confirm.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Candidate models. Every estimator is seeded so a fresh Restart & Run All reproduces
# the same fits; LogisticRegression gets a higher iteration cap because the default
# (100) frequently stops before convergence on partly unscaled features.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'Neural Network': MLPClassifier(max_iter=500, random_state=42)
}

# Hyperparam tuning (example for RF and XGBoost)
param_grids = {
    'Random Forest': {'n_estimators': [50, 100], 'max_depth': [10, 20]},
    'XGBoost': {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1]}
}

best_models = {}
for name, model in models.items():
    if name in param_grids:
        # Oversampling BEFORE cross-validation places synthetic neighbours of a
        # validation-fold point in the training folds and inflates the CV F1 score.
        # Wrapping SMOTE and the classifier in one pipeline resamples only the
        # training portion of each fold.
        tuned_pipeline = ImbPipeline([('smote', SMOTE(random_state=42)), ('clf', model)])
        pipeline_grid = {f'clf__{param}': values for param, values in param_grids[name].items()}
        grid = GridSearchCV(tuned_pipeline, pipeline_grid, cv=5, scoring='f1')
        grid.fit(X_train, y_train)

        # Keep the bare classifier (refit on the SMOTE-resampled full training set)
        # so `best_models` holds plain estimators for every model.
        best_models[name] = grid.best_estimator_.named_steps['clf']
        best_params = {param.split('__', 1)[1]: value for param, value in grid.best_params_.items()}
        print(f"{name} Best Params: {best_params}")
    else:
        model.fit(X_train_res, y_train_res)
        best_models[name] = model

print("Models Trained.")


In [None]:
#Model Evaluation
