<a href="https://colab.research.google.com/github/KashiKumari/OrderCancellation/blob/main/Finalone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from collections import Counter
import gradio as gr

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the dataset is later read from /content/Rider-Info.csv, which is
# not under this mount point — confirm whether the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Load the rider/order CSV into the working DataFrame used by every later cell.
DATA_PATH = "/content/Rider-Info.csv"
data = pd.read_csv(DATA_PATH)

In [None]:
# Quick look at the raw frame: first rows, then the column index.
print(data.head(), data.columns, sep="\n")

            order_time  order_id           order_date           allot_time  \
0  2021-01-26 02:21:35    556753  2021-01-26 00:00:00  2021-01-26 02:21:59   
1  2021-01-26 02:33:16    556754  2021-01-26 00:00:00  2021-01-26 02:33:57   
2  2021-01-26 02:39:49    556755  2021-01-26 00:00:00  2021-01-26 02:39:57   
3  2021-01-26 02:47:53    556756  2021-01-26 00:00:00  2021-01-26 02:48:25   
4  2021-01-26 03:06:30    556757  2021-01-26 00:00:00  2021-01-26 03:07:21   

           accept_time          pickup_time       delivered_time  rider_id  \
0  2021-01-26 02:22:08  2021-01-26 02:32:51  2021-01-26 02:49:47     11696   
1  2021-01-26 02:34:45  2021-01-26 02:50:25  2021-01-26 03:11:15     18117   
2  2021-01-26 02:40:13  2021-01-26 02:56:00  2021-01-26 03:12:46     18623   
3  2021-01-26 02:49:06  2021-01-26 03:21:51  2021-01-26 03:41:05     15945   
4  2021-01-26 03:07:57  2021-01-26 03:31:38  2021-01-26 04:00:15     17589   

   first_mile_distance  last_mile_distance  alloted_orders  de

In [None]:
# Handle datetime columns
# Parse the event timestamps so later cells can use the .dt accessor and
# subtract them from each other.
date_cols = ['order_time', 'allot_time', 'accept_time', 'pickup_time', 'delivered_time']
for col in date_cols:
    # errors='coerce' turns unparseable values into NaT instead of raising.
    # `infer_datetime_format` is intentionally not passed: it is deprecated in
    # pandas 2.x (format inference is the default) and only emits a warning.
    data[col] = pd.to_datetime(data[col], errors='coerce')

  data[col] = pd.to_datetime(data[col], errors='coerce', infer_datetime_format=True)
  data[col] = pd.to_datetime(data[col], errors='coerce', infer_datetime_format=True)
  data[col] = pd.to_datetime(data[col], errors='coerce', infer_datetime_format=True)
  data[col] = pd.to_datetime(data[col], errors='coerce', infer_datetime_format=True)
  data[col] = pd.to_datetime(data[col], errors='coerce', infer_datetime_format=True)


In [None]:
# Feature Engineering
# Customer type: at most one lifetime order (missing counts are treated as 0)
# is labelled 'New'; anything above that is 'Returning'.
lifetime_orders = data['lifetime_order_count'].fillna(0)
data['customer_type'] = np.where(lifetime_orders <= 1, 'New', 'Returning')

In [None]:
# Device type proxy, derived purely from the hour the order was placed:
# hours 22-23 and 0-6 are labelled 'Mobile', every other hour 'Web'.
# Orders without a parsable order_time get hour 0 and therefore land in 'Mobile'.
data['order_hour'] = data['order_time'].dt.hour.fillna(0)
data['device_type'] = np.where(
    data['order_hour'].ge(22) | data['order_hour'].le(6),
    'Mobile',
    'Web',
)

In [None]:
# Promo usage flag: 1 when reassigned_order is positive, 0 otherwise
# (missing values count as 0).
data['promo_used'] = data['reassigned_order'].fillna(0).gt(0).astype('int64')


In [None]:
# Distance feature: zero-fill both legs in place, then add them together.
for leg in ('first_mile_distance', 'last_mile_distance'):
    data[leg] = data[leg].fillna(0)
data['total_distance'] = data['first_mile_distance'].add(data['last_mile_distance'])


In [None]:
# Rider workload: allotted minus delivered orders, with missing counts as 0.
for count_col in ('alloted_orders', 'delivered_orders'):
    data[count_col] = data[count_col].fillna(0)
data['rider_workload'] = data['alloted_orders'].sub(data['delivered_orders'])


In [None]:
# Time-based features
def _minutes_between(start, end):
    """Return the gap from `start` to `end` in minutes; undefined gaps become 0."""
    return ((end - start).dt.total_seconds() / 60).fillna(0)


# Minutes from allotment to rider acceptance.
data['accept_delay'] = _minutes_between(data['allot_time'], data['accept_time'])
# Minutes from pickup to delivery.
data['delivery_duration'] = _minutes_between(data['pickup_time'], data['delivered_time'])


In [None]:
# Re-inspect the frame after feature engineering: first rows, then the columns.
print(data.head(), data.columns, sep="\n")

           order_time  order_id           order_date          allot_time  \
0 2021-01-26 02:21:35    556753  2021-01-26 00:00:00 2021-01-26 02:21:59   
1 2021-01-26 02:33:16    556754  2021-01-26 00:00:00 2021-01-26 02:33:57   
2 2021-01-26 02:39:49    556755  2021-01-26 00:00:00 2021-01-26 02:39:57   
3 2021-01-26 02:47:53    556756  2021-01-26 00:00:00 2021-01-26 02:48:25   
4 2021-01-26 03:06:30    556757  2021-01-26 00:00:00 2021-01-26 03:07:21   

          accept_time         pickup_time      delivered_time  rider_id  \
0 2021-01-26 02:22:08 2021-01-26 02:32:51 2021-01-26 02:49:47     11696   
1 2021-01-26 02:34:45 2021-01-26 02:50:25 2021-01-26 03:11:15     18117   
2 2021-01-26 02:40:13 2021-01-26 02:56:00 2021-01-26 03:12:46     18623   
3 2021-01-26 02:49:06 2021-01-26 03:21:51 2021-01-26 03:41:05     15945   
4 2021-01-26 03:07:57 2021-01-26 03:31:38 2021-01-26 04:00:15     17589   

   first_mile_distance  last_mile_distance  ...  session_time  cancelled_time  \
0          

In [None]:

# Features and target
features = ['customer_type', 'device_type', 'promo_used', 'total_distance',
            'rider_workload', 'accept_delay', 'delivery_duration']
numeric_cols = ['total_distance', 'rider_workload', 'accept_delay', 'delivery_duration']

# NOTE(review): delivery_duration is zero-filled when pickup/delivered timestamps
# are missing. If cancelled orders never receive a delivered_time, this feature
# encodes the label directly (target leakage) — confirm before trusting the metrics.

# One-hot encode categorical features (first level of each is the baseline)
X = pd.get_dummies(data[features], drop_first=True)

# Target variable (missing labels are treated as "not cancelled")
y = data['cancelled'].fillna(0)

# Save model columns for input alignment at prediction time
model_columns = X.columns.tolist()

# Split BEFORE fitting any preprocessing so that no statistic from the test rows
# leaks into training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Work on explicit copies so the column assignments below do not write into
# views of X.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numeric features: fit on the training rows only, then apply the same
# transformation to the test rows.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Apply SMOTE to balance the training data (the test set is left untouched)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Train logistic regression on resampled data
model = LogisticRegression(penalty='l2', class_weight='balanced', max_iter=1000)
model.fit(X_train_res, y_train_res)

# Optional: check class distribution before and after SMOTE
print("Original training set class distribution:", Counter(y_train))
print("Resampled training set class distribution:", Counter(y_train_res))


Original training set class distribution: Counter({0: 311347, 1: 3653})
Resampled training set class distribution: Counter({0: 311347, 1: 311347})


In [None]:

# Gradio Prediction Function
def predict_order(customer_type, device_type, promo_used, total_distance,
                  rider_workload, accept_delay, delivery_duration, threshold=0.3):
    """Score a single order and return a human-readable verdict.

    Args:
        customer_type: 'New' or 'Returning'.
        device_type: 'Mobile' or 'Web'.
        promo_used: Truthy/falsy flag; converted with ``int()``.
        total_distance: Converted with ``float()``.
        rider_workload: Converted with ``float()``.
        accept_delay: Converted with ``float()``.
        delivery_duration: Converted with ``float()``.
        threshold: Probability above which the order is reported as
            'Cancelled/Fake'. Defaults to 0.3.

    Returns:
        A string of the form ``"Prediction: <label>, Probability: <p>"`` where
        ``<p>`` is the model's class-1 probability rounded to two decimals.

    Relies on the notebook globals ``model``, ``scaler``, ``model_columns`` and
    ``numeric_cols`` created in the training cell.
    """
    input_df = pd.DataFrame({
        'customer_type': [customer_type],
        'device_type': [device_type],
        'promo_used': [int(promo_used)],
        'total_distance': [float(total_distance)],
        'rider_workload': [float(rider_workload)],
        'accept_delay': [float(accept_delay)],
        'delivery_duration': [float(delivery_duration)]
    })

    # One-hot encode WITHOUT drop_first. A single row has exactly one level per
    # categorical column, so drop_first=True would discard it and every request
    # would be scored as the baseline category regardless of the selection.
    # Reindexing to the training columns drops the baseline dummies and
    # zero-fills any dummy that this row did not produce.
    input_df = pd.get_dummies(input_df).reindex(columns=model_columns, fill_value=0)

    # Scale numeric features with the scaler fitted at training time
    input_df[numeric_cols] = scaler.transform(input_df[numeric_cols])

    # Probability of the positive class (second column of predict_proba)
    prob = model.predict_proba(input_df)[:, 1][0]
    pred = 1 if prob > threshold else 0
    return f"Prediction: {'Cancelled/Fake' if pred==1 else 'Safe'}, Probability: {prob:.2f}"

In [None]:
# Gradio Interface
# Input widgets, listed in the same order as predict_order's parameters.
order_inputs = [
    gr.Dropdown(["New", "Returning"], label="Customer Type"),
    gr.Dropdown(["Mobile", "Web"], label="Device Type"),
    gr.Checkbox(label="Promo Used?"),
    gr.Number(label="Total Distance (km)", value=5),
    gr.Number(label="Rider Workload", value=2),
    gr.Number(label="Accept Delay (minutes)", value=5),
    gr.Number(label="Delivery Duration (minutes)", value=30),
]

# Wire the widgets to the prediction function; the verdict is shown as plain text.
iface = gr.Interface(
    fn=predict_order,
    inputs=order_inputs,
    outputs="text",
    title=" Fake Order Prediction",
    description="Enter order details to predict whether the order is likely Cancelled/Fake.",
)

# Launch the app
iface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://87f7227667bd458afb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Evaluate with the same probability cut-off that predict_order applies (0.3).
# model.predict() would use 0.5 and therefore report metrics for a different
# decision rule than the one the app actually serves.
DECISION_THRESHOLD = 0.3

# Probability of the positive class for every held-out row
test_probs = model.predict_proba(X_test)[:, 1]

# Map back to the model's own class labels so y_pred has the same label type
# as y_test.
y_pred = np.where(test_probs > DECISION_THRESHOLD, model.classes_[1], model.classes_[0])

# Accuracy (dominated by the majority class on this imbalanced target; read it
# together with the per-class precision/recall below)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Confusion Matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Classification Report (Precision, Recall, F1-Score)
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)


Accuracy: 0.9952
Confusion Matrix:
[[8212   40]
 [   0   89]]
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      8252
         1.0       0.69      1.00      0.82        89

    accuracy                           1.00      8341
   macro avg       0.84      1.00      0.91      8341
weighted avg       1.00      1.00      1.00      8341

