<a href="https://colab.research.google.com/github/Narayanan247/Ecommerce_Delivery_Performance/blob/main/ETA_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Load dataset (already uploaded in Colab)
# The three date columns are parsed into datetimes at read time so the later
# steps can subtract them directly.
date_columns = ["shipment_date", "delivery_date", "promised_delivery_date"]
df = pd.read_csv("shipments_data.csv", parse_dates=date_columns)

# ------------------- STEP 3: Feature Engineering ------------------- #
# Both durations are whole-day counts measured from the shipment date.
shipped_on = df["shipment_date"]
df["actual_delivery_days"] = (df["delivery_date"] - shipped_on).dt.days
df["promised_days"] = (df["promised_delivery_date"] - shipped_on).dt.days

# delay_days > 0 means the delivery took longer than promised; the flag is
# stored as 0/1 so that its mean can be read as a delay rate.
df["delay_days"] = df["actual_delivery_days"].sub(df["promised_days"])
df["is_delayed"] = df["delay_days"].gt(0).astype(int)

# Weekday name (e.g. "Monday") of the shipment date.
df["ship_dayofweek"] = shipped_on.dt.day_name()

print("‚úîÔ∏è Step 3 Done: Data cleaned and new features added.\n")

# ------------------- STEP 4: EDA & Insights ------------------- #
# is_delayed is a 0/1 flag, so its mean is the fraction of delayed shipments.
print("üìä Overall Delay Rate:")
print(f"{df['is_delayed'].mean() * 100:.2f}%\n")

# Per-group delay rate, highest (worst) first.
print("üìä Delay by Courier Partner:")
print(df.groupby("courier_partner")["is_delayed"].mean().sort_values(ascending=False), "\n")

print("üìä Delay by City:")
print(df.groupby("customer_city")["is_delayed"].mean().sort_values(ascending=False), "\n")

# Bucket distances into labelled ranges. pd.cut intervals are right-closed by
# default, so a distance of exactly 0 or above 2000 gets no bucket (NaN).
df["distance_bucket"] = pd.cut(df["distance_km"], bins=[0, 200, 500, 1000, 2000],
                               labels=["0-200", "200-500", "500-1000", "1000-2000"])
print("üìä Delay by Distance:")
# distance_bucket is categorical: recent pandas emits a FutureWarning when
# grouping on a categorical without an explicit `observed=`. Passing the
# current default keeps the printed result identical and silences the warning.
print(
    df.groupby("distance_bucket", observed=False)["is_delayed"]
    .mean()
    .sort_values(ascending=False),
    "\n",
)

print("‚úîÔ∏è Step 4 Done: Insights generated.\n")

# ------------------- STEP 5: ETA Prediction Model ------------------- #
print("‚öôÔ∏è Training Delivery Time Prediction Model...")

# Target first, then the predictors; any row missing one of these is dropped.
target_col = "actual_delivery_days"
feature_cols = [
    "distance_km",
    "package_weight_kg",
    "is_cod",
    "is_peak_sale",
    "courier_partner",
]
model_df = df[[target_col, *feature_cols]].dropna()

# One-hot encode the courier; the first level is dropped and acts as baseline.
model_df = pd.get_dummies(model_df, columns=["courier_partner"], drop_first=True)

y = model_df[target_col]
X = model_df.drop(columns=target_col)

# 80/20 hold-out split with a fixed seed so the run is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report error on the held-out 20%.
print("\nüìå Model Performance:")
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

# Add predictions back to df
# Encode the full dataset the same way the training frame was encoded (only
# courier_partner is one-hot encoded).
df_model = pd.get_dummies(
    df[[
        "distance_km",
        "package_weight_kg",
        "is_cod",
        "is_peak_sale",
        "courier_partner",
    ]],
    columns=["courier_partner"],
    drop_first=True,
)

# The training frame had NaN rows removed before encoding, so its dummy
# columns can differ from the ones produced here. Align to the exact columns
# (and order) the model was fitted on; a dummy column absent here is all-zero.
df_model = df_model.reindex(columns=X.columns, fill_value=0)

# NOTE(review): rows with missing feature values are passed to predict as-is;
# whether the estimator accepts NaN depends on the installed scikit-learn
# version - confirm, or filter those rows first.
df["predicted_delivery_days"] = model.predict(df_model)

# Export file for Power BI
df.to_csv("shipments_with_predictions.csv", index=False)

print("\n‚úîÔ∏è Step 5 Done: Final dataset saved as shipments_with_predictions.csv")


‚úîÔ∏è Step 3 Done: Data cleaned and new features added.

üìä Overall Delay Rate:
30.60%

üìä Delay by Courier Partner:
courier_partner
XpressBees    0.312402
Bluedart      0.306179
Ekart         0.305720
Delhivery     0.304317
Name: is_delayed, dtype: float64 

üìä Delay by City:
customer_city
Ahmedabad    0.312044
Chennai      0.308846
Hyderabad    0.306927
Bengaluru    0.306176
Kolkata      0.306016
Delhi        0.305906
Mumbai       0.302138
Pune         0.300192
Name: is_delayed, dtype: float64 

üìä Delay by Distance:
distance_bucket
1000-2000    0.351020
500-1000     0.336071
200-500      0.204553
0-200        0.157916
Name: is_delayed, dtype: float64 

‚úîÔ∏è Step 4 Done: Insights generated.

‚öôÔ∏è Training Delivery Time Prediction Model...


  print(df.groupby("distance_bucket")["is_delayed"].mean().sort_values(ascending=False), "\n")



üìå Model Performance:
MAE: 0.5649958902777777
R2 Score: 0.8707966302124818

‚úîÔ∏è Step 5 Done: Final dataset saved as shipments_with_predictions.csv
