In [3]:
# main.py
"""
Full data processing + model training pipeline for Predictive Delivery Optimizer.

Outputs:
 - processed_data.csv       # cleaned & feature-engineered dataset
 - delivery_delay_model.pkl # trained classifier
 - encoders.pkl             # dict of label encoders used for categorical columns
"""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import warnings
warnings.filterwarnings("ignore")

# -------------------------
# 1) CONFIG / FILE PATHS
# -------------------------
DATA_FILES = {
    "delivery": "delivery_performance.csv",
    "orders": "orders.csv",
    "routes": "routes_distance.csv",
    "vehicles": "vehicle_fleet.csv",  # optional merge if Vehicle_ID present
}

OUTPUT_PROCESSED = "processed_data.csv"
OUTPUT_MODEL = "delivery_delay_model.pkl"
OUTPUT_ENCODERS = "encoders.pkl"

# -------------------------
# 2) LOAD DATAFRAMES
# -------------------------
def load_csv_safe(path):
    if os.path.exists(path):
        return pd.read_csv(path)
    else:
        raise FileNotFoundError(f"Required file not found: {path}")

print("Loading CSVs...")
df_delivery = load_csv_safe(DATA_FILES["delivery"])
df_orders = load_csv_safe(DATA_FILES["orders"])
df_routes = load_csv_safe(DATA_FILES["routes"])
# vehicles file optional (may not join to orders directly)
df_vehicles = pd.read_csv(DATA_FILES["vehicles"]) if os.path.exists(DATA_FILES["vehicles"]) else None

# -------------------------
# 3) MERGE
# -------------------------
# Primary merge: delivery + orders + routes on Order_ID
print("Merging tables on Order_ID...")
merged = df_delivery.merge(df_orders, on="Order_ID", how="outer") \
                    .merge(df_routes, on="Order_ID", how="outer")

# If vehicle mapping exists in orders (e.g., Vehicle_ID) and vehicles df loaded, merge it.
if df_vehicles is not None and "Vehicle_ID" in merged.columns and "Vehicle_ID" in df_vehicles.columns:
    merged = merged.merge(df_vehicles, on="Vehicle_ID", how="left")

# -------------------------
# 4) BASIC CLEANING
# -------------------------
print("Cleaning and filling missing values...")
# Standardize column names (strip spaces)
merged.columns = [c.strip() for c in merged.columns]

# Fill important numeric columns with sensible defaults
numeric_defaults = {
    "Customer_Rating": 0,
    "Delivery_Cost_INR": 0.0,
    "Distance_KM": 0.0,
    "Fuel_Consumption_L": 0.0,
    "Traffic_Delay_Minutes": 0.0,
    "Toll_Charges_INR": 0.0,
    "Order_Value_INR": 0.0,
    "Promised_Delivery_Days": np.nan,   # leave NaN for careful handling
    "Actual_Delivery_Days": np.nan
}
for col, val in numeric_defaults.items():
    if col in merged.columns:
        merged[col] = merged[col].fillna(val)

# Fill categorical defaults
cat_defaults = {
    "Delivery_Status": "Unknown",
    "Quality_Issue": "None",
    "Priority": "Standard",
    "Customer_Segment": "Unknown",
    "Product_Category": "Unknown",
    "Carrier": "Unknown",
    "Route": "Unknown"
}
for col, val in cat_defaults.items():
    if col in merged.columns:
        merged[col] = merged[col].fillna(val)

# Drop duplicates
merged.drop_duplicates(subset=["Order_ID"], inplace=True)

# -------------------------
# 5) DERIVED FEATURES
# -------------------------
print("Creating derived features...")

# Delay_Days: calculated from integer day counts (Actual - Promised)
if "Actual_Delivery_Days" in merged.columns and "Promised_Delivery_Days" in merged.columns:
    merged["Delay_Days"] = merged["Actual_Delivery_Days"] - merged["Promised_Delivery_Days"]
else:
    # fallback: if those columns don't exist, set Delay_Days to 0
    merged["Delay_Days"] = 0

# Binary target: Delayed (1) if Delay_Days > 0 else 0
merged["Delayed"] = (merged["Delay_Days"] > 0).astype(int)

# Distance, traffic and weather derived features - ensure columns exist
if "Distance_KM" not in merged.columns and "Distance" in merged.columns:
    merged["Distance_KM"] = merged["Distance"]

# Fuel cost per km (use tolls as proxy if no fuel cost)
def safe_div(a, b):
    try:
        return a / b if b and not np.isnan(b) and b != 0 else 0.0
    except Exception:
        return 0.0

merged["Fuel_Cost_per_KM"] = merged.apply(
    lambda r: safe_div(r.get("Toll_Charges_INR", 0.0), r.get("Distance_KM", 0.0)), axis=1
)

# Delivery efficiency: Distance per actual delivery day (avoid divide by zero)
merged["Delivery_Efficiency"] = merged.apply(
    lambda r: safe_div(r.get("Distance_KM", 0.0), r.get("Actual_Delivery_Days", 1.0)), axis=1
)

# Revenue per KM (Order value divided by distance)
merged["Revenue_per_KM"] = merged.apply(
    lambda r: safe_div(r.get("Order_Value_INR", 0.0), r.get("Distance_KM", 0.0)), axis=1
)

# Cost efficiency score (higher better)
merged["Cost_Efficiency_Score"] = merged.apply(
    lambda r: round((r["Revenue_per_KM"] / (r.get("Delivery_Cost_INR", 0.0) + 1)) * 100, 2), axis=1
)

# Simple satisfaction index: rating penalized by delay
merged["Satisfaction_Index"] = merged.get("Customer_Rating", 0) * (1 - (merged["Delay_Days"].fillna(0) / 10))
merged["Satisfaction_Index"] = merged["Satisfaction_Index"].clip(lower=0)

# -------------------------
# 6) FEATURE SELECTION for MODEL
# -------------------------
# Choose a sensible set of features that likely appear in your files
candidate_features = [
    "Distance_KM", "Traffic_Delay_Minutes", "Weather_Impact",
    "Order_Value_INR", "Delivery_Cost_INR",
    "Priority", "Carrier", "Customer_Segment", "Product_Category",
    "Fuel_Cost_per_KM", "Delivery_Efficiency", "Revenue_per_KM"
]

# Keep only features that present in merged
features = [c for c in candidate_features if c in merged.columns]
print("Using features:", features)

# Drop rows where target is missing (shouldn't happen) and where Distance_KM is nan
model_df = merged.dropna(subset=["Delayed"])
model_df[features] = model_df[features].fillna(0)

# -------------------------
# 7) ENCODE CATEGORICALS
# -------------------------
print("Encoding categorical variables...")
encoders = {}
X = model_df[features].copy()

for col in X.select_dtypes(include=["object", "category"]).columns:
    le = LabelEncoder()
    X[col] = X[col].astype(str).fillna("Unknown")
    X[col] = le.fit_transform(X[col])
    encoders[col] = le

y = model_df["Delayed"].astype(int)

# -------------------------
# 8) SCALE NUMERICS (optional)
# -------------------------
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# -------------------------
# 9) TRAIN/TEST SPLIT + MODEL
# -------------------------
print("Training RandomForestClassifier...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

# -------------------------
# 10) EVALUATION
# -------------------------
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {acc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Save a simple feature importance plot
if hasattr(clf, "feature_importances_"):
    importances = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=True)
    plt.figure(figsize=(8, max(4, len(importances) * 0.4)))
    importances.plot(kind="barh")
    plt.title("Feature Importances")
    plt.tight_layout()
    plt.savefig("feature_importances.png")
    plt.close()
    print("Feature importance plot saved to feature_importances.png")

# -------------------------
# 11) SAVE ARTIFACTS
# -------------------------
print("Saving processed data and model artifacts...")
model_artifacts = {
    "model": clf,
    "scaler": scaler,
    "feature_columns": X.columns.tolist()
}
joblib.dump(model_artifacts, OUTPUT_MODEL)  # single file with model+scaler+cols
joblib.dump(encoders, OUTPUT_ENCODERS)
model_df.to_csv(OUTPUT_PROCESSED, index=False)

print(f"Saved processed dataset -> {OUTPUT_PROCESSED}")
print(f"Saved model+scaler+meta -> {OUTPUT_MODEL}")
print(f"Saved encoders -> {OUTPUT_ENCODERS}")
print("All done.")




Loading CSVs...
Merging tables on Order_ID...
Cleaning and filling missing values...
Creating derived features...
Using features: ['Distance_KM', 'Traffic_Delay_Minutes', 'Weather_Impact', 'Order_Value_INR', 'Delivery_Cost_INR', 'Priority', 'Carrier', 'Customer_Segment', 'Product_Category', 'Fuel_Cost_per_KM', 'Delivery_Efficiency', 'Revenue_per_KM']
Encoding categorical variables...
Training RandomForestClassifier...

Model Accuracy: 0.7750

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.88      0.84        26
           1       0.73      0.57      0.64        14

    accuracy                           0.78        40
   macro avg       0.76      0.73      0.74        40
weighted avg       0.77      0.78      0.77        40


Confusion Matrix:
[[23  3]
 [ 6  8]]
Feature importance plot saved to feature_importances.png
Saving processed data and model artifacts...
Saved processed dataset -> processed_data.csv
Saved model+scale