In [3]:
# EDA, Preprocessing, Feature Engineering, Training, Export
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import joblib

# Resolve data directory robustly whether running from project root or notebooks/
ROOT = os.getcwd()
CANDIDATES = [
    os.path.join(ROOT, 'data'),
    os.path.abspath(os.path.join(ROOT, '..', 'data')),
    os.path.abspath(os.path.join(ROOT, '..'))  # project root (in case files are directly there)
]
DATA_DIR = None
for cand in CANDIDATES:
    if os.path.exists(os.path.join(cand, 'orders.csv')):
        DATA_DIR = cand
        break
# final fallback to parent data folder by name
if DATA_DIR is None and os.path.exists(os.path.join(CANDIDATES[1])):
    DATA_DIR = CANDIDATES[1]
if DATA_DIR is None:
    raise FileNotFoundError(f"Could not locate CSVs. Tried: {CANDIDATES}")

# Load
orders = pd.read_csv(os.path.join(DATA_DIR, 'orders.csv'))
perf = pd.read_csv(os.path.join(DATA_DIR, 'delivery_performance.csv'))
routes = pd.read_csv(os.path.join(DATA_DIR, 'routes_distance.csv'))
fleet = pd.read_csv(os.path.join(DATA_DIR, 'vehicle_fleet.csv'))
warehouse = pd.read_csv(os.path.join(DATA_DIR, 'warehouse_inventory.csv'))
feedback = pd.read_csv(os.path.join(DATA_DIR, 'customer_feedback.csv'))
costs = pd.read_csv(os.path.join(DATA_DIR, 'cost_breakdown.csv'))

# Dates
for df, cols in [
    (orders, ['order_date']),
    (perf, ['promised_delivery_time', 'actual_delivery_time'])
]:
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_datetime(df[c], errors='coerce')

# Merge on order_id where applicable
for df in (orders, perf, routes, costs, feedback):
    if 'order_id' in df.columns:
        df['order_id'] = df['order_id'].astype(str)
orders['origin_warehouse'] = orders['origin_warehouse'].astype(str)
warehouse['warehouse_id'] = warehouse['warehouse_id'].astype(str)

base = orders.merge(perf, on='order_id', how='left') \
             .merge(routes, on='order_id', how='left') \
             .merge(costs, on='order_id', how='left') \
             .merge(feedback[['order_id','rating']], on='order_id', how='left')
base = base.merge(warehouse, left_on='origin_warehouse', right_on='warehouse_id', how='left')

# Feature targets
base['delivery_delay_minutes'] = (base['actual_delivery_time'] - base['promised_delivery_time']).dt.total_seconds() / 60
base['delivery_delay_minutes'] = base['delivery_delay_minutes'].fillna(0)
base['is_delayed'] = (base['delivery_delay_minutes'] > 0).astype(int)

# Derived features
base['promised_margin_min'] = (base['promised_delivery_time'] - base['order_date']).dt.total_seconds() / 60
base['promised_margin_min'] = base['promised_margin_min'].fillna(base['promised_margin_min'].median())

# Priority score
priority_map = {'Express':3, 'Standard':2, 'Economy':1}
if 'priority' in base.columns:
    base['priority_score'] = base['priority'].map(priority_map).fillna(1)
else:
    base['priority_score'] = 1

# Traffic intensity score (normalized traffic_delay_min)
if 'traffic_delay_min' in base.columns:
    td = base['traffic_delay_min'].fillna(0)
    base['traffic_intensity_score'] = (td - td.min()) / (td.max() - td.min() + 1e-9)
else:
    base['traffic_intensity_score'] = 0

# Carrier reliability: historical on-time rate
if 'carrier' in base.columns:
    carrier_on_time = base.groupby('carrier')['is_delayed'].apply(lambda s: 1 - s.mean())
    base['carrier_reliability_score'] = base['carrier'].map(carrier_on_time).fillna(carrier_on_time.mean() if len(carrier_on_time)>0 else 0.5)
else:
    base['carrier_reliability_score'] = 0.5

# Route risk: combine weather + traffic
if 'weather_impact' in base.columns:
    weather_map = {'Low':0.0, 'Moderate':0.5, 'High':1.0}
    w = base['weather_impact'].map(weather_map).fillna(0.0)
else:
    w = 0.0
base['route_risk_score'] = base['traffic_intensity_score'] * 0.6 + (w if isinstance(w, pd.Series) else 0) * 0.4

# Warehouse utilization
if 'stock_level' in base.columns and 'reorder_level' in base.columns:
    base['warehouse_utilization'] = (base['stock_level'] / base['reorder_level']).replace([np.inf, -np.inf], np.nan).fillna(1.0)
else:
    base['warehouse_utilization'] = 1.0

# Expected travel time: distance / average speed by vehicle type (fallback speed)
speed_map = {'Truck':45.0, 'Van':55.0}
if 'distance_km' in base.columns:
    avg_speed = base['vehicle_type'].map(speed_map) if 'vehicle_type' in base.columns else 50.0
    if isinstance(avg_speed, pd.Series):
        avg_speed = avg_speed.fillna(50.0)
    base['expected_travel_time_min'] = (base['distance_km'] / (avg_speed + 1e-9)) * 60
else:
    base['expected_travel_time_min'] = 0

# Select features
numeric_features = [
    'distance_km','order_value','traffic_delay_min','promised_margin_min',
    'priority_score','carrier_reliability_score','route_risk_score',
    'warehouse_utilization','expected_travel_time_min','fuel_consumption_liters','tolls'
]
categorical_features = ['carrier','priority','weather_impact','product_category','destination_city']

# Ensure columns exist
for col in list(numeric_features):
    if col not in base.columns:
        base[col] = 0
for col in list(categorical_features):
    if col not in base.columns:
        base[col] = 'Unknown'

# Train/Test split (time-based if dates exist)
base = base.sort_values('order_date')
X = base[numeric_features + categorical_features]
y = base['is_delayed']

preprocess = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

clf = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', RandomForestClassifier(n_estimators=300, random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
clf.fit(X_train, y_train)
probs = clf.predict_proba(X_test)[:,1]
preds = (probs > 0.5).astype(int)

metrics = {
    'accuracy': float(accuracy_score(y_test, preds)),
    'precision': float(precision_score(y_test, preds, zero_division=0)),
    'recall': float(recall_score(y_test, preds, zero_division=0)),
    'f1': float(f1_score(y_test, preds, zero_division=0)),
    'roc_auc': float(roc_auc_score(y_test, probs)) if len(np.unique(y_test))>1 else None
}
print('Metrics:', metrics)

# Corrective actions
cost_of_delay = 200.0
cost_of_action = 50.0
risk = pd.Series(probs, index=X_test.index)
base.loc[X_test.index, 'delay_probability'] = risk

def recommend(p):
    if p > 0.8:
        action = 'Reroute or Reassign or Expedite'
    elif p > 0.5:
        action = 'Monitor closely'
    else:
        action = 'No action required'
    expected_saving = p * cost_of_delay - (cost_of_action if action != 'No action required' else 0)
    return action, expected_saving

rec = risk.apply(lambda p: recommend(p))
base.loc[X_test.index, 'recommended_action'] = rec.apply(lambda x: x[0])
base.loc[X_test.index, 'expected_saving'] = rec.apply(lambda x: x[1])

# Save artifacts
os.makedirs('models', exist_ok=True)
os.makedirs('data', exist_ok=True)
joblib.dump(clf, os.path.join('models','model.pkl'))
base.to_csv(os.path.join('data','final_dataset.csv'), index=False)
print('Saved model to models/model.pkl and dataset to data/final_dataset.csv')



Metrics: {'accuracy': 0.452991452991453, 'precision': 0.75, 'recall': 0.23684210526315788, 'f1': 0.36, 'roc_auc': 0.56008303915276}
Saved model to models/model.pkl and dataset to data/final_dataset.csv
