# 🍲 Local Food Wastage Analytics — EDA, Hypothesis Tests & ML (Colab)
**Datasets:** `providers_data.csv`, `receivers_data.csv`, `food_listings_data.csv`, `claims_data.csv`  
**Goal:** Do end‑to‑end EDA, clean & prepare data, test 3 hypotheses, and build **classification models** to predict claim success — with clear, reproducible charts.

> Tip: Upload all 4 CSVs into Colab's `/content` then run the notebook top‑to‑bottom.

## 0. Setup (Install & Imports)

In [None]:
# Detect Colab and install extras
# The import is used as the environment probe: success sets IN_COLAB,
# any failure clears it.
try:
    import google.colab  # type: ignore
    IN_COLAB = True
except Exception:
    IN_COLAB = False

# `!pip` is IPython shell-escape syntax, so this cell must run in a notebook
# kernel; the extras are installed only when the Colab probe succeeded.
if IN_COLAB:
    !pip -q install xgboost shap imbalanced-learn

import os, re, warnings, itertools
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt  # (no seaborn per project constraints)

from scipy import stats
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             confusion_matrix, RocCurveDisplay, ConfusionMatrixDisplay, classification_report)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

# Optional libs
# Each optional import records its outcome in a *_AVAILABLE flag so later
# cells can branch on the flag instead of failing on a missing package.
try:
    import shap
    SHAP_AVAILABLE = True
except Exception:
    SHAP_AVAILABLE = False

try:
    from xgboost import XGBClassifier
    XGB_AVAILABLE = True
except Exception:
    XGB_AVAILABLE = False

# One seed, passed as random_state throughout the notebook and used to seed
# NumPy's global RNG.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


## 1. Load the 4 datasets

In [None]:
# ===== Option A: Upload manually =====
# Only offered when the Colab probe succeeded; the upload result is discarded.
if IN_COLAB:
    from google.colab import files
    print("👉 Upload the 4 CSVs: providers_data.csv, receivers_data.csv, food_listings_data.csv, claims_data.csv")
    _ = files.upload()

# ===== Option B: Use path if files already present =====
DATA_DIR = "."  # change to "/content" if needed

# Resolve the four expected CSV locations relative to DATA_DIR.
prov_path, recv_path, food_path, claim_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in (
        "providers_data.csv",
        "receivers_data.csv",
        "food_listings_data.csv",
        "claims_data.csv",
    )
)

# Warn (without stopping) about every CSV that is not on disk yet.
missing = [
    candidate
    for candidate in (prov_path, recv_path, food_path, claim_path)
    if not os.path.exists(candidate)
]
if missing:
    print("⚠️ Missing files:", missing, "\nUpload them or fix DATA_DIR before proceeding.")


### 1.1 Robust CSV Reader + Column Normalization

In [None]:
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose column labels are normalized.

    Each label is converted to ``str``, stripped of surrounding whitespace,
    lower-cased, has spaces replaced by underscores, and finally has every
    character other than ASCII letters, digits and ``_`` removed.

    Non-string labels (e.g. integer headers) are supported because the label
    is coerced with ``str()`` before any string method is applied.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A copy of ``df`` with the normalized column labels.
    """
    out = df.copy()
    out.columns = [
        re.sub(r'[^0-9a-zA-Z_]', '', str(c).strip().lower().replace(' ', '_'))
        for c in out.columns
    ]
    return out

# Expected canonical columns after normalization:
# providers: provider_id, name, type, city, contact, address?, provider_type?
# receivers: receiver_id, name, type, city, contact, address?, receiver_type?
# food_listings: food_id, food_name, quantity, expiry_date, provider_id, provider_type?, location, food_type, meal_type
# claims: claim_id, food_id, receiver_id, status, timestamp

def read_providers_csv(path):
    """Load the providers CSV at *path* into the canonical provider schema.

    Column labels are normalized, the squashed id spellings are renamed to
    ``provider_id``, a missing ``address`` is created as NaN and a missing
    ``provider_type`` is copied from ``type``.

    Raises:
        ValueError: naming the first required column that is absent.
    """
    table = normalize_columns(pd.read_csv(path))
    table = table.rename(columns={'providerid':'provider_id', 'providerid_':'provider_id'})

    # Report the first required column that is not present, in declaration order.
    absent = [c for c in ('provider_id','name','type','city','contact') if c not in table.columns]
    if absent:
        raise ValueError(f"[providers] missing column: {absent[0]}")

    # Optional columns are synthesized so the returned schema is always the same.
    if 'address' not in table.columns:
        table['address'] = np.nan
    if 'provider_type' not in table.columns:
        table['provider_type'] = table['type']

    wanted = ['provider_id','name','type','city','contact','address','provider_type']
    return table[wanted]

def read_receivers_csv(path):
    """Load the receivers CSV at *path* into the canonical receiver schema.

    Column labels are normalized, the squashed id spellings are renamed to
    ``receiver_id``, a missing ``address`` is created as NaN and a missing
    ``receiver_type`` is copied from ``type``.

    Raises:
        ValueError: naming the first required column that is absent.
    """
    table = normalize_columns(pd.read_csv(path))
    table = table.rename(columns={'receiverid':'receiver_id', 'receiverid_':'receiver_id'})

    # Report the first required column that is not present, in declaration order.
    absent = [c for c in ('receiver_id','name','type','city','contact') if c not in table.columns]
    if absent:
        raise ValueError(f"[receivers] missing column: {absent[0]}")

    # Optional columns are synthesized so the returned schema is always the same.
    if 'address' not in table.columns:
        table['address'] = np.nan
    if 'receiver_type' not in table.columns:
        table['receiver_type'] = table['type']

    wanted = ['receiver_id','name','type','city','contact','address','receiver_type']
    return table[wanted]

def read_food_listings_csv(path):
    """Load the food-listings CSV at *path* into the canonical listings schema.

    Column labels are normalized with ``normalize_columns`` and the squashed
    spellings ``foodid``/``foodid_``/``providerid`` are renamed to their
    underscore forms. A missing optional ``provider_type`` column is created
    as NaN, and ``expiry_date`` is parsed to datetime with unparseable values
    coerced to NaT.

    Raises:
        ValueError: if a required column is absent after normalization.
    """
    df = normalize_columns(pd.read_csv(path))
    df.rename(columns={'foodid':'food_id', 'foodid_':'food_id', 'providerid':'provider_id'}, inplace=True)
    required = ['food_id','food_name','quantity','expiry_date','provider_id','location','food_type','meal_type']
    for col in required:
        if col not in df.columns:
            raise ValueError(f"[food_listings] missing column: {col}")
    if 'provider_type' not in df.columns:
        df['provider_type'] = np.nan
    # `infer_datetime_format` is deprecated since pandas 2.0 (format inference
    # is the default behaviour there), so the keyword is no longer passed.
    df['expiry_date'] = pd.to_datetime(df['expiry_date'], errors='coerce')
    return df[['food_id','food_name','quantity','expiry_date','provider_id','provider_type','location','food_type','meal_type']]

def read_claims_csv(path):
    """Load the claims CSV at *path* into the canonical claims schema.

    Column labels are normalized with ``normalize_columns`` and the squashed
    spellings ``claimid``/``foodid``/``receiverid`` are renamed to their
    underscore forms. ``timestamp`` is parsed to datetime with unparseable
    values coerced to NaT.

    Raises:
        ValueError: if a required column is absent after normalization.
    """
    df = normalize_columns(pd.read_csv(path))
    df.rename(columns={'claimid':'claim_id','foodid':'food_id','receiverid':'receiver_id'}, inplace=True)
    required = ['claim_id','food_id','receiver_id','status','timestamp']
    for col in required:
        if col not in df.columns:
            raise ValueError(f"[claims] missing column: {col}")
    # `infer_datetime_format` is deprecated since pandas 2.0 (format inference
    # is the default behaviour there), so the keyword is no longer passed.
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    return df[['claim_id','food_id','receiver_id','status','timestamp']]

# Read the four source tables through their schema-checking readers.
providers = read_providers_csv(prov_path)
receivers = read_receivers_csv(recv_path)
food = read_food_listings_csv(food_path)
claims = read_claims_csv(claim_path)

# One-line size report for all four tables.
print(
    f"Shapes -> providers: {providers.shape} | receivers: {receivers.shape} "
    f"| food: {food.shape} | claims: {claims.shape}"
)


## 2. Know Your Data — First Look

In [None]:
# Tables in the order they are previewed and reported.
_tables = {"providers": providers, "receivers": receivers, "food": food, "claims": claims}

# Preview the first rows of every table.
for _frame in _tables.values():
    display(_frame.head())

# DataFrame.info() writes its report itself and returns None, so it is called
# directly rather than wrapped in print() (which would emit a stray "None").
for _label, _frame in _tables.items():
    print(f"\nInfo: {_label}")
    _frame.info()

print("\nRows & Cols:", {_label: _frame.shape for _label, _frame in _tables.items()})


### 2.1 Duplicates & Missing Values

In [None]:
# Duplicates: count fully identical rows in each table.
for _label, _table in (
    ("providers", providers),
    ("receivers", receivers),
    ("food", food),
    ("claims", claims),
):
    print(f"Duplicate rows -> {_label}:", _table.duplicated().sum())

# Missing summaries
def na_summary(df, name):
    """Print and display the per-column missing-value report for *df*.

    Only columns with at least one missing value are listed, sorted by the
    missing percentage in descending order. When nothing is missing, a
    one-row note frame is displayed instead.
    """
    na_counts = df.isna().sum()
    report = pd.DataFrame({
        "column": na_counts.index,
        "na_count": na_counts.values,
        "na_pct": ((na_counts / len(df)) * 100).values,
    })
    report = report[report.na_count > 0].sort_values("na_pct", ascending=False)

    print(f"\n{name} — Missing summary:")
    if len(report):
        display(report)
    else:
        display(pd.DataFrame({"note": [f"No missing in {name}"]}))

# Run the missing-value report for each table in turn.
_named_tables = {"providers": providers, "receivers": receivers, "food": food, "claims": claims}
for n, d in _named_tables.items():
    na_summary(d, n)

# Simple missing "matrix"
def plot_missing_matrix(df, title):
    """Draw the boolean ``df.isna()`` mask as an image (rows x columns).

    The x axis is labelled with the column names, rotated for readability.
    """
    column_labels = df.columns
    tick_positions = range(len(column_labels))

    plt.figure()
    plt.imshow(df.isna(), aspect='auto', interpolation='nearest')
    plt.xticks(tick_positions, column_labels, rotation=90, fontsize=8)
    plt.xlabel("Columns")
    plt.ylabel("Rows")
    plt.title(title)
    plt.tight_layout()
    plt.show()

# One missing-value image per table.
for _table, _caption in (
    (providers, "Missing Matrix — Providers"),
    (receivers, "Missing Matrix — Receivers"),
    (food, "Missing Matrix — Food"),
    (claims, "Missing Matrix — Claims"),
):
    plot_missing_matrix(_table, _caption)


## 3. Build Analysis Tables (Joins)

In [None]:
# Left joins keep every claim row even when the listing, provider or receiver
# lookup finds no match; clashing column names from the right-hand table get
# the given suffix.
cf = pd.merge(claims, food, how='left', on='food_id', suffixes=('_claim', '_food'))
cfp = pd.merge(cf, providers, how='left', on='provider_id', suffixes=('', '_prov'))
full = pd.merge(cfp, receivers, how='left', on='receiver_id', suffixes=('', '_recv'))

print("Merged table shape:", full.shape)
display(full.head(3))


## 4. EDA Visualizations (matplotlib-only)

In [None]:
def bar_count(series, title, xlabel="Category", ylabel="Count"):
    """Show a bar chart of value frequencies in *series* (NaN counted too)."""
    plt.figure()
    # Series.plot draws on the figure that was just made current.
    axes = series.value_counts(dropna=False).plot(kind='bar')
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    plt.tight_layout()
    plt.show()

# Counts
bar_count(food['food_type'], "Food Type Distribution")
bar_count(food['meal_type'], "Meal Type Distribution")
bar_count(food['location'], "Listings by Location")
bar_count(providers['provider_type'].fillna("Unknown"), "Provider Type Distribution")
bar_count(providers['city'], "Providers by City")

# Claims by status (case-folded so spelling variants share one bar)
bar_count(claims['status'].str.lower(), "Claims by Status")

# Quantity hist & box share the same NaN-free values
_quantities = food['quantity'].dropna()

plt.figure()
plt.hist(_quantities, bins=30)
plt.title("Quantity Distribution")
plt.xlabel("Quantity")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

plt.figure()
plt.boxplot(_quantities, vert=True)
plt.title("Quantity Boxplot (Outliers visible)")
plt.ylabel("Quantity")
plt.tight_layout()
plt.show()

# Simple numeric correlation (needs at least two numeric columns)
num_df = full.select_dtypes(include=[np.number]).copy()
if num_df.shape[1] <= 1:
    print("Not enough numeric columns for heatmap.")
else:
    corr = num_df.corr(numeric_only=True)
    _tick_positions = range(len(corr.columns))
    plt.figure()
    im = plt.imshow(corr, interpolation='nearest')
    plt.title("Correlation Heatmap (numeric)")
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.xticks(_tick_positions, corr.columns, rotation=90, fontsize=8)
    plt.yticks(_tick_positions, corr.columns, fontsize=8)
    plt.tight_layout()
    plt.show()


## 5. Hypothesis Testing

### H1: Average **Quantity** differs across **Food_Type** (One-way ANOVA)

In [None]:
# One array of non-missing quantities per food type.
groups = [qty.dropna().values for _, qty in food.groupby('food_type')['quantity']]

# The test is only run with at least two groups of at least two observations each.
_enough_data = len(groups) >= 2 and min(len(g) for g in groups) >= 2
if not _enough_data:
    print("Insufficient group sizes for ANOVA.")
else:
    F, p = stats.f_oneway(*groups)
    print(f"ANOVA F={F:.3f}, p-value={p:.6f}")
    print("Decision:", "Reject H0" if p<0.05 else "Fail to reject H0")


### H2: **Claim success** rate differs by **Meal_Type** (Chi-square)

In [None]:
def map_success(s):
    """Map a raw claim status to 1 (success), 0 (no success) or NaN (unknown).

    The status is compared after ``str()`` conversion, whitespace stripping
    and lower-casing, so spelling case and padding do not matter.
    """
    outcome_by_status = {
        **dict.fromkeys(('completed', 'claimed', 'approved', 'success', 'successful', 'closed'), 1),
        **dict.fromkeys(('pending', 'rejected', 'cancelled', 'canceled', 'failed', 'expired'), 0),
    }
    label = str(s).strip().lower()
    # Anything outside the two known vocabularies is left undefined.
    return outcome_by_status.get(label, np.nan)

# Attach the binary outcome to the claims table (reused by later cells).
claims['success_flag'] = claims['status'].apply(map_success)

# Contingency table: meal type (rows) x success flag (columns).
cf2 = pd.merge(claims, food[['food_id', 'meal_type']], on='food_id', how='left')
ct = pd.crosstab(cf2['meal_type'], cf2['success_flag'])
print(ct)

# Run the test only on a table of at least 2x2 with no empty cell.
_n_rows, _n_cols = ct.shape
_testable = _n_rows >= 2 and _n_cols >= 2 and (ct.values > 0).all()
if not _testable:
    print("Not enough data variation for chi-square test.")
else:
    chi2, p, dof, exp = stats.chi2_contingency(ct)
    print(f"Chi-square={chi2:.3f}, dof={dof}, p-value={p:.6f}")
    print("Decision:", "Reject H0" if p<0.05 else "Fail to reject H0")


### H3: **Hours remaining to expiry at claim time** differs by **Provider_Type** (Mann–Whitney U)

In [None]:
# Claims enriched with the listing's expiry date and provider information;
# rows lacking either timestamp are dropped before the difference is taken.
tmp = pd.merge(
    claims,
    food[['food_id', 'expiry_date', 'provider_id', 'provider_type']],
    on='food_id',
    how='left',
)
tmp = tmp.dropna(subset=['timestamp', 'expiry_date'])
tmp['hours_to_expiry_at_claim'] = (tmp['expiry_date'] - tmp['timestamp']).dt.total_seconds() / 3600.0

# Group A = provider type "organization" (case-insensitive); group B = everything
# else, including rows whose provider type is missing.
_provider_kind = tmp['provider_type'].fillna('x').str.lower()
gA = tmp.loc[_provider_kind == 'organization', 'hours_to_expiry_at_claim'].dropna()
gB = tmp.loc[_provider_kind != 'organization', 'hours_to_expiry_at_claim'].dropna()

if len(gA) < 10 or len(gB) < 10:
    print("Insufficient sample size for Mann–Whitney test.")
else:
    U, p = stats.mannwhitneyu(gA, gB, alternative='two-sided')
    print(f"Mann–Whitney U={U:.1f}, p-value={p:.6f}")
    print("Decision:", "Reject H0" if p<0.05 else "Fail to reject H0")


## 6. Feature Engineering & ML Dataset

In [None]:
# One row per (listing, claim) pair; listings without any claim get a NaN
# target. Reads claims['success_flag'], so the H2 cell must have run first.
full_ml = pd.merge(
    food,
    claims[['food_id', 'success_flag', 'timestamp']],
    on='food_id',
    how='left',
)

# Time features
_expiry_dow = full_ml['expiry_date'].dt.dayofweek
full_ml['expiry_dayofweek'] = _expiry_dow
full_ml['expiry_is_weekend'] = _expiry_dow.isin([5, 6]).astype(int)

_candidate_numeric = ['quantity', 'expiry_dayofweek', 'expiry_is_weekend']
_candidate_categorical = ['food_type', 'meal_type', 'provider_type', 'location']

# Clean categories
for col in _candidate_categorical:
    if col in full_ml.columns:
        full_ml[col] = full_ml[col].astype(str).str.strip().str.lower()

# Drop rows with no target
ml_df = full_ml[full_ml['success_flag'].notna()].copy()
ml_df['success_flag'] = ml_df['success_flag'].astype(int)

numeric_features = [c for c in _candidate_numeric if c in ml_df.columns]
categorical_features = [c for c in _candidate_categorical if c in ml_df.columns]

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)
print("Target counts:\n", ml_df['success_flag'].value_counts())

# Design matrix and target used by all models below.
y = ml_df['success_flag'].copy()
X = ml_df[numeric_features + categorical_features].copy()


### 6.1 Split + Preprocess + (Optional) SMOTE

In [None]:
# Stratified hold-out split so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

imbalance_ratio = y_train.mean()
print(f"Positive class ratio (train): {imbalance_ratio:.3f}")

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and 1.4 removed the old spelling; try the new name first and fall back for
# older releases so the notebook runs on both.
try:
    _dense_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    _dense_ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", _dense_ohe, categorical_features)
    ]
)

# Oversample only when the positive share of the training set is outside 35%-65%.
USE_SMOTE = (imbalance_ratio < 0.35 or imbalance_ratio > 0.65)
print("Using SMOTE:", USE_SMOTE)

# The transformer is fitted on the training split only and then applied to
# the test split.
X_train_arr = preprocess.fit_transform(X_train)
X_test_arr  = preprocess.transform(X_test)

if USE_SMOTE:
    # NOTE(review): SMOTE is applied to one-hot encoded columns here, so synthetic
    # rows can carry fractional category indicators - confirm this is acceptable.
    sm = SMOTE(random_state=RANDOM_STATE)
    X_train_bal, y_train_bal = sm.fit_resample(X_train_arr, y_train)
else:
    X_train_bal = X_train_arr
    y_train_bal = y_train.copy()


### 6.2 Evaluation Helpers

In [None]:
def evaluate_clf(y_true, y_pred, y_prob=None):
    """Return a dict of classification metrics for one set of predictions.

    Always contains ``accuracy``, ``precision``, ``recall`` and ``f1``
    (undefined ratios are reported as 0). ``roc_auc`` is added only when
    *y_prob* is given; it is NaN if the score cannot be computed.
    """
    out = {"accuracy": accuracy_score(y_true, y_pred)}
    for metric_name, scorer in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        out[metric_name] = scorer(y_true, y_pred, zero_division=0)

    if y_prob is None:
        return out

    try:
        out["roc_auc"] = roc_auc_score(y_true, y_prob)
    except Exception:
        # Best effort: keep the other metrics even if AUC is not computable.
        out["roc_auc"] = np.nan
    return out

def plot_conf_mat(y_true, y_pred, title="Confusion Matrix"):
    """Plot the confusion matrix of *y_pred* against *y_true*.

    The display is drawn on an explicitly created axes: calling
    ``ConfusionMatrixDisplay.plot()`` without ``ax`` makes it create its own
    figure, which would leave a separately opened ``plt.figure()`` empty.
    """
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots()
    ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax)
    ax.set_title(title)
    fig.tight_layout()
    plt.show()

def plot_roc_curve(y_true, y_prob, title="ROC Curve"):
    """Plot the ROC curve for positive-class scores *y_prob*.

    The curve is drawn on an explicitly created axes: calling
    ``RocCurveDisplay.from_predictions`` without ``ax`` makes it create its
    own figure, which would leave a separately opened ``plt.figure()`` empty.
    """
    fig, ax = plt.subplots()
    RocCurveDisplay.from_predictions(y_true, y_prob, ax=ax)
    ax.set_title(title)
    fig.tight_layout()
    plt.show()


## 7. Model 1 — Logistic Regression

In [None]:
# Class weighting is switched on only when SMOTE is not used.
logreg = LogisticRegression(
    max_iter=1000,
    class_weight=None if USE_SMOTE else 'balanced',
    random_state=RANDOM_STATE,
)

# Without SMOTE the model is wrapped in a pipeline fed with raw frames; with
# SMOTE it is fitted on the resampled arrays and scored on the transformed
# test array.
if not USE_SMOTE:
    pipe_lr = Pipeline([("prep", preprocess), ("clf", logreg)])
    pipe_lr.fit(X_train, y_train)
    _lr_scorer, _lr_inputs = pipe_lr, X_test
else:
    logreg.fit(X_train_bal, y_train_bal)
    _lr_scorer, _lr_inputs = logreg, X_test_arr

y_pred_lr = _lr_scorer.predict(_lr_inputs)
try:
    y_prob_lr = _lr_scorer.predict_proba(_lr_inputs)[:, 1]
except Exception:
    y_prob_lr = None

metrics_lr = evaluate_clf(y_test, y_pred_lr, y_prob_lr)
print("Logistic Regression metrics:", metrics_lr)
print("\nClassification report (LR):\n", classification_report(y_test, y_pred_lr, zero_division=0))
plot_conf_mat(y_test, y_pred_lr, "LogReg — Confusion Matrix")
if y_prob_lr is not None:
    plot_roc_curve(y_test, y_prob_lr, "LogReg — ROC Curve")


## 8. Model 2 — Random Forest (+ GridSearch)

In [None]:
# Class weighting is switched on only when SMOTE is not used.
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    class_weight=None if USE_SMOTE else 'balanced',
)

# Without SMOTE the model is wrapped in a pipeline fed with raw frames; with
# SMOTE it is fitted on the resampled arrays and scored on the transformed
# test array.
if not USE_SMOTE:
    pipe_rf = Pipeline([("prep", preprocess), ("clf", rf)])
    pipe_rf.fit(X_train, y_train)
    _rf_scorer, _rf_inputs = pipe_rf, X_test
else:
    rf.fit(X_train_bal, y_train_bal)
    _rf_scorer, _rf_inputs = rf, X_test_arr

y_pred_rf = _rf_scorer.predict(_rf_inputs)
try:
    y_prob_rf = _rf_scorer.predict_proba(_rf_inputs)[:, 1]
except Exception:
    y_prob_rf = None

metrics_rf = evaluate_clf(y_test, y_pred_rf, y_prob_rf)
print("Random Forest metrics:", metrics_rf)
print("\nClassification report (RF):\n", classification_report(y_test, y_pred_rf, zero_division=0))
plot_conf_mat(y_test, y_pred_rf, "Random Forest — Confusion Matrix")
if y_prob_rf is not None:
    plot_roc_curve(y_test, y_prob_rf, "Random Forest — ROC Curve")

# Small grid (only when pipeline is usable)
if USE_SMOTE:
    print("Skipping RF GridSearch on SMOTE branch (pipeline-less).")
else:
    param_grid = {
        "clf__n_estimators": [200, 400],
        "clf__max_depth": [None, 10, 20]
    }
    _rf_search_pipe = Pipeline([
        ("prep", preprocess),
        ("clf", RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)),
    ])
    grid_rf = GridSearchCV(_rf_search_pipe, param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=0)
    grid_rf.fit(X_train, y_train)
    print("RF GridSearch best params:", grid_rf.best_params_)

    y_pred_rf_g = grid_rf.predict(X_test)
    try:
        y_prob_rf_g = grid_rf.predict_proba(X_test)[:, 1]
    except Exception:
        y_prob_rf_g = None
    metrics_rf_grid = evaluate_clf(y_test, y_pred_rf_g, y_prob_rf_g)
    print("Random Forest (Grid) metrics:", metrics_rf_grid)


## 9. Model 3 — XGBoost (+ GridSearch)

In [None]:
if not XGB_AVAILABLE:
    print("XGBoost not installed. Install xgboost to enable this model.")
else:
    xgb = XGBClassifier(
        n_estimators=400,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=RANDOM_STATE,
        reg_lambda=1.0,
        n_jobs=-1,
        eval_metric='logloss',
    )

    # Without SMOTE the model is wrapped in a pipeline fed with raw frames;
    # with SMOTE it is fitted on the resampled arrays and scored on the
    # transformed test array.
    if not USE_SMOTE:
        pipe_xgb = Pipeline([("prep", preprocess), ("clf", xgb)])
        pipe_xgb.fit(X_train, y_train)
        _xgb_scorer, _xgb_inputs = pipe_xgb, X_test
    else:
        xgb.fit(X_train_bal, y_train_bal)
        _xgb_scorer, _xgb_inputs = xgb, X_test_arr

    y_pred_xgb = _xgb_scorer.predict(_xgb_inputs)
    try:
        y_prob_xgb = _xgb_scorer.predict_proba(_xgb_inputs)[:, 1]
    except Exception:
        y_prob_xgb = None

    metrics_xgb = evaluate_clf(y_test, y_pred_xgb, y_prob_xgb)
    print("XGBoost metrics:", metrics_xgb)
    print("\nClassification report (XGB):\n", classification_report(y_test, y_pred_xgb, zero_division=0))
    plot_conf_mat(y_test, y_pred_xgb, "XGBoost — Confusion Matrix")
    if y_prob_xgb is not None:
        plot_roc_curve(y_test, y_prob_xgb, "XGBoost — ROC Curve")

    # The hyper-parameter search needs the pipeline form, so it is skipped
    # on the SMOTE branch.
    if USE_SMOTE:
        print("Skipping XGB GridSearch on SMOTE branch.")
    else:
        param_grid = {
            "clf__n_estimators": [300, 500],
            "clf__max_depth": [4, 6],
            "clf__learning_rate": [0.05, 0.1]
        }
        _xgb_search_pipe = Pipeline([
            ("prep", preprocess),
            ("clf", XGBClassifier(subsample=0.9, colsample_bytree=0.9,
                                  random_state=RANDOM_STATE, eval_metric='logloss')),
        ])
        grid_xgb = GridSearchCV(_xgb_search_pipe, param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=0)
        grid_xgb.fit(X_train, y_train)
        print("XGB GridSearch best params:", grid_xgb.best_params_)

        y_pred_xgb_g = grid_xgb.predict(X_test)
        try:
            y_prob_xgb_g = grid_xgb.predict_proba(X_test)[:, 1]
        except Exception:
            y_prob_xgb_g = None
        metrics_xgb_grid = evaluate_clf(y_test, y_pred_xgb_g, y_prob_xgb_g)
        print("XGBoost (Grid) metrics:", metrics_xgb_grid)


## 10. Model Comparison & Selection

In [None]:
# Collect available metric dicts
# Collect whichever metric dicts earlier cells have defined, base models
# first and grid-searched variants after them.
_metric_vars = [
    ("Logistic Regression", "metrics_lr"),
    ("Random Forest", "metrics_rf"),
    ("XGBoost", "metrics_xgb"),
    ("Random Forest (Grid)", "metrics_rf_grid"),
    ("XGBoost (Grid)", "metrics_xgb_grid"),
]
rows = [
    (label, globals()[var_name])
    for label, var_name in _metric_vars
    if var_name in globals()
]

compare_df = pd.DataFrame([{"model": label, **scores} for label, scores in rows]).fillna(np.nan)
display(compare_df)

# Simple metric bar chart (F1)
_has_f1 = (not compare_df.empty) and ('f1' in compare_df.columns)
if _has_f1:
    plt.figure()
    plt.bar(compare_df['model'], compare_df['f1'])
    plt.title("Model F1 Score Comparison")
    plt.ylabel("F1")
    plt.xticks(rotation=30, ha='right')
    plt.tight_layout()
    plt.show()

# Pick best by F1 (fallback to accuracy)
if compare_df.empty:
    BEST_MODEL_NAME = None
    print("No models trained.")
else:
    metric_to_use = 'f1' if 'f1' in compare_df.columns else 'accuracy'
    best_row = compare_df.loc[compare_df[metric_to_use].idxmax()]
    BEST_MODEL_NAME = best_row['model']
    print(f"Selected best model by {metric_to_use}: {BEST_MODEL_NAME}")


## 11. Save a Deployable Pipeline (.pkl)

In [None]:
import pickle

# For deployment, retrain a pipeline (without SMOTE) on train data for the chosen algorithm.
def make_pipeline_for(name: str):
    """Build an unfitted preprocessing + classifier pipeline for *name*.

    The algorithm is chosen by the prefix of *name* ("Logistic",
    "Random Forest" or "XGBoost"). Returns None for an unrecognised name, or
    for XGBoost when the package is not available.
    """
    def _build_xgb():
        # XGBClassifier is only defined when the optional import succeeded.
        if not XGB_AVAILABLE:
            return None
        return XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.1, subsample=0.9,
                             colsample_bytree=0.9, random_state=RANDOM_STATE, reg_lambda=1.0,
                             n_jobs=-1, eval_metric='logloss')

    builders = (
        ("Logistic", lambda: LogisticRegression(max_iter=1000, class_weight='balanced',
                                                random_state=RANDOM_STATE)),
        ("Random Forest", lambda: RandomForestClassifier(n_estimators=400, max_depth=None,
                                                         random_state=RANDOM_STATE, n_jobs=-1,
                                                         class_weight='balanced')),
        ("XGBoost", _build_xgb),
    )
    for prefix, build in builders:
        if name.startswith(prefix):
            clf = build()
            if clf is None:
                return None
            return Pipeline([("prep", preprocess), ("clf", clf)])
    return None

# When a grid-searched variant won the comparison, deploy that tuned
# configuration; make_pipeline_for() only knows the hard-coded defaults, so
# using it here would export a different model than the one selected.
_grid_var_by_model = {"Random Forest (Grid)": "grid_rf", "XGBoost (Grid)": "grid_xgb"}
_winning_grid = globals().get(_grid_var_by_model.get(BEST_MODEL_NAME, ""))

if _winning_grid is not None and hasattr(_winning_grid, "best_estimator_"):
    # best_estimator_ is a Pipeline with the same "prep"/"clf" step names.
    deploy_pipe = _winning_grid.best_estimator_
else:
    deploy_pipe = make_pipeline_for(BEST_MODEL_NAME or "Random Forest")

if deploy_pipe is not None:
    deploy_pipe.fit(X_train, y_train)
    y_pred_dep = deploy_pipe.predict(X_test)
    try: y_prob_dep = deploy_pipe.predict_proba(X_test)[:,1]
    except Exception: y_prob_dep = None
    print("Deployable pipeline metrics:", evaluate_clf(y_test, y_pred_dep, y_prob_dep))
    # The pickle stores preprocessing and classifier together, so it accepts raw
    # feature frames at prediction time.
    with open("best_model_pipeline.pkl", "wb") as f:
        pickle.dump(deploy_pipe, f)
    print("Saved: best_model_pipeline.pkl")
else:
    print("Could not create deployable pipeline for:", BEST_MODEL_NAME)


## 12. Feature Importance / Explainability

In [None]:
# deploy_pipe may be undefined (cell not run) or None (no deployable pipeline
# could be built), so resolve it defensively before touching named_steps.
_pipe = globals().get('deploy_pipe')
_final_clf = _pipe.named_steps['clf'] if _pipe is not None else None

if _final_clf is not None and hasattr(_final_clf, 'feature_importances_'):
    # Extract feature names from ColumnTransformer: numeric names are passed
    # through unchanged, categorical names are expanded by the one-hot encoder.
    _prep = _pipe.named_steps['prep']
    ohe = _prep.named_transformers_['cat']
    num_names = _prep.transformers_[0][2]
    cat_names = list(ohe.get_feature_names_out(_prep.transformers_[1][2]))
    feat_names = list(num_names) + cat_names
    importances = _final_clf.feature_importances_

    # Plot top-20
    idx = np.argsort(importances)[::-1][:20]
    plt.figure()
    plt.bar(range(len(idx)), importances[idx])
    plt.xticks(range(len(idx)), [feat_names[i] for i in idx], rotation=90, fontsize=8)
    plt.title("Top 20 Feature Importances (Tree-based)")
    plt.tight_layout()
    plt.show()
elif SHAP_AVAILABLE and _pipe is not None:
    try:
        # Use a small sample for SHAP to keep it fast
        sample_X = X_test.sample(min(400, len(X_test)), random_state=RANDOM_STATE)
        sample_trans = _pipe.named_steps['prep'].transform(sample_X)
        model = _final_clf
        explainer = shap.Explainer(model, sample_trans)
        shap_values = explainer(sample_trans)
        # summary_plot uses matplotlib by default
        shap.summary_plot(shap_values, sample_trans, show=True)
    except Exception as e:
        # Explainability is best effort; report the reason and carry on.
        print("SHAP explainability skipped:", e)
else:
    print("No tree-based importances and SHAP not available.")


## 13. (Optional) Simple Unsupervised Segmentation (KMeans on Providers)

In [None]:
from sklearn.cluster import KMeans

# One-hot encode city and (lower-cased) provider type as clustering features.
prov_feat = pd.get_dummies(
    pd.DataFrame({
        'city': providers['city'],
        'provider_type': providers['provider_type'].astype(str).str.lower(),
    }),
    columns=['city', 'provider_type'],
    drop_first=False,
)

# Keep it robust: 3 clusters if enough rows, else skip
if len(prov_feat) < 6:
    print("Not enough provider rows for clustering.")
else:
    km = KMeans(n_clusters=3, n_init=10, random_state=RANDOM_STATE)
    labels = km.fit_predict(prov_feat)
    # assign() returns a new frame, leaving `providers` itself untouched.
    providers_seg = providers.assign(segment=labels)
    display(providers_seg.head())
    bar_count(pd.Series(labels), "Provider Segments (KMeans)")


## 14. Conclusion (What to report)

**Summary:**  
- Completed comprehensive EDA, summarized and visualized missing values, inspected duplicates/outliers.  
- Validated 3 hypotheses (ANOVA/Chi‑square/Mann–Whitney).  
- Engineered features (time‑based, categorical cleanup), built multiple models (LR, RF, XGB).  
- Compared metrics (F1/ROC‑AUC), selected a best model, and exported a deployable pipeline (`best_model_pipeline.pkl`).  
- Included feature importance/SHAP and a small unsupervised segmentation.

**Notes for the report:**  
- Emphasize class imbalance handling (SMOTE vs. class weights).  
- Interpret top features from the final model.  
- Show confusion matrix and tie Precision/Recall to business trade‑offs (wastage vs. stockouts).  
- State actionable insights from EDA (e.g., cities or meal types with low claim rates).