<a href="https://colab.research.google.com/github/Iamjohnko/Data-science-Project-Portfolio/blob/main/Compact_Intelligence_Panel_The_Predictive_Recall_System_Nissan_Needed_Before_173%2C000_Vehicles_Were_Recalled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [114]:
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import joblib
from datetime import datetime
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from sklearn.inspection import permutation_importance

In [115]:
# Seed handed to the scikit-learn estimators, the train/test split and the
# permutation-importance run so repeated executions give the same model.
RANDOM_STATE = 42
# CSV that load_data() reads by default, and writes when it has to generate mock data.
DATA_FILE = "nissan_fuel_pump_wiring_fault_dataset.csv"
# Directory that receives the CSV summaries and the saved model pipeline.
OUTPUT_DIR = "outputs_live"
# Created as soon as this cell/module runs; exist_ok makes re-running harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [116]:
# Stage 0: Load / generate dataset
# -----------------------
def load_data(path=DATA_FILE, regenerate_if_missing=True, n_samples=10000):
    """Load the fault dataset from ``path``, or generate a mock one if it is missing.

    When ``path`` exists it is read once and the ``manufacture_date`` /
    ``fault_detection_date`` columns (if present) are converted to datetimes,
    with unparseable values coerced to ``NaT``.

    When ``path`` does not exist and ``regenerate_if_missing`` is true, a
    synthetic dataset of ``n_samples`` vehicles is generated, written to
    ``path`` as CSV and returned.

    Args:
        path: CSV location to read from (and to write the generated data to).
        regenerate_if_missing: Generate mock data instead of failing when the
            file is absent.
        n_samples: Number of synthetic vehicles to generate.

    Returns:
        pandas.DataFrame with one row per vehicle.

    Raises:
        FileNotFoundError: ``path`` is missing and ``regenerate_if_missing``
            is false.
    """
    if os.path.exists(path):
        # Single read; the date columns are normalised afterwards, so there is
        # no need to peek at the header first to build a parse_dates argument.
        df = pd.read_csv(path)
        for date_col in ('manufacture_date', 'fault_detection_date'):
            if date_col in df.columns:
                df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        print(f"[load_data] Loaded {path} shape={df.shape}")
        return df
    if not regenerate_if_missing:
        raise FileNotFoundError(path)

    # generate enhanced mock dataset (news-aligned)
    print("[load_data] Generating enhanced mock dataset.")
    # Uses the legacy global NumPy RNG; the order of the draws below determines
    # the generated values, so keep it stable.
    np.random.seed(RANDOM_STATE)
    n = n_samples
    df = pd.DataFrame({
        'VIN': [f'VN{str(i).zfill(6)}' for i in range(1, n+1)],
        'model': np.random.choice(['NV200 Van','NV200 Taxi','City Express','Altima','Rogue'], n, p=[0.3,0.2,0.15,0.2,0.15]),
        'model_year': np.random.choice(range(2013, 2022), n),
        'region': np.random.choice(['North','South','East','West','Central'], n),
        'supplier_batch': np.random.choice(['B1','B2','B3','B4','B5'], n),
        'harness_routing_version': np.random.choice(['V1','V2','V3'], n, p=[0.4,0.4,0.2]),
        'ambient_temp_C': np.random.normal(30, 5, n).round(1),
        'fuel_pressure_psi': np.random.normal(50, 10, n).round(2),
        'current_draw_A': np.random.normal(10, 2, n).round(2),
        'mileage_km': np.abs(np.random.normal(80000, 25000, n).round(0))
    })

    # Base fault probability of 2%, raised for the three van/taxi models and
    # for harness routing version V2.
    fault_prob = 0.02 + 0.10 * df['model'].isin(['NV200 Van','NV200 Taxi','City Express']).astype(int) + 0.08*(df['harness_routing_version']=='V2').astype(int)
    fault_event = np.random.rand(n) < fault_prob
    df['wire_fault_event'] = fault_event.astype(int)

    # Faulty vehicles get depressed fuel pressure and elevated current draw.
    df.loc[df['wire_fault_event']==1, 'fuel_pressure_psi'] *= np.random.uniform(0.3,0.7,size=fault_event.sum())
    df.loc[df['wire_fault_event']==1, 'current_draw_A'] *= np.random.uniform(1.3,1.7,size=fault_event.sum())
    df['fuse_blown_flag'] = ((df['current_draw_A'] > 15) & (df['fuel_pressure_psi'] < 35)).astype(int)
    # NOTE: failure_flag is derived from the same sensor columns later used as
    # model features, so downstream scores on this mock data are optimistic.
    df['failure_flag'] = ((df['wire_fault_event']==1) | (df['fuse_blown_flag']==1)).astype(int)

    df['manufacture_date'] = pd.to_datetime(np.random.choice(pd.date_range('2013-01-01','2020-12-31'), n))
    td = np.random.randint(30,1500,size=n)
    df['fault_detection_date'] = df['manufacture_date'] + pd.to_timedelta(td, unit='D')
    # Only failed vehicles keep a detection date.
    df.loc[df['failure_flag']==0, 'fault_detection_date'] = pd.NaT

    df.to_csv(path, index=False)
    print(f"[load_data] Saved generated dataset to {path}")
    return df

In [117]:
# -----------------------
# Stage 1: EDA (immediate rendering)
# -----------------------
def run_eda(df, show=True):
    """Build the exploratory charts and write two summary CSVs to ``OUTPUT_DIR``.

    Args:
        df: Vehicle frame containing the sensor columns, ``model``,
            ``harness_routing_version``, ``VIN``, ``supplier_batch`` and
            ``failure_flag``.
        show: When true, display every figure after it has been built.

    Returns:
        dict mapping ``pressure_hist``, ``temp_hist``, ``current_hist``,
        ``model_rate``, ``scatter`` and ``corr`` to their Plotly figures.
    """
    print("[run_eda] Running EDA and rendering charts...")

    # Sensor distributions, split by failure flag.
    pressure_hist = px.histogram(
        df, x='fuel_pressure_psi', color='failure_flag', nbins=60,
        title='Fuel Pressure Distribution by Failure Flag',
    )
    temp_hist = px.histogram(
        df, x='ambient_temp_C', color='failure_flag', nbins=40,
        title='Ambient Temperature Distribution',
    )
    current_hist = px.histogram(
        df, x='current_draw_A', color='failure_flag', nbins=40,
        title='Current Draw Distribution',
    )

    # Share of failed vehicles per model, labelled as percentages.
    failure_by_model = df.groupby('model')['failure_flag'].mean().reset_index()
    model_bar = px.bar(
        failure_by_model, x='model', y='failure_flag',
        title='Model-wise Failure Rate', text='failure_flag',
    )
    model_bar.update_traces(texttemplate='%{text:.2%}', textposition='outside')
    model_bar.update_layout(yaxis_tickformat='.0%')

    # Temperature against pressure; colour marks failures, symbol the harness version.
    temp_vs_pressure = px.scatter(
        df, x='ambient_temp_C', y='fuel_pressure_psi',
        color='failure_flag', symbol='harness_routing_version',
        hover_data=['VIN','supplier_batch','model'],
        title='Ambient Temp vs Fuel Pressure (failures highlighted)',
    )

    # Correlations among whichever numeric columns exist, plus the target.
    wanted_numeric = ['fuel_pressure_psi','current_draw_A','ambient_temp_C','mileage_km','model_year']
    numeric_cols = [col for col in wanted_numeric if col in df.columns]
    corr_matrix = df[numeric_cols + ['failure_flag']].corr()
    corr_heatmap = px.imshow(corr_matrix, text_auto=True, title='Correlation Matrix')

    figures = {
        'pressure_hist': pressure_hist,
        'temp_hist': temp_hist,
        'current_hist': current_hist,
        'model_rate': model_bar,
        'scatter': temp_vs_pressure,
        'corr': corr_heatmap,
    }
    if show:
        for figure in figures.values():
            figure.show()

    # Persist the descriptive statistics and the per-model failure rates.
    numeric_summary = df[numeric_cols].describe().round(3)
    numeric_summary.to_csv(os.path.join(OUTPUT_DIR, "eda_numeric_summary.csv"))
    failure_by_model.to_csv(os.path.join(OUTPUT_DIR, "model_failure_rate.csv"), index=False)
    print(f"[run_eda] Saved numeric summary + model failure csv to {OUTPUT_DIR}")
    return figures

In [118]:
# -----------------------
# Stage 2: Anomaly detection (renders)
# -----------------------
def run_anomaly(df, contamination=0.05, show=True):
    """Flag anomalous vehicles with an IsolationForest, falling back to fixed rules.

    ``df`` is modified in place: an ``anomaly_flag`` column (1 = anomaly) is
    added. The flagged frame is also written to ``with_anomalies.csv`` under
    ``OUTPUT_DIR``.

    Args:
        df: Vehicle frame; the sensor columns that are present are used as
            IsolationForest inputs.
        contamination: Expected anomaly share passed to IsolationForest.
        show: When true, display the anomaly scatter plot.

    Returns:
        The same ``df`` object, now carrying ``anomaly_flag``.
    """
    print("[run_anomaly] Running IsolationForest + fallback...")
    sensor_cols = ['fuel_pressure_psi','ambient_temp_C','mileage_km','current_draw_A']
    features = [col for col in sensor_cols if col in df.columns]

    if features:
        try:
            forest = IsolationForest(n_estimators=200, contamination=contamination, random_state=RANDOM_STATE)
            labels = forest.fit(df[features]).predict(df[features])
            # IsolationForest marks outliers with -1.
            df['anomaly_flag'] = (labels == -1).astype(int)
        except Exception as e:
            print("[run_anomaly] IsolationForest error:", e)
            df['anomaly_flag'] = 0
        # fallback rule-based: used when the forest flagged nothing (or failed)
        if df['anomaly_flag'].sum() == 0:
            low_pressure = df['fuel_pressure_psi'] < 35
            overheated = df['ambient_temp_C'] > 45
            df['anomaly_flag'] = (low_pressure | overheated).astype(int)
            print("[run_anomaly] Applied rule-based anomalies; count:", int(df['anomaly_flag'].sum()))
    else:
        # No usable sensor column: nothing can be flagged.
        df['anomaly_flag'] = 0

    # Visualize anomalies
    fig_anom = px.scatter(
        df, x='ambient_temp_C', y='fuel_pressure_psi',
        color='anomaly_flag', symbol='failure_flag',
        hover_data=['VIN','supplier_batch','model'],
        title='Anomalies flagged (red) vs normal',
    )
    if show:
        fig_anom.show()

    out_path = os.path.join(OUTPUT_DIR, "with_anomalies.csv")
    df.to_csv(out_path, index=False)
    print("[run_anomaly] Saved dataset with anomaly flags to outputs.")
    return df

In [119]:
# Stage 3: Time-trend detection (renders)
# -----------------------
def run_trend(df, show=True):
    """Build the monthly failure trend with a 3-month moving average.

    ``failure_rate`` for a month is the number of failures detected in that
    month divided by the total number of vehicles in ``df``. Months between
    the first and last detection that saw no failure are included with a rate
    of 0, so the 3-row rolling window really spans three calendar months.

    Side effects: adds ``fault_month`` (``'YYYY-MM'`` strings, missing where
    there is no detection date) to ``df`` in place, creates synthetic
    ``fault_detection_date`` values when that column is absent, and writes
    ``monthly_trend.csv`` under ``OUTPUT_DIR``.

    Args:
        df: Vehicle frame with ``failure_flag`` and either
            ``fault_detection_date`` or ``manufacture_date``.
        show: When true, display the trend figure.

    Returns:
        DataFrame with columns ``fault_month``, ``failure_rate`` and
        ``moving_avg_3``.
    """
    print("[run_trend] Creating monthly failure trend and 3-month moving average...")
    if 'fault_detection_date' not in df.columns:
        print("[run_trend] No fault_detection_date found; creating synthetic dates for demo.")
        df['fault_detection_date'] = df['manufacture_date'] + pd.to_timedelta(np.random.randint(30,1500,size=len(df)), unit='D')
        df.loc[df['failure_flag']==0, 'fault_detection_date'] = pd.NaT

    months = pd.to_datetime(df['fault_detection_date']).dt.to_period('M')
    has_date = months.notna()
    # astype(str) would turn a missing period into the literal text 'NaT',
    # which dropna() cannot see; keep missing months genuinely missing.
    df['fault_month'] = months.astype(str).where(has_date)

    # Count failures per month. Averaging failure_flag per month is not usable
    # here: when only failed vehicles carry a date, that average is always 1.
    monthly_failures = df.loc[has_date].groupby('fault_month')['failure_flag'].sum()
    if not monthly_failures.empty:
        all_months = pd.period_range(
            monthly_failures.index.min(), monthly_failures.index.max(), freq='M'
        ).astype(str)
        monthly_failures = monthly_failures.reindex(all_months, fill_value=0)

    fleet_size = max(len(df), 1)  # avoid dividing by zero on an empty frame
    trend = pd.DataFrame({
        'fault_month': list(monthly_failures.index),
        'failure_rate': monthly_failures.to_numpy(dtype=float) / fleet_size,
    })
    trend['moving_avg_3'] = trend['failure_rate'].rolling(3, min_periods=1).mean()

    # Plot
    fig_trend = go.Figure()
    fig_trend.add_trace(go.Scatter(x=trend['fault_month'], y=trend['failure_rate'], mode='lines+markers', name='monthly'))
    fig_trend.add_trace(go.Scatter(x=trend['fault_month'], y=trend['moving_avg_3'], mode='lines', name='3-mo MA', line=dict(dash='dash')))
    # Two decimals: per-month rates relative to the whole fleet are well below 1%.
    fig_trend.update_layout(title='Monthly Failure Rate & 3-month Moving Avg', xaxis_title='Month', yaxis_tickformat='.2%')
    if show:
        fig_trend.show()
    trend.to_csv(os.path.join(OUTPUT_DIR, "monthly_trend.csv"), index=False)
    print("[run_trend] Trend CSV saved to outputs.")
    return trend

In [120]:
# Stage 4: Predictive model train + show (renders feature importances, ROC, PR, risk dist)
# -----------------------
def train_and_show_model(df, show=True):
    """Train a RandomForest on ``failure_flag`` and render its diagnostics.

    The frame is split 75/25 (stratified on the target). Numeric columns are
    standardised and categorical columns one-hot encoded inside one pipeline
    together with the classifier. ROC AUC, PR AUC and a classification report
    (0.5 threshold) are printed for the held-out quarter.

    Side effects: saves ``feature_importances.csv`` and ``rf_pipeline.joblib``
    under ``OUTPUT_DIR`` and writes a ``predicted_failure_proba`` column onto
    ``df`` in place.

    Args:
        df: Vehicle frame with ``failure_flag``; of the known numeric and
            categorical feature columns, those present are used.
        show: When true, display the importance, ROC, PR and risk figures.

    Returns:
        dict with keys ``pipeline``, ``auc``, ``pr_auc``, ``fi_df``,
        ``perm_df`` and ``figs`` (itself a dict with ``fi``, ``roc``, ``pr``
        and ``risk``).
    """
    print("[train_and_show_model] Training RandomForest classifier (predict failure_flag)...")
    numeric_features = [c for c in ['ambient_temp_C','fuel_pressure_psi','current_draw_A','mileage_km','model_year'] if c in df.columns]
    categorical_features = [c for c in ['model','region','harness_routing_version','supplier_batch'] if c in df.columns]
    X = df[numeric_features + categorical_features].copy()
    y = df['failure_flag'].astype(int).copy()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y)

    num_pipe = Pipeline([('scaler', StandardScaler())])
    cat_pipe = Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore'))])
    pre = ColumnTransformer([('num', num_pipe, numeric_features), ('cat', cat_pipe, categorical_features)])
    rf = RandomForestClassifier(n_estimators=300, max_depth=12, class_weight='balanced', random_state=RANDOM_STATE, n_jobs=-1)
    pipe = Pipeline([('pre', pre), ('clf', rf)])
    pipe.fit(X_train, y_train)

    # Predict & evaluate on the held-out split
    y_proba = pipe.predict_proba(X_test)[:,1]
    y_pred = (y_proba >= 0.5).astype(int)
    auc = roc_auc_score(y_test, y_proba)
    pr_auc = average_precision_score(y_test, y_proba)
    report = classification_report(y_test, y_pred)
    print("[train_and_show_model] ROC AUC:", round(auc,4), "PR AUC:", round(pr_auc,4))
    print(report)

    # Feature importances (map OHE columns). Resolving the encoder's output
    # names is best effort: the encoder may be unavailable or unfitted.
    name_error = None
    try:
        ohe = pipe.named_steps['pre'].named_transformers_['cat'].named_steps['ohe']
        cat_cols = ohe.get_feature_names_out(categorical_features).tolist()
    except Exception as e:
        name_error = e
        cat_cols = []
    feature_names = numeric_features + cat_cols
    importances = pipe.named_steps['clf'].feature_importances_
    if len(feature_names) != len(importances):
        # Without this the DataFrame constructors below would raise on the
        # length mismatch; pad with positional placeholders instead.
        print("[train_and_show_model] Could not resolve all feature names; using placeholders:", name_error)
        feature_names = feature_names[:len(importances)] + [
            f'feature_{i}' for i in range(len(feature_names), len(importances))
        ]
    fi_df = pd.DataFrame({'feature': feature_names, 'importance': importances}).sort_values('importance', ascending=False)

    # Permutation importance fallback (for interpretability); computed on the
    # already-transformed test matrix, so it is per encoded column.
    X_test_proc = pipe.named_steps['pre'].transform(X_test)
    perm = permutation_importance(pipe.named_steps['clf'], X_test_proc, y_test, n_repeats=10, random_state=RANDOM_STATE, n_jobs=-1)
    perm_df = pd.DataFrame({'feature': feature_names, 'perm_mean': perm.importances_mean, 'perm_std': perm.importances_std}).sort_values('perm_mean', ascending=False)

    # Render feature importance bar
    fig_fi = px.bar(fi_df.head(20), x='importance', y='feature', orientation='h', title='Top features (model importance)')
    fig_fi.update_layout(yaxis={'categoryorder':'total ascending'})

    # ROC and precision-recall curves
    from sklearn.metrics import roc_curve, precision_recall_curve
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    fig_roc = go.Figure()
    fig_roc.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC'))
    fig_roc.update_layout(title=f"ROC Curve (AUC={auc:.3f})", xaxis_title='FPR', yaxis_title='TPR')
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    fig_pr = go.Figure()
    fig_pr.add_trace(go.Scatter(x=recall, y=precision, mode='lines', name='PR'))
    fig_pr.update_layout(title=f"Precision-Recall Curve (AP={pr_auc:.3f})", xaxis_title='Recall', yaxis_title='Precision')
    fig_risk = px.histogram(y_proba, nbins=30, title='Predicted Failure Probability (test set)')

    # Show
    if show:
        fig_fi.show()
        fig_roc.show()
        fig_pr.show()
        fig_risk.show()

    # Save artifacts
    fi_df.to_csv(os.path.join(OUTPUT_DIR, "feature_importances.csv"), index=False)
    joblib.dump(pipe, os.path.join(OUTPUT_DIR, "rf_pipeline.joblib"))
    print("[train_and_show_model] Saved pipeline and feature importances to outputs.")

    # Add predicted proba to full df. NOTE: this scores the training rows as
    # well, so the column is partly in-sample.
    full_features = numeric_features + categorical_features
    try:
        df['predicted_failure_proba'] = pipe.predict_proba(df[full_features])[:,1]
    except Exception as e:
        # Keep the 0.0 fallback so downstream stages still run, but say so.
        print("[train_and_show_model] Scoring the full dataset failed; defaulting risk to 0.0:", e)
        df['predicted_failure_proba'] = 0.0
    return {'pipeline': pipe, 'auc': auc, 'pr_auc': pr_auc, 'fi_df': fi_df, 'perm_df': perm_df, 'figs': {'fi': fig_fi, 'roc': fig_roc, 'pr': fig_pr, 'risk': fig_risk}}

In [121]:
# Stage 5: Recall simulation (renders)
# -----------------------
def run_simulation(df, pipeline, harness_to='V3', current_reduce_pct=10, batch_to=None, show=True, recall_cost_per_vehicle=1200):
    """Score a what-if fleet and estimate the recall cost avoided.

    A copy of ``df`` is altered (every vehicle moved to ``harness_to``,
    current draw scaled down by ``current_reduce_pct`` percent, optionally
    every vehicle moved to ``batch_to``) and scored with ``pipeline``
    alongside the unmodified frame. ``df`` itself is not changed.

    Estimated savings are ``(mean risk before - mean risk after) * number of
    vehicles * recall_cost_per_vehicle``, floored at 0. If scoring raises, the
    error is printed and both probability vectors fall back to zeros.

    Side effects: writes ``simulation_delta.csv`` under ``OUTPUT_DIR``.

    Args:
        df: Scored vehicle frame; must contain ``VIN``.
        pipeline: Fitted estimator exposing ``predict_proba``.
        harness_to: Harness routing version applied to every vehicle.
        current_reduce_pct: Percentage reduction applied to ``current_draw_A``.
        batch_to: Supplier batch applied to every vehicle; ignored when falsy
            or when ``df`` has no ``supplier_batch`` column.
        show: When true, display the before/after histogram.
        recall_cost_per_vehicle: Cost (USD) attributed to one recalled vehicle.

    Returns:
        dict with keys ``avg_before``, ``avg_after``, ``est_savings``, ``fig``
        and ``delta`` (per-VIN probabilities and their difference).
    """
    print("[run_simulation] Running what-if with harness_to=", harness_to, "current_reduce_pct=", current_reduce_pct, "batch_to=", batch_to)
    d_sim = df.copy()
    if 'harness_routing_version' in d_sim.columns:
        d_sim['harness_routing_version'] = harness_to
    if 'current_draw_A' in d_sim.columns:
        d_sim['current_draw_A'] = d_sim['current_draw_A'] * (1 - current_reduce_pct/100.0)
    if batch_to:
        if 'supplier_batch' in d_sim.columns:
            d_sim['supplier_batch'] = batch_to
        else:
            # Creating the column only in the copy would make the two frames
            # disagree on their feature set and break scoring below.
            print("[run_simulation] supplier_batch column missing; ignoring batch_to")

    # compute probabilities on the same feature columns for both frames
    candidate_features = ['ambient_temp_C','fuel_pressure_psi','current_draw_A','mileage_km','model_year','model','region','harness_routing_version','supplier_batch']
    features = [c for c in candidate_features if c in d_sim.columns]
    try:
        proba_before = pipeline.predict_proba(df[features])[:,1]
        proba_after = pipeline.predict_proba(d_sim[features])[:,1]
    except Exception as e:
        print("[run_simulation] pipeline predict failed:", e)
        proba_before = np.zeros(len(df))
        proba_after = np.zeros(len(df))

    avg_before = proba_before.mean()
    avg_after = proba_after.mean()
    vehicles = len(df)
    # Never report a negative saving when the scenario raises the mean risk.
    est_savings = max(0, (avg_before - avg_after) * vehicles * recall_cost_per_vehicle)

    # Plot distribution
    fig_sim = go.Figure()
    fig_sim.add_trace(go.Histogram(x=proba_before, name='Before', opacity=0.6))
    fig_sim.add_trace(go.Histogram(x=proba_after, name='After', opacity=0.6))
    fig_sim.update_layout(barmode='overlay', title=f'Risk Distribution Before vs After (avg {avg_before:.2%} → {avg_after:.2%})')
    if show:
        fig_sim.show()

    # Save delta table, largest risk reduction first
    delta = pd.DataFrame({'VIN': df['VIN'], 'proba_before': proba_before, 'proba_after': proba_after})
    delta['delta'] = delta['proba_before'] - delta['proba_after']
    delta.sort_values('delta', ascending=False).to_csv(os.path.join(OUTPUT_DIR, "simulation_delta.csv"), index=False)
    print(f"[run_simulation] Est savings (USD): {est_savings:,.0f}; saved delta CSV to outputs.")
    return {'avg_before': avg_before, 'avg_after': avg_after, 'est_savings': est_savings, 'fig': fig_sim, 'delta': delta}

In [124]:
# Final: Predictive Dashboard Intelligence (compact rendered view)
# -----------------------
def show_predictive_dashboard(df, model_info, sim_info=None, show=True):
    """
    Render a compact intelligence view:
     - KPI tiles
     - Trend / risk chart
     - Top 20 risky VINs table (rendered as Plotly table)
     - Model top predictors and short actionable recommendations

    Args:
        df: Vehicle frame with ``failure_flag`` and the identity columns shown
            in the table. ``anomaly_flag`` and ``predicted_failure_proba`` are
            optional; a missing risk column is treated as 0 risk everywhere.
        model_info: dict that may provide ``auc``, ``pr_auc`` and ``fi_df``.
        sim_info: Optional dict with ``avg_before``, ``avg_after`` and
            ``est_savings``; printed when given.
        show: When true, display each figure as it is built.

    Returns:
        dict with keys ``kpi_fig``, ``trend_fig``, ``top_table``, ``fi_fig``
        and ``intelligence_text``. ``trend_fig`` is ``None`` when the trend
        could not be built; ``fi_fig`` is ``None`` without ``fi_df``.
    """
    print("[show_predictive_dashboard] Rendering predictive intelligence summary...")
    has_risk = 'predicted_failure_proba' in df.columns
    # Frame used wherever a risk column is required; df itself is not touched here.
    scored = df if has_risk else df.assign(predicted_failure_proba=0.0)

    total = len(df)
    total_fail = int(df['failure_flag'].sum())
    total_anom = int(df['anomaly_flag'].sum()) if 'anomaly_flag' in df.columns else 0
    avg_risk = df['predicted_failure_proba'].mean() if has_risk else 0.0
    auc = model_info.get('auc', None)
    pr_auc = model_info.get('pr_auc', None)

    # KPI card figure (use indicator traces)
    kpi_fig = make_kpi_figure(total, total_fail, total_anom, avg_risk)
    if show:
        kpi_fig.show()

    # Trend (reuse run_trend output quick). Bound up front so the return
    # statement cannot hit an undefined name when the block below fails.
    fig_trend = None
    try:
        trend = run_trend(df, show=False)
        fig_trend = go.Figure()
        fig_trend.add_trace(go.Scatter(x=trend['fault_month'], y=trend['failure_rate'], mode='lines+markers', name='monthly'))
        fig_trend.add_trace(go.Scatter(x=trend['fault_month'], y=trend['moving_avg_3'], mode='lines', name='3-mo MA', line=dict(dash='dash')))
        # Two decimals keep sub-percent monthly rates legible.
        fig_trend.update_layout(title='Monthly Fault Rate (Executive View)', yaxis_tickformat='.2%')
        if show:
            fig_trend.show()
    except Exception as e:
        print("[show_predictive_dashboard] Trend render failed:", e)
        fig_trend = None

    # Top 20 risky VINs table
    top = scored.sort_values('predicted_failure_proba', ascending=False).head(20)
    risk_labels = (top['predicted_failure_proba']*100).round(2).astype(str) + '%'
    fig_table = go.Figure(data=[go.Table(
        header=dict(values=["VIN","Model","Year","Region","Batch","Harness","PredRisk","FailureFlag"]),
        cells=dict(values=[
            top['VIN'], top['model'], top['model_year'], top['region'],
            top['supplier_batch'], top['harness_routing_version'],
            risk_labels, top['failure_flag'],
        ])
    )])
    if show:
        fig_table.show()

    # Top model predictors (from model_info)
    fig_fi = None
    fi = model_info.get('fi_df')
    if fi is not None:
        fig_fi = px.bar(fi.head(10), x='importance', y='feature', orientation='h', title='Top Model Predictors')
        if show:
            fig_fi.show()

    # Actionable recommendations (simple rules)
    recs = []
    if avg_risk > 0.08:
        recs.append(f"Fleet mean predicted risk {avg_risk:.2%} exceeds 8% threshold — consider targeted micro-recall of top 500 VINs.")
    # batch-level action
    if 'supplier_batch' in df.columns:
        batch_risk = scored.groupby('supplier_batch')['predicted_failure_proba'].mean().sort_values(ascending=False).reset_index()
        if len(batch_risk):
            batch = batch_risk.iloc[0]
            if batch['predicted_failure_proba'] > 0.12:
                recs.append(f"Supplier batch {batch['supplier_batch']} mean risk {batch['predicted_failure_proba']:.2%} — request supplier audit & QA sampling.")
    # Temperature action
    if {'ambient_temp_C','predicted_failure_proba'}.issubset(df.columns):
        high_temp_risk = df[df['ambient_temp_C'] > df['ambient_temp_C'].quantile(0.9)]['predicted_failure_proba'].mean()
        if high_temp_risk > avg_risk * 1.2:
            recs.append("High temperature deployments show elevated risk — add environmental stress testing to QA.")

    # Compose intelligence summary; metrics absent from model_info print as n/a
    auc_text = f"{auc:.2f}" if auc is not None else "n/a"
    pr_auc_text = f"{pr_auc:.2f}" if pr_auc is not None else "n/a"
    if recs:
        rec_text = "\n".join([" - " + r for r in recs])
    else:
        rec_text = " - No urgent mass action; monitor top risk VINs and supplier batch performance."
    intelligence_text = f"Executive summary:\n - Fleet size: {total}\n - Historical failures: {total_fail}\n - Mean predicted failure risk: {avg_risk:.2%}\n - Model AUC: {auc_text} PR-AUC: {pr_auc_text}\n\nRecommendations:\n" + rec_text
    print(intelligence_text)

    # If a simulation info was provided, show its key numbers
    if sim_info is not None:
        print("\nSimulation result (quick): avg before:", f"{sim_info['avg_before']:.2%}", "avg after:", f"{sim_info['avg_after']:.2%}", "est savings:", f"${sim_info['est_savings']:,.0f}")
    return {'kpi_fig': kpi_fig, 'trend_fig': fig_trend, 'top_table': fig_table, 'fi_fig': fig_fi, 'intelligence_text': intelligence_text}


# helper KPI composite
def make_kpi_figure(total, total_fail, total_anom, avg_risk):
    """Format the four headline numbers and hand them to ``make_subkpi``.

    Counts are rendered with thousands separators and the mean risk as a
    percentage with two decimals; the tiles keep the order of the arguments.
    """
    titles = ("Total Vehicles", "Historical Failures", "Anomalies Detected", "Mean Pred Risk")
    rendered = (f"{total:,}", f"{total_fail:,}", f"{total_anom:,}", f"{avg_risk:.2%}")
    return make_subkpi(list(zip(titles, rendered)))

def _parse_kpi_value(val):
    """Return ``(number, suffix)`` for one KPI value.

    Strings may carry thousands separators and a trailing ``%``; the percent
    sign is returned as the suffix so it can be shown next to the number.
    Plain ints/floats pass through. Anything unparseable or non-finite maps
    to ``(0, "")``.
    """
    import math

    if isinstance(val, bool):
        return 0, ""
    if isinstance(val, (int, float)):
        return (val, "") if math.isfinite(val) else (0, "")
    if not isinstance(val, str):
        return 0, ""
    text = val.strip()
    suffix = "%" if text.endswith("%") else ""
    try:
        number = float(text.rstrip("%").replace(",", ""))
    except ValueError:
        return 0, ""
    if not math.isfinite(number):
        return 0, ""
    return number, suffix


def make_subkpi(pairs):
    """Lay out ``(title, value)`` pairs as one horizontal row of indicator tiles.

    Args:
        pairs: Sequence of ``(title, value)``; ``value`` is a formatted string
            such as ``"10,000"`` or ``"13.35%"``, or a plain number.

    Returns:
        A Plotly figure with one ``Indicator`` trace per pair, each occupying
        an equal share of the width.
    """
    # simple horizontal layout of indicators
    fig = go.Figure()
    n = len(pairs)
    for i, (title, val) in enumerate(pairs):
        number, suffix = _parse_kpi_value(val)
        fig.add_trace(go.Indicator(
            # Plain number on every tile. NOTE(review): the first tile used to
            # request a delta with no reference value, which per the Plotly
            # docs compares the value with itself; confirm nothing relied on it.
            mode="number",
            value=number,
            # Re-attach the percent sign that parsing removed.
            number={"suffix": suffix},
            title={"text": title},
            domain={'x': [i/n, (i+1)/n], 'y': [0, 1]}
        ))
    fig.update_layout(height=200, margin=dict(l=20,r=20,t=20,b=20))
    return fig


# ---------- If executed as a script, provide a simple sequential run ----------
if __name__ == "__main__":
    # Run full pipeline sequentially, rendering each stage
    # Stage 0: read the CSV, or generate and save mock data when it is missing.
    df = load_data()
    # Stage 1: exploratory charts plus summary CSVs.
    run_eda(df)
    # Stage 2: adds the anomaly_flag column; the returned frame is reused below.
    df = run_anomaly(df)
    # Stage 3: monthly trend; also adds fault_month to df as a side effect.
    run_trend(df)
    # Stage 4: trains the classifier and adds predicted_failure_proba to df.
    model_info = train_and_show_model(df)
    # Stage 5: what-if scoring with the default scenario (harness V3, -10% current).
    sim_info = run_simulation(df, model_info['pipeline'])
    # Final: executive summary built from the scored frame and both result dicts.
    show_predictive_dashboard(df, model_info, sim_info)

[load_data] Loaded nissan_fuel_pump_wiring_fault_dataset.csv shape=(10000, 15)
[run_eda] Running EDA and rendering charts...


[run_eda] Saved numeric summary + model failure csv to outputs_live
[run_anomaly] Running IsolationForest + fallback...


[run_anomaly] Saved dataset with anomaly flags to outputs.
[run_trend] Creating monthly failure trend and 3-month moving average...


[run_trend] Trend CSV saved to outputs.
[train_and_show_model] Training RandomForest classifier (predict failure_flag)...
[train_and_show_model] ROC AUC: 0.994 PR AUC: 0.9635
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      2200
           1       0.93      0.88      0.90       300

    accuracy                           0.98      2500
   macro avg       0.96      0.94      0.95      2500
weighted avg       0.98      0.98      0.98      2500



[train_and_show_model] Saved pipeline and feature importances to outputs.
[run_simulation] Running what-if with harness_to= V3 current_reduce_pct= 10 batch_to= None


[run_simulation] Est savings (USD): 423,490; saved delta CSV to outputs.
[show_predictive_dashboard] Rendering predictive intelligence summary...


[run_trend] Creating monthly failure trend and 3-month moving average...
[run_trend] Trend CSV saved to outputs.


Executive summary:
 - Fleet size: 10000
 - Historical failures: 1200
 - Mean predicted failure risk: 13.35%
 - Model AUC: 0.99 PR-AUC: 0.96

Recommendations:
 - Fleet mean predicted risk 13.35% exceeds 8% threshold — consider targeted micro-recall of top 500 VINs.
 - Supplier batch B3 mean risk 14.26% — request supplier audit & QA sampling.

Simulation result (quick): avg before: 13.35% avg after: 9.82% est savings: $423,490


In [125]:
# Insight generator (local rule-based + model-aware)
def generate_insights(df, model_info=None):
    """Turn the fleet frame into a list of Markdown-formatted insight strings.

    Args:
        df: Vehicle frame with ``failure_flag``. ``anomaly_flag`` and
            ``predicted_failure_proba`` are optional: without the risk column
            the overall risk is reported as 0 and the risk-ranked
            batch/region/model/temperature insights are skipped.
        model_info: Optional dict that may provide ``auc``, ``pr_auc`` and
            ``perm_df`` (a frame with a ``feature`` column, most important
            first). Missing metrics are reported as ``n/a``.

    Returns:
        list[str] of insights, in a fixed order: overview, overall risk,
        batch/region/model, temperature, model performance, anomalies.
    """
    def _fmt_metric(value):
        # Metrics absent from model_info come back as None.
        return f"{value:.2f}" if value is not None else "n/a"

    insights = []
    total = len(df)
    total_failures = int(df['failure_flag'].sum())
    total_anomalies = int(df['anomaly_flag'].sum()) if 'anomaly_flag' in df.columns else 0
    has_risk = 'predicted_failure_proba' in df.columns
    mean_risk = df['predicted_failure_proba'].mean() if has_risk else 0.0

    insights.append(f"**Fleet Overview:** Analyzed {total:,} vehicles; recorded failures: {total_failures:,}; anomalies detected: {total_anomalies:,}.")
    insights.append(f"**Overall Risk:** Fleet mean predicted failure probability: {mean_risk:.2%}.")

    # Top batch / region / model by mean predicted risk. These rankings need
    # the risk column, so they are skipped when the frame has not been scored.
    if has_risk:
        if 'supplier_batch' in df.columns:
            batch = df.groupby('supplier_batch').agg(vehicles=('VIN','count'), mean_risk=('predicted_failure_proba','mean'), total_failures=('failure_flag','sum')).reset_index().sort_values('mean_risk', ascending=False)
            if len(batch):
                b = batch.iloc[0]
                insights.append(f"**High Risk Batch:** Supplier batch **{b['supplier_batch']}** shows the highest mean predicted risk ({b['mean_risk']:.2%}, with {int(b['total_failures']):,} failures in {int(b['vehicles']):,} vehicles).")
        if 'region' in df.columns:
            r = df.groupby('region').agg(mean_risk=('predicted_failure_proba','mean'), total_failures=('failure_flag','sum')).reset_index().sort_values('mean_risk', ascending=False)
            if len(r):
                rr = r.iloc[0]
                insights.append(f"**High Risk Region:** Region **{rr['region']}** has the highest mean predicted risk ({rr['mean_risk']:.2%}, with {int(rr['total_failures']):,} failures).")
        if 'model' in df.columns:
            model = df.groupby('model').agg(mean_risk=('predicted_failure_proba','mean'), total_failures=('failure_flag','sum')).reset_index().sort_values('mean_risk', ascending=False)
            if len(model):
                m = model.iloc[0]
                insights.append(f"**High Risk Model:** Model **{m['model']}** has the highest mean predicted risk ({m['mean_risk']:.2%}, with {int(m['total_failures']):,} failures).")

    # Temperature-pressure correlation insight (if columns exist)
    if {'ambient_temp_C','fuel_pressure_psi'}.issubset(df.columns):
        corr = df['ambient_temp_C'].corr(df['fuel_pressure_psi'])
        insights.append(f"**Data Correlation:** Ambient temperature and fuel pressure have a correlation of **{corr:.2f}**.")
        high_temp = df[df['ambient_temp_C'] >= df['ambient_temp_C'].quantile(0.9)]
        if not high_temp.empty and has_risk:
            insights.append(f"**Temp Impact:** Vehicles operating in the top 10% hottest temperatures show a mean predicted risk of **{high_temp['predicted_failure_proba'].mean():.2%}**.")

    # Model-aware insights (if model_info is available)
    if model_info is not None:
        auc = model_info.get('auc')
        pr_auc = model_info.get('pr_auc')
        perm_df = model_info.get('perm_df')
        insights.append(f"**Model Performance:** Predictive model achieved ROC AUC of **{_fmt_metric(auc)}** and PR AUC of **{_fmt_metric(pr_auc)}**.")
        if perm_df is not None and not perm_df.empty:
            top_feats = perm_df['feature'].head(5).tolist()
            insights.append(f"**Key Predictors:** Top 5 most influential factors for predicting failure (by permutation importance) are: **{', '.join(top_feats)}**.")

    # Anomaly-based insights
    if total_anomalies > 0:
        anomalous_failures = df[(df['anomaly_flag'] == 1) & (df['failure_flag'] == 1)].shape[0]
        insights.append(f"**Anomalies:** **{total_anomalies:,}** vehicles were flagged as anomalies. **{anomalous_failures:,}** of these were also historical failures.")
        if total_anomalies > anomalous_failures:
            insights.append(f"**Proactive Leads:** **{total_anomalies - anomalous_failures:,}** anomalous vehicles were not historical failures – these may represent early warnings or potential proactive inspection targets.")

    return insights