# 05 SHAP Explainability

Compute SHAP values for the surrogate model if SHAP is available.


# 說明

本 notebook 透過 SHAP 解釋 surrogate model 的特徵影響，輸出 summary/bar 與 dependence plot。

## 主要輸入
- `tables/filtered_stats.csv`（若此檔不存在，則改用 `collected_stats_*.csv` 中檔名排序最後的一份）

## 主要輸出
- `figures/shap_summary_*.png`
- `figures/shap_bar_*.png`
- `figures/shap_dependence_*.png`（每個 target 取平均 |SHAP| 最高的前 5 個特徵）
- `tables/shap_mean_abs_*.csv`

## 合理性檢查建議
- SHAP 重要度應與 model feature importance 大致一致
- 若缺少 SHAP 輸出，請確認環境已安裝 `xgboost` 與 `shap`，且每個 target 在移除缺失值後至少有 50 筆資料列


In [1]:
import os
import numpy as np
from pathlib import Path
import pandas as pd

# --- Locations ---------------------------------------------------------------
# Root of the analysis outputs; the relative path is resolved against the
# current working directory at the time this cell runs.
ANALYZE_DIR = Path('../outputFiles/analyze').resolve()

# Sub-directory names, overridable through environment variables.
# COLLECT_PREFIX falls back to REPORT_PREFIX when it is not set.
REPORT_PREFIX = os.environ.get('REPORT_PREFIX', 'analysis_reports')
COLLECT_PREFIX = os.environ.get('COLLECT_PREFIX', REPORT_PREFIX)
REPORT_DIR = ANALYZE_DIR / REPORT_PREFIX
COLLECT_DIR = ANALYZE_DIR / COLLECT_PREFIX

# --- Analysis knobs ----------------------------------------------------------
# Kept as a string: it is handed to apply_search_k_filter(), which converts it
# to int and skips filtering when the value is empty or not an integer.
FILTER_SEARCH_K = os.environ.get('FILTER_SEARCH_K', '10')

# Columns to explain; one surrogate model is fitted per column that is present.
TARGET_COLS = [
    'latency_p99_us',
    'latency_p95_us',
    'latency_p50_us',
    'mean_latency_us',
]

# When true, targets whose name ends in '_us' are fitted on log(max(y, 1)).
LOG_TARGETS = True

# Upper bound on the number of rows passed to the SHAP explainer.
MAX_SAMPLES = int(os.environ.get('SHAP_MAX_SAMPLES', '2000'))

# Candidate model inputs; only those present in the loaded table are used.
# The order here determines the column order of the feature matrix.
FEATURE_CANDIDATES = [
    # build_* columns
    'build_R',
    'build_L',
    'build_B',
    'build_M',
    # search_* columns and cache size
    'search_K',
    'search_L',
    'search_W',
    'search_T',
    'cache_size',
    # dataset description
    'vector_dim',
    'dataset_size',
    # out-degree statistics
    'out_degree_p99',
    'out_degree_p95',
    'out_degree_p50',
    'out_degree_mean',
    # expansion statistics
    'expanded_revisit_ratio',
    'expanded_per_query_mean',
    'expanded_steps_mean',
    # node visit concentration
    'node_counts_top10_share',
    'node_counts_top100_share',
    # iostat / queue columns
    'iostat_aqu-sz_mean',
    'iostat_%util_mean',
    'queue_depth_p99',
    # per-component p99 columns
    'io_us_p99',
    'cpu_us_p99',
    'thread_util_p99',
]

def apply_search_k_filter(df, value):
    """Keep only the rows of ``df`` whose ``search_K`` equals ``int(value)``.

    Args:
        df: Table to filter.
        value: Desired ``search_K``; anything ``int()`` accepts.

    Returns:
        ``df`` itself, unchanged, when ``value`` is falsy, when ``df`` has no
        ``search_K`` column, or when ``int(value)`` raises ``ValueError``.
        Otherwise a copy containing only the matching rows.
    """
    # Truthiness of ``value`` is tested first, so the column lookup is skipped
    # entirely when no filter value was supplied.
    if not (value and 'search_K' in df.columns):
        return df

    try:
        wanted_k = int(value)
    except ValueError:
        # Non-integer filter value: leave the table unfiltered.
        return df

    row_matches = df['search_K'] == wanted_k
    return df[row_matches].copy()

# Fallback input: the collected-stats CSV that comes last in sorted order
# (presumably the newest one, if the file names carry a timestamp).
# It is only needed when filtered_stats.csv is absent, so a missing file is
# tolerated here instead of failing with a bare IndexError on ``[-1]``.
stats_candidates = sorted(COLLECT_DIR.glob('collected_stats_*.csv'))
if stats_candidates:
    stats_path = stats_candidates[-1]
    stats_df = pd.read_csv(stats_path)
else:
    stats_path = None
    stats_df = None
    print('collected stats not found in:', COLLECT_DIR)

# Preferred input: the filtered table written under the report directory.
filtered_path = REPORT_DIR / 'tables' / 'filtered_stats.csv'
filtered_df = None
if filtered_path.exists():
    filtered_df = pd.read_csv(filtered_path)
    print('filtered:', filtered_path)
else:
    print('filtered not found:', filtered_path)

# Use the filtered table when available, otherwise the collected stats.
if filtered_df is not None:
    base_df = filtered_df
elif stats_df is not None:
    base_df = stats_df
else:
    raise FileNotFoundError(
        f'No input data: neither {filtered_path} nor '
        f"{COLLECT_DIR / 'collected_stats_*.csv'} exists"
    )
base_df = apply_search_k_filter(base_df, FILTER_SEARCH_K)

# Restrict the configured candidates to the columns actually present,
# preserving the configured order.
feature_cols = [c for c in FEATURE_CANDIDATES if c in base_df.columns]
target_cols = [c for c in TARGET_COLS if c in base_df.columns]

print('rows:', len(base_df))
print('features:', feature_cols)
print('targets:', target_cols)

# Output directories for figures and tables; created if missing.
out_fig = REPORT_DIR / 'figures'
out_tbl = REPORT_DIR / 'tables'
out_fig.mkdir(parents=True, exist_ok=True)
out_tbl.mkdir(parents=True, exist_ok=True)

# Optional dependencies: when any of them cannot be imported the analysis is
# skipped rather than failed. ``Exception`` is caught broadly on purpose,
# since loading a native library can fail with something other than ImportError.
try:
    import xgboost as xgb
    import shap
    # Imported up front (instead of in the middle of the per-target loop) so a
    # missing matplotlib is reported before any model is trained.
    import matplotlib.pyplot as plt
except Exception as e:
    print('SHAP or xgboost not available:', e)
    xgb = None
    shap = None
    plt = None


def _save_current_figure(path):
    """Tighten the current figure's layout, save it to ``path`` and close it.

    The figure is closed even when layout or saving raises, so a failed save
    does not leave an open figure behind for the next plot.
    """
    try:
        plt.tight_layout()
        plt.savefig(path, dpi=150)
    finally:
        plt.close()


if xgb is None or shap is None or not feature_cols or not target_cols:
    print('Skip SHAP analysis due to missing deps or columns')
else:
    # Turning +/-inf into NaN does not depend on the target, so do it once
    # instead of rebuilding the whole frame on every loop iteration.
    finite_df = base_df.replace([np.inf, -np.inf], np.nan)

    for target in target_cols:
        # Keep only rows where every feature and this target are present.
        df = finite_df.dropna(subset=feature_cols + [target]).copy()
        if len(df) < 50:
            print('Skip target due to insufficient rows:', target)
            continue

        X = df[feature_cols]
        y_raw = df[target]
        if LOG_TARGETS and target.endswith('_us'):
            # Clip at 1 before the log so values below 1 map to 0 instead of
            # producing negative or undefined results.
            y = np.log(y_raw.clip(lower=1))
        else:
            y = y_raw

        # Surrogate model fitted on all available rows for this target.
        model = xgb.XGBRegressor(
            n_estimators=300,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
        )
        model.fit(X, y)

        # Cap the number of explained rows; the fixed seed keeps the sample
        # reproducible between runs.
        if len(X) > MAX_SAMPLES:
            X_sample = X.sample(n=MAX_SAMPLES, random_state=42)
        else:
            X_sample = X

        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_sample)

        shap.summary_plot(shap_values, X_sample, show=False)
        _save_current_figure(out_fig / f'shap_summary_{target}.png')

        shap.summary_plot(shap_values, X_sample, plot_type='bar', show=False)
        _save_current_figure(out_fig / f'shap_bar_{target}.png')

        # Mean absolute SHAP value per feature, largest first.
        shap_df = pd.DataFrame(shap_values, columns=feature_cols)
        shap_mean = shap_df.abs().mean().sort_values(ascending=False)
        shap_mean.to_csv(out_tbl / f'shap_mean_abs_{target}.csv')

        # Dependence plots for the five highest-ranked features. A failure for
        # one feature is reported and the remaining features are still plotted.
        top_features = shap_mean.head(5).index.tolist()
        for feat in top_features:
            try:
                shap.dependence_plot(feat, shap_values, X_sample, show=False)
                plt.tight_layout()
                plt.savefig(out_fig / f'shap_dependence_{target}_{feat}.png', dpi=150)
            except Exception as e:
                print('dependence plot failed:', target, feat, e)
            finally:
                # Always close: a half-drawn figure left open after a failure
                # could otherwise be picked up by a later plotting call.
                plt.close()

        print('Saved SHAP outputs for', target)


filtered: /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/tables/filtered_stats.csv
rows: 1380
features: ['build_R', 'build_L', 'build_B', 'build_M', 'search_K', 'search_L', 'search_W', 'search_T', 'vector_dim', 'dataset_size', 'out_degree_p99', 'out_degree_p95', 'out_degree_p50', 'out_degree_mean', 'expanded_revisit_ratio', 'expanded_per_query_mean', 'expanded_steps_mean', 'node_counts_top10_share', 'node_counts_top100_share', 'iostat_aqu-sz_mean', 'iostat_%util_mean', 'queue_depth_p99', 'io_us_p99', 'cpu_us_p99', 'thread_util_p99']
targets: ['latency_p99_us', 'latency_p95_us', 'latency_p50_us', 'mean_latency_us']


Saved SHAP outputs for latency_p99_us


Saved SHAP outputs for latency_p95_us


Saved SHAP outputs for latency_p50_us


Saved SHAP outputs for mean_latency_us
