# 06 Worst-case Report

Rank worst runs by tail latency and export a summary table.


# 說明

本 notebook 定義 worst-case（高延遲分位）並分析其參數模式、特徵排名與可改善方向。

## 主要輸入
- `tables/filtered_stats.csv`（若檔案不存在，則改用最新的 `collected_stats_*.csv`）
- 最新的 `collected_topk_*.csv`（用於 top-k 分箱分析）

## 主要輸出
- `tables/worstcase_summary.csv`
- `tables/worstcase_param_patterns.csv`
- `tables/worstcase_graph_feature_ranking.csv`
- `tables/worstcase_bottleneck_types.csv`

## 合理性檢查建議
- `worstcase_rate` 應接近 `1 - WORSTCASE_PCTL`（例如分位數設為 0.95 時，約為 0.05）
- `worstcase_graph_feature_ranking.csv` 前幾名應能對應瓶頸直覺


In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np

# Root of the analysis outputs, resolved relative to the current working directory.
ANALYZE_DIR = Path('../outputFiles/analyze').resolve()
# Sub-directory names come from the environment; COLLECT_PREFIX falls back to REPORT_PREFIX.
REPORT_PREFIX = os.environ.get('REPORT_PREFIX', 'analysis_reports')
COLLECT_PREFIX = os.environ.get('COLLECT_PREFIX', REPORT_PREFIX)
REPORT_DIR = (ANALYZE_DIR / REPORT_PREFIX)
COLLECT_DIR = (ANALYZE_DIR / COLLECT_PREFIX)
STATS_CSV = None  # set to a specific file path if needed
TOPK_CSV = None   # set to a specific file path if needed

# Kept as a string here; apply_search_k_filter converts it to int and skips
# filtering when it is empty or not an integer.
FILTER_SEARCH_K = os.environ.get('FILTER_SEARCH_K', '10')
# Quantile of the chosen latency metric at or above which a run is flagged worst-case.
WORSTCASE_PCTL = float(os.environ.get('WORSTCASE_PCTL', '0.95'))
# Minimum group size for a group to be kept in the pattern tables.
MIN_COUNT = int(os.environ.get('WORSTCASE_MIN_COUNT', '10'))
# Upper bound on the number of worst-case rows used in the nearest-neighbour comparison.
MAX_WORST_SAMPLES = int(os.environ.get('WORSTCASE_MAX_SAMPLES', '200'))

# Latency columns in order of preference; the first one present in the data
# becomes the worst-case metric.
LATENCY_PRIORITY = [
    'latency_p99_us',
    'latency_p999_us',
    'latency_p95_us',
    'latency_p50_us',
    'mean_latency_us',
]

# Candidate columns for the graph-feature Spearman ranking; columns missing
# from the data are ignored where this list is used.
GRAPH_FEATURES = [
    'out_degree_mean','out_degree_p50','out_degree_p90','out_degree_p95','out_degree_p99',
    'expanded_revisit_ratio','expanded_per_query_mean','expanded_per_query_p50','expanded_per_query_p90',
    'expanded_steps_mean','expanded_steps_p50','expanded_steps_p90',
    'node_counts_top1_share','node_counts_top10_share','node_counts_top100_share',
    'node_counts_unique','node_counts_total',
]
# A run whose largest io/cpu/sort share is below this value is classified as 'mixed'.
BOTTLENECK_SHARE_THRESHOLD = float(os.environ.get('BOTTLENECK_SHARE_THRESHOLD', '0.5'))

def pick_latest(pattern):
    """Return the last path, in sorted order, matching ``pattern`` under COLLECT_DIR.

    Args:
        pattern: Glob pattern evaluated relative to COLLECT_DIR.

    Returns:
        The greatest matching path according to normal path ordering.

    Raises:
        FileNotFoundError: If the glob matches nothing.
    """
    matches = sorted(COLLECT_DIR.glob(pattern))
    if matches:
        return matches[-1]
    raise FileNotFoundError(f'No files matched: {pattern}')

def apply_search_k_filter(df, value):
    """Keep only the rows whose ``search_K`` equals ``int(value)``.

    Args:
        df: DataFrame to filter.
        value: Requested search_K, typically a string read from the environment.

    Returns:
        ``df`` itself (unfiltered, not copied) when ``value`` is falsy, when
        ``df`` has no ``search_K`` column, or when ``value`` is not a valid
        integer literal; otherwise a copy containing only the matching rows.
    """
    if not (value and 'search_K' in df.columns):
        return df
    try:
        wanted_k = int(value)
    except ValueError:
        # Unparseable filter values mean "do not filter".
        return df
    keep = df['search_K'] == wanted_k
    return df.loc[keep].copy()

# Resolve the input CSVs: an explicit STATS_CSV / TOPK_CSV path wins, otherwise
# the last file (in sorted order) matching the pattern under COLLECT_DIR is used.
stats_path = Path(STATS_CSV) if STATS_CSV else pick_latest('collected_stats_*.csv')
topk_path = Path(TOPK_CSV) if TOPK_CSV else pick_latest('collected_topk_*.csv')

# Echo the chosen files so the recorded cell output documents the inputs.
print('stats:', stats_path)
print('topk :', topk_path)

stats_df = pd.read_csv(stats_path)
topk_df = pd.read_csv(topk_path)


stats: /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/collected_stats_sift01_20260107_195000.csv
topk : /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/collected_topk_sift01_20260107_195000.csv


In [2]:
# Every table produced by this notebook is written under REPORT_DIR/tables.
out_tables = (REPORT_DIR / 'tables')
out_tables.mkdir(parents=True, exist_ok=True)

# Use filtered_stats.csv when it exists; otherwise fall back to the raw
# collected stats loaded earlier.
filtered_path = (REPORT_DIR / 'tables' / 'filtered_stats.csv')
filtered_df = None
if filtered_path.exists():
    filtered_df = pd.read_csv(filtered_path)
    print('filtered:', filtered_path)
else:
    print('filtered not found:', filtered_path)

# Apply the search_K filter to all three frames; note that stats_df and
# filtered_df are rebound to their filtered versions here.
base_df = filtered_df if filtered_df is not None else stats_df
base_df = apply_search_k_filter(base_df, FILTER_SEARCH_K)
stats_df = apply_search_k_filter(stats_df, FILTER_SEARCH_K)
if filtered_df is not None:
    filtered_df = apply_search_k_filter(filtered_df, FILTER_SEARCH_K)

# The first LATENCY_PRIORITY column present in base_df drives the ranking.
latency_cols = [c for c in LATENCY_PRIORITY if c in base_df.columns]
latency_primary = latency_cols[0] if latency_cols else None
print('base rows:', len(base_df))
print('latency cols:', latency_cols)

# Columns shown in the worst-runs table; any that are absent are dropped.
cols = [
    'run_prefix','recall_mean','latency_p99_us','latency_p999_us',
    'latency_p95_us','latency_p50_us','mean_latency_us',
    'ios_p99','queue_depth_p99','io_us_p99','cpu_us_p99',
    'thread_util_p99','expanded_revisit_ratio','node_counts_top10_share',
]
cols = [c for c in cols if c in base_df.columns]
if latency_primary is not None:
    # Top 20 runs by the primary latency metric, highest first; the CSV is
    # only written on this branch.
    worst = base_df.sort_values(latency_primary, ascending=False).head(20)[cols]
    worst.to_csv(out_tables / f'worst_runs_{latency_primary}.csv', index=False)
else:
    # No latency column available: show the first 20 rows unranked.
    worst = base_df.head(20)[cols]

worst


filtered: /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/tables/filtered_stats.csv
base rows: 1380
latency cols: ['latency_p99_us', 'latency_p999_us', 'latency_p95_us', 'latency_p50_us', 'mean_latency_us']


Unnamed: 0,run_prefix,recall_mean,latency_p99_us,latency_p999_us,latency_p95_us,latency_p50_us,mean_latency_us,ios_p99,queue_depth_p99,io_us_p99,cpu_us_p99,thread_util_p99,expanded_revisit_ratio,node_counts_top10_share
1095,S421_sift_R16_L64_B2_M2_W16_L64_K10_cache0_T1,0.9445,7518.395,9145.6846,6624.9312,5568.603,5220.8785,176.0,14.4167,7289.8574,215.979,0.9966,0.682311,0.068536
735,S421_sift_R16_L32_B2_M2_W16_L64_K10_cache0_T1,0.9279,7404.459,8699.9551,6638.4741,5557.7998,4917.369,177.0,14.4,7181.0703,239.6799,0.9964,0.684672,0.068306
705,S391_sift_R16_L32_B2_M2_W16_L48_K10_cache0_T1,0.9119,7032.7729,8254.541,6247.4521,5182.0059,4937.0853,164.0,14.1111,6826.5063,203.902,0.9966,0.690708,0.074707
1065,S391_sift_R16_L64_B2_M2_W16_L48_K10_cache0_T1,0.9301,6946.1431,8337.8506,6161.9541,5162.5332,4836.8322,164.0,14.2,6737.0718,199.903,0.9965,0.688262,0.074947
1511,S421_sift_R32_L128_B2_M2_W16_L64_K10_cache0_T1,0.9879,6740.7959,8206.8213,5942.7202,4993.874,4477.9707,152.0,14.3333,6422.5669,379.2241,0.9945,0.640561,0.055765
1961,S421_sift_R32_L64_B2_M2_W16_L64_K10_cache0_T1,0.9835,6724.833,7803.7129,5943.0591,4995.1738,4676.0479,150.0,14.3333,6405.4248,339.7908,0.9949,0.641512,0.055914
675,S361_sift_R16_L32_B2_M2_W16_L32_K10_cache0_T1,0.8857,6624.2881,7801.582,5834.9189,4850.625,4620.5927,152.0,13.8,6433.1377,181.992,0.9966,0.705147,0.082367
1861,S331_sift_R32_L64_B2_M2_W8_L32_K10_cache0_T1,0.951,6442.7739,7795.3271,5707.6382,4632.314,4262.7843,81.0,7.3,6278.3257,192.089,0.9962,0.548163,0.072221
1035,S361_sift_R16_L64_B2_M2_W16_L32_K10_cache0_T1,0.9074,6399.5908,7669.2788,5701.772,4640.835,4349.4619,151.0,13.8889,6214.1528,183.368,0.9963,0.702267,0.082482
1477,S391_sift_R32_L128_B2_M2_W16_L48_K10_cache0_T1,0.9817,6346.2661,7559.2988,5532.7832,4792.7769,4258.8955,138.0,14.125,6067.3052,336.746,0.9945,0.645811,0.062262


In [3]:
import numpy as np
import pandas as pd

# Choose the worst-case metric: the first candidate latency column present in base_df.
metric_candidates = latency_cols if latency_cols else LATENCY_PRIORITY
metric = next((m for m in metric_candidates if m in base_df.columns), None)
if metric is None:
    raise ValueError('No tail latency metric found in base_df')

# Work on a copy; +/-inf anywhere in the frame becomes NaN, and rows without a
# usable metric value are dropped.
df = base_df.copy()
df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=[metric])
# Runs at or above the WORSTCASE_PCTL quantile of the metric are flagged, so the
# flagged share is expected to be roughly 1 - WORSTCASE_PCTL.
threshold = df[metric].quantile(WORSTCASE_PCTL)
df['is_worstcase'] = df[metric] >= threshold
# Baseline worst-case rate; later cells divide group rates by it to get "lift".
overall_rate = df['is_worstcase'].mean()

# One-row summary of the worst-case definition, exported and displayed.
summary = pd.DataFrame([
    {
        'metric': metric,
        'worstcase_pctl': WORSTCASE_PCTL,
        'threshold': threshold,
        'total_runs': len(df),
        'worstcase_runs': int(df['is_worstcase'].sum()),
        'worstcase_rate': overall_rate,
    }
])
summary.to_csv(out_tables / 'worstcase_summary.csv', index=False)
summary


Unnamed: 0,metric,worstcase_pctl,threshold,total_runs,worstcase_runs,worstcase_rate
0,latency_p99_us,0.95,4780.57964,1380,69,0.05


In [4]:
# Worstcase overlap across latency metrics
# For every latency metric present in base_df, flag the runs at or above that
# metric's WORSTCASE_PCTL quantile, then compare the flag sets pairwise
# (self-pairs included, so the diagonal has jaccard 1.0 when any run is flagged).
overlap_rows = []
metrics_for_overlap = [c for c in LATENCY_PRIORITY if c in base_df.columns]
if metrics_for_overlap:
    flags = {}
    for m in metrics_for_overlap:
        # Treat +/-inf as missing for BOTH the threshold and the flag. Comparing
        # the raw column would count a +inf latency as worst-case even though it
        # is excluded from the quantile and from the main worst-case definition.
        cleaned = base_df[m].replace([np.inf, -np.inf], np.nan)
        thresh = cleaned.dropna().quantile(WORSTCASE_PCTL)
        # NaN >= thresh is False, so missing values are never flagged.
        flags[m] = cleaned >= thresh
    for i, m1 in enumerate(metrics_for_overlap):
        for m2 in metrics_for_overlap[i:]:
            set1 = flags[m1]
            set2 = flags[m2]
            both = (set1 & set2).sum()
            union = (set1 | set2).sum()
            # max(..., 1) avoids division by zero when nothing is flagged.
            jaccard = both / max(union, 1)
            overlap_rows.append({
                'metric_a': m1,
                'metric_b': m2,
                'overlap': int(both),
                'union': int(union),
                'jaccard': jaccard,
            })

overlap_df = pd.DataFrame(overlap_rows)
if not overlap_df.empty:
    overlap_df.to_csv(out_tables / 'worstcase_metric_overlap.csv', index=False)

overlap_df.head(10)


Unnamed: 0,metric_a,metric_b,overlap,union,jaccard
0,latency_p99_us,latency_p99_us,69,69,1.0
1,latency_p99_us,latency_p999_us,59,79,0.746835
2,latency_p99_us,latency_p95_us,66,72,0.916667
3,latency_p99_us,latency_p50_us,61,77,0.792208
4,latency_p99_us,mean_latency_us,63,75,0.84
5,latency_p999_us,latency_p999_us,69,69,1.0
6,latency_p999_us,latency_p95_us,57,81,0.703704
7,latency_p999_us,latency_p50_us,57,81,0.703704
8,latency_p999_us,mean_latency_us,57,81,0.703704
9,latency_p95_us,latency_p95_us,69,69,1.0


In [5]:
# Worstcase vs bottleneck types
# Work on a copy so the share columns added below do not leak into df.
bottleneck_df = df.copy()
# NOTE(review): this name is not read again in the visible cells — confirm
# whether it is still needed.
latency_for_bottleneck = metric

def add_share(df_local, suffix, lat_col, io_col, cpu_col, sort_col):
    """Add ``<kind>_share_<suffix>`` columns expressing io/cpu/sort time over latency.

    The frame is modified in place and also returned. Nothing is added when
    ``lat_col`` is missing; each share column is added only when its source
    column exists.

    Args:
        df_local: DataFrame to augment.
        suffix: Label appended to the generated column names (e.g. ``'p99'``).
        lat_col: Latency column used as the denominator.
        io_col: Source column for ``io_share_<suffix>``.
        cpu_col: Source column for ``cpu_share_<suffix>``.
        sort_col: Source column for ``sort_share_<suffix>``.

    Returns:
        The same ``df_local`` object.
    """
    if lat_col not in df_local.columns:
        return df_local
    # Latencies below 1 are raised to 1 so the division never blows up.
    denominator = df_local[lat_col].clip(lower=1)
    sources = (('io', io_col), ('cpu', cpu_col), ('sort', sort_col))
    for kind, source_col in sources:
        if source_col in df_local.columns:
            df_local[f'{kind}_share_{suffix}'] = df_local[source_col] / denominator
    return df_local

# Choose the percentile used for the bottleneck shares: try p99, then p95, then
# p50, and stop at the first one for which an io share could be computed.
# Walking the list in order means p95 is also tried when latency_p99_us exists
# but io_us_p99 does not (previously that case jumped straight to p50).
# If no percentile yields an io share, the loop ends with suffix == 'p50'.
for suffix in ('p99', 'p95', 'p50'):
    bottleneck_df = add_share(
        bottleneck_df,
        suffix,
        f'latency_{suffix}_us',
        f'io_us_{suffix}',
        f'cpu_us_{suffix}',
        f'sort_us_{suffix}',
    )
    if f'io_share_{suffix}' in bottleneck_df.columns:
        break

def classify_bottleneck(row, suffix_label):
    """Label a run by its dominant io/cpu/sort share.

    Args:
        row: Series holding the ``<kind>_share_<suffix_label>`` values of one run.
        suffix_label: Percentile suffix of the share columns to read.

    Returns:
        ``'unknown'`` when no share column is present with a non-null value,
        ``'mixed'`` when the largest share is below BOTTLENECK_SHARE_THRESHOLD,
        otherwise ``'io'``, ``'cpu'`` or ``'sort'`` (ties resolve in that order).
    """
    top_kind = None
    top_share = None
    for kind in ('io', 'cpu', 'sort'):
        column = f'{kind}_share_{suffix_label}'
        if column not in row.index:
            continue
        share = row.get(column)
        if pd.isna(share):
            continue
        # Strict comparison keeps the earliest kind on ties.
        if top_share is None or share > top_share:
            top_kind, top_share = kind, share
    if top_kind is None:
        return 'unknown'
    if top_share < BOTTLENECK_SHARE_THRESHOLD:
        return 'mixed'
    return top_kind

# Label every run with its dominant bottleneck for the selected percentile suffix.
bottleneck_df['bottleneck_type'] = bottleneck_df.apply(lambda r: classify_bottleneck(r, suffix), axis=1)

# Per bottleneck type: number of runs ('count') and worst-case rate ('mean').
bottleneck_summary = (
    bottleneck_df.groupby('bottleneck_type')['is_worstcase']
    .agg(['count','mean'])
    .reset_index()
)
# Lift = group worst-case rate relative to the overall rate; the 1e-9 floor
# guards against division by zero.
bottleneck_summary['lift'] = bottleneck_summary['mean'] / max(overall_rate, 1e-9)
bottleneck_summary.to_csv(out_tables / 'worstcase_bottleneck_types.csv', index=False)

bottleneck_summary


Unnamed: 0,bottleneck_type,count,mean,lift
0,cpu,15,0.0,0.0
1,io,1365,0.050549,1.010989


In [6]:
# Worstcase graph feature rankings
# Spearman correlation of each available graph feature with the worst-case flag
# and with the raw latency metric, ranked by |correlation with the flag|.
graph_cols = [name for name in GRAPH_FEATURES if name in df.columns]
graph_rank_rows = []
for feature in graph_cols:
    # Constant features carry no rank information.
    if df[feature].nunique() >= 2:
        valid = df[[feature, 'is_worstcase', metric]].dropna()
        if not valid.empty:
            worst_flag = valid['is_worstcase'].astype(int)
            graph_rank_rows.append({
                'feature': feature,
                'spearman_worstcase': valid[feature].corr(worst_flag, method='spearman'),
                'spearman_metric': valid[feature].corr(valid[metric], method='spearman'),
                'count': len(valid),
            })

graph_rank_df = pd.DataFrame(graph_rank_rows)
if not graph_rank_df.empty:
    graph_rank_df = (
        graph_rank_df
        .assign(abs_worstcase=graph_rank_df['spearman_worstcase'].abs())
        .sort_values('abs_worstcase', ascending=False)
    )
    graph_rank_df.to_csv(out_tables / 'worstcase_graph_feature_ranking.csv', index=False)

graph_rank_df.head(20)


Unnamed: 0,feature,spearman_worstcase,spearman_metric,count,abs_worstcase
7,expanded_per_query_p50,0.13817,0.329392,1380,0.13817
10,expanded_steps_p50,0.13817,0.329392,1380,0.13817
15,node_counts_unique,0.136551,0.352178,1380,0.136551
9,expanded_steps_mean,0.135424,0.330883,1380,0.135424
6,expanded_per_query_mean,0.135424,0.330883,1380,0.135424
16,node_counts_total,0.135424,0.330883,1380,0.135424
12,node_counts_top1_share,-0.135424,-0.330883,1380,0.135424
13,node_counts_top10_share,-0.134798,-0.425357,1380,0.134798
11,expanded_steps_p90,0.133817,0.327995,1380,0.133817
8,expanded_per_query_p90,0.133817,0.327995,1380,0.133817


In [7]:
# Build/search parameters to analyse; only those present in df are kept.
param_cols = [
    'build_R','build_L','build_B','build_M',
    'search_K','search_L','search_W','search_T',
    'cache_size',
    'vector_dim','dataset_size',
]
param_cols = [c for c in param_cols if c in df.columns]

# Worst-case rate per full parameter combination, keeping only combinations
# with at least MIN_COUNT runs; the top 50 by lift are exported.
param_patterns = None
if param_cols:
    param_patterns = (
        df.groupby(param_cols)['is_worstcase']
        .agg(['mean','count'])
        .reset_index()
    )
    param_patterns = param_patterns[param_patterns['count'] >= MIN_COUNT]
    param_patterns['lift'] = param_patterns['mean'] / max(overall_rate, 1e-9)
    param_patterns = param_patterns.sort_values(['lift','mean','count'], ascending=False)
    param_patterns.head(50).to_csv(out_tables / 'worstcase_param_patterns.csv', index=False)

# Runtime metrics that are binned into quantiles further down; only those
# present in df are kept.
metric_cols = [
    'ios_p99','queue_depth_p99','io_us_p99','cpu_us_p99','thread_util_p99',
    'expanded_revisit_ratio','node_counts_top10_share','iostat_aqu-sz_mean',
]
metric_cols = [c for c in metric_cols if c in df.columns]

# One row per (parameter, value) pair with its worst-case rate and lift.
feature_rows = []
for c in param_cols:
    grouped = df.groupby(c)['is_worstcase'].agg(['mean','count']).reset_index()
    grouped = grouped[grouped['count'] >= MIN_COUNT]
    for _, row in grouped.iterrows():
        feature_rows.append({
            'feature': c,
            'value': row[c],
            'worstcase_rate': row['mean'],
            'count': int(row['count']),
            'lift': row['mean'] / max(overall_rate, 1e-9),
        })

def make_bins(series, bins=4):
    """Quantile-bin ``series`` into up to ``bins`` buckets.

    Args:
        series: Values to bin.
        bins: Requested number of quantile buckets; duplicate edges are dropped,
            so fewer buckets may result.

    Returns:
        The result of ``pd.qcut``, or ``None`` when the series has fewer than
        two distinct values or binning fails for any reason.
    """
    if series.nunique() < 2:
        return None
    try:
        binned = pd.qcut(series, q=bins, duplicates='drop')
    except Exception:
        # Best-effort: a feature that cannot be binned is skipped by the caller.
        binned = None
    return binned

# Worst-case rate per quantile bin of each runtime metric, appended to the
# per-parameter rows collected above.
for c in metric_cols:
    binned = make_bins(df[c])
    if binned is None:
        continue
    # The key is categorical; passing observed=False explicitly keeps the
    # current behaviour and avoids the pandas FutureWarning about the changing
    # default. Empty bins are dropped by the MIN_COUNT filter whenever
    # MIN_COUNT is at least 1.
    grouped = (
        df.groupby(binned, observed=False)['is_worstcase']
        .agg(['mean', 'count'])
        .reset_index()
    )
    grouped = grouped[grouped['count'] >= MIN_COUNT]
    for _, row in grouped.iterrows():
        feature_rows.append({
            'feature': f'{c}_bin',
            # The bin column carries the name of the binned series.
            'value': str(row[binned.name]),
            'worstcase_rate': row['mean'],
            'count': int(row['count']),
            'lift': row['mean'] / max(overall_rate, 1e-9),
        })

feature_patterns = pd.DataFrame(feature_rows)
if not feature_patterns.empty:
    feature_patterns = feature_patterns.sort_values(['lift','worstcase_rate','count'], ascending=False)
    feature_patterns.head(100).to_csv(out_tables / 'worstcase_feature_patterns.csv', index=False)

feature_patterns.head(20)


  grouped = df.groupby(binned)['is_worstcase'].agg(['mean','count']).reset_index()
  grouped = df.groupby(binned)['is_worstcase'].agg(['mean','count']).reset_index()
  grouped = df.groupby(binned)['is_worstcase'].agg(['mean','count']).reset_index()
  grouped = df.groupby(binned)['is_worstcase'].agg(['mean','count']).reset_index()
  grouped = df.groupby(binned)['is_worstcase'].agg(['mean','count']).reset_index()
  grouped = df.groupby(binned)['is_worstcase'].agg(['mean','count']).reset_index()
  grouped = df.groupby(binned)['is_worstcase'].agg(['mean','count']).reset_index()
  grouped = df.groupby(binned)['is_worstcase'].agg(['mean','count']).reset_index()


Unnamed: 0,feature,value,worstcase_rate,count,lift
39,io_us_p99_bin,"(3188.415, 7289.857]",0.2,345,4.0
47,thread_util_p99_bin,"(0.992, 0.997]",0.165698,344,3.313953
23,search_T,1.0,0.15,460,3.0
17,search_L,64.0,0.133333,105,2.666667
31,ios_p99_bin,"(68.0, 177.0]",0.115044,339,2.300885
42,cpu_us_p99_bin,"(152.566, 256.531]",0.095652,345,1.913043
22,search_W,16.0,0.095238,315,1.904762
35,queue_depth_p99_bin,"(8.0, 16.0]",0.095238,315,1.904762
52,node_counts_top10_share_bin,"(0.041999999999999996, 0.0741]",0.092754,345,1.855072
51,expanded_revisit_ratio_bin,"(0.597, 0.705]",0.086957,345,1.73913


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score

# Candidate predictors for the worst-case classifier; only columns present in
# df are used.
# NOTE(review): several of these (io_us_p99, cpu_us_p99, thread_util_p99, ...)
# are measured on the same runs as the latency metric that defines the label,
# so a near-perfect score may reflect leakage rather than predictive
# parameters — confirm before relying on the importances.
feature_cols = [
    'build_R','build_L','build_B','build_M',
    'search_K','search_L','search_W','search_T',
    'cache_size','vector_dim','dataset_size',
    'out_degree_p99','expanded_revisit_ratio','node_counts_top10_share',
    'ios_p99','queue_depth_p99','io_us_p99','cpu_us_p99','thread_util_p99',
    'iostat_aqu-sz_mean',
]
feature_cols = [c for c in feature_cols if c in df.columns]

# These stay None when the model cannot be trained (no features, a single
# class, too few rows, or a class too small to stratify).
model_metrics = None
feature_importance = None
predictions = None

if feature_cols:
    # Rows with any missing feature are excluded from modelling.
    model_df = df.dropna(subset=feature_cols + ['is_worstcase']).copy()
    X = model_df[feature_cols]
    y = model_df['is_worstcase'].astype(int)

    # A stratified split needs at least two members in every class; with few
    # rows the worst-case class can contain a single run, which would make
    # train_test_split raise. Skip the model in that case.
    class_counts = y.value_counts()
    can_stratify = len(class_counts) > 1 and class_counts.min() >= 2

    if can_stratify and len(model_df) >= 20:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Prefer xgboost when it can be imported and constructed; otherwise
        # fall back to a random forest.
        model = None
        try:
            import xgboost as xgb
            model = xgb.XGBClassifier(
                n_estimators=300,
                max_depth=6,
                learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
            )
        except Exception as e:
            print('xgboost not available for classification:', e)

        if model is None:
            from sklearn.ensemble import RandomForestClassifier
            model = RandomForestClassifier(n_estimators=300, random_state=42)

        model.fit(X_train, y_train)
        # Probability of the positive (worst-case) class on the held-out split.
        proba = model.predict_proba(X_test)[:, 1]
        pred = (proba >= 0.5).astype(int)

        model_metrics = pd.DataFrame([
            {
                'roc_auc': roc_auc_score(y_test, proba),
                'avg_precision': average_precision_score(y_test, proba),
                'accuracy': accuracy_score(y_test, pred),
                'worstcase_rate': float(y.mean()),
                'num_samples': int(len(model_df)),
            }
        ])
        model_metrics.to_csv(out_tables / 'worstcase_model_metrics.csv', index=False)

        if hasattr(model, 'feature_importances_'):
            feature_importance = pd.DataFrame({
                'feature': feature_cols,
                'importance': model.feature_importances_,
            }).sort_values('importance', ascending=False)
            feature_importance.to_csv(out_tables / 'worstcase_feature_importance.csv', index=False)

        # Scores for every modelled run; these include the training rows, so
        # they are not out-of-sample predictions.
        full_proba = model.predict_proba(X)[:, 1]
        predictions = model_df[['run_prefix', metric, 'is_worstcase']].copy()
        predictions['worstcase_score'] = full_proba
        predictions.to_csv(out_tables / 'worstcase_predictions.csv', index=False)

model_metrics


Unnamed: 0,roc_auc,avg_precision,accuracy,worstcase_rate,num_samples
0,1.0,1.0,1.0,0.05,1380


In [9]:
# Spearman correlation of each numeric feature with the worst-case flag and
# with the latency metric.
numeric_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(df[c])]
direction_rows = []
for c in numeric_cols:
    # Constant columns have no defined rank correlation.
    if df[c].nunique() < 2:
        continue
    corr_worst = df[c].corr(df['is_worstcase'].astype(int), method='spearman')
    corr_metric = df[c].corr(df[metric], method='spearman')
    if pd.isna(corr_worst):
        continue
    # Positive correlation with the worst-case flag -> suggest lowering the
    # feature; zero or negative -> suggest raising it.
    direction = 'decrease' if corr_worst > 0 else 'increase'
    direction_rows.append({
        'feature': c,
        'corr_worstcase': corr_worst,
        'corr_metric': corr_metric,
        'suggested_direction': direction,
    })

# Rank by absolute correlation with the worst-case flag and export.
direction_df = pd.DataFrame(direction_rows)
if not direction_df.empty:
    direction_df['abs_corr'] = direction_df['corr_worstcase'].abs()
    direction_df = direction_df.sort_values('abs_corr', ascending=False)
    direction_df.to_csv(out_tables / 'worstcase_directional_signals.csv', index=False)

# For each parameter, record the value with the lowest and the value with the
# highest worst-case rate, among values seen at least MIN_COUNT times.
best_values_rows = []
for c in param_cols:
    grouped = df.groupby(c)['is_worstcase'].agg(['mean','count']).reset_index()
    grouped = grouped[grouped['count'] >= MIN_COUNT]
    if grouped.empty:
        continue
    best = grouped.sort_values('mean').head(1)
    # Note: this rebinds the notebook-level name `worst`, which an earlier cell
    # uses for the worst-runs table.
    worst = grouped.sort_values('mean', ascending=False).head(1)
    best_values_rows.append({
        'feature': c,
        'best_value': best.iloc[0][c],
        'best_worstcase_rate': best.iloc[0]['mean'],
        'best_count': int(best.iloc[0]['count']),
        'worst_value': worst.iloc[0][c],
        'worst_worstcase_rate': worst.iloc[0]['mean'],
        'worst_count': int(worst.iloc[0]['count']),
    })

best_values_df = pd.DataFrame(best_values_rows)
if not best_values_df.empty:
    best_values_df.to_csv(out_tables / 'worstcase_param_value_extremes.csv', index=False)

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances

# For each (sampled) worst-case run, find the closest non-worst-case run in
# standardized feature space and report the median per-feature difference
# (nearest non-worst value minus worst-case value).
suggested_moves = None
if numeric_cols:
    # Drop rows with a missing feature before computing distances:
    # pairwise_distances rejects NaN input, and the classifier cell applies the
    # same row filter to these columns.
    work_df = df.dropna(subset=numeric_cols).reset_index(drop=True)
    worst_idx = work_df.index[work_df['is_worstcase']].to_numpy()
    non_worst_idx = work_df.index[~work_df['is_worstcase']].to_numpy()
    if worst_idx.size > 0 and non_worst_idx.size > 0:
        # Fixed seed keeps the subsample reproducible.
        rng = np.random.default_rng(42)
        if worst_idx.size > MAX_WORST_SAMPLES:
            worst_idx = rng.choice(worst_idx, size=MAX_WORST_SAMPLES, replace=False)
        # Standardize so that no single feature dominates the distance.
        X_scaled = StandardScaler().fit_transform(work_df[numeric_cols])
        distances = pairwise_distances(X_scaled[worst_idx], X_scaled[non_worst_idx])
        # Position (within non_worst_idx) of the nearest non-worst run per worst run.
        nearest = distances.argmin(axis=1)
        nearest_df = work_df.loc[non_worst_idx].iloc[nearest][numeric_cols].reset_index(drop=True)
        worst_df = work_df.loc[worst_idx, numeric_cols].reset_index(drop=True)
        deltas = nearest_df - worst_df
        suggested_moves = deltas.median().to_frame('median_delta')
        suggested_moves['direction'] = suggested_moves['median_delta'].apply(
            lambda v: 'increase' if v > 0 else ('decrease' if v < 0 else 'neutral')
        )
        suggested_moves = suggested_moves.reset_index().rename(columns={'index': 'feature'})
        suggested_moves.to_csv(out_tables / 'worstcase_suggested_moves.csv', index=False)

direction_df.head(20)


Unnamed: 0,feature,corr_worstcase,corr_metric,suggested_direction,abs_corr
10,io_us_p99,0.376949,0.988764,decrease,0.376949
12,thread_util_p99,0.290078,0.754756,decrease,0.290078
4,search_T,-0.280976,-0.641852,increase,0.280976
8,ios_p99,0.190722,0.371436,decrease,0.190722
11,cpu_us_p99,0.158397,0.229627,decrease,0.158397
2,search_L,0.148605,0.381464,decrease,0.148605
3,search_W,0.144206,0.323657,decrease,0.144206
7,node_counts_top10_share,-0.134798,-0.425357,increase,0.134798
6,expanded_revisit_ratio,0.125031,0.286819,decrease,0.125031
9,queue_depth_p99,0.090218,0.304885,decrease,0.090218


In [10]:
# Worstcase vs recall buckets
# Worst-case rate and lift per recall quintile (fewer bins when quantile edges
# coincide). The result stays an empty frame when recall is unavailable or
# binning fails.
recall_bucket_df = pd.DataFrame()
if 'recall_mean' in df.columns:
    temp = df[['recall_mean', 'is_worstcase']].dropna()
    if not temp.empty:
        try:
            temp['recall_bin'] = pd.qcut(temp['recall_mean'], q=5, duplicates='drop')
            # recall_bin is categorical; observed=False is passed explicitly to
            # keep the current grouping behaviour and avoid the pandas
            # FutureWarning about the changing default.
            recall_bucket_df = (
                temp.groupby('recall_bin', observed=False)['is_worstcase']
                .agg(['count','mean'])
                .reset_index()
            )
            recall_bucket_df['lift'] = recall_bucket_df['mean'] / max(overall_rate, 1e-9)
            recall_bucket_df.to_csv(out_tables / 'worstcase_recall_bins.csv', index=False)
        except Exception as e:
            # Best-effort section: report the failure and continue with an empty table.
            print('recall binning failed:', e)

recall_bucket_df


  temp.groupby('recall_bin')['is_worstcase']


Unnamed: 0,recall_bin,count,mean,lift
0,"(0.7, 0.813]",285,0.0,0.0
1,"(0.813, 0.893]",300,0.016667,0.333333
2,"(0.893, 0.941]",255,0.066667,1.333333
3,"(0.941, 0.979]",270,0.1,2.0
4,"(0.979, 0.999]",270,0.074074,1.481481


In [11]:
# Worstcase distribution summary
# Median / p90 / p99 of the latency metric for worst-case and non-worst-case
# runs; rows are produced only when both groups are non-empty.
dist_rows = []
if metric in df.columns:
    worst_mask = df['is_worstcase']
    worst_vals = df.loc[worst_mask, metric]
    non_vals = df.loc[~worst_mask, metric]
    if not (worst_vals.empty or non_vals.empty):
        labelled_groups = (('worstcase', worst_vals), ('non_worstcase', non_vals))
        dist_rows.extend(
            {
                'group': label,
                'count': len(values),
                'median': values.median(),
                'p90': values.quantile(0.9),
                'p99': values.quantile(0.99),
            }
            for label, values in labelled_groups
        )

dist_df = pd.DataFrame(dist_rows)
if not dist_df.empty:
    dist_df.to_csv(out_tables / 'worstcase_distribution_summary.csv', index=False)

dist_df


Unnamed: 0,group,count,median,p90,p99
0,worstcase,69,5367.853,6479.07674,7440.91852
1,non_worstcase,1311,1383.13,3964.3149,4622.75259


In [12]:
# Worstcase rate by topk coverage (if available)
# Joins the worst-case flag onto the top-k table by run_prefix, then reports the
# worst-case rate and lift per quartile bin of every column ending in '_topk'.
topk_df = topk_df.copy()
topk_rate_df = pd.DataFrame()
if 'run_prefix' in df.columns and 'run_prefix' in topk_df.columns:
    # NOTE(review): if topk_df holds several rows per run_prefix, this left
    # merge repeats the run's flag once per row and weights that run more
    # heavily — confirm the table has one row per run.
    merged_topk = df[['run_prefix', 'is_worstcase']].merge(topk_df, on='run_prefix', how='left')
    topk_cols = [c for c in merged_topk.columns if c.endswith('_topk')]
    rows = []
    for col in topk_cols:
        series = merged_topk[[col, 'is_worstcase']].dropna()
        if series.empty or series[col].nunique() < 2:
            continue
        try:
            series['bin'] = pd.qcut(series[col], q=4, duplicates='drop')
        except Exception:
            # Columns that cannot be quantile-binned are skipped.
            continue
        # 'bin' is categorical; observed=False is passed explicitly to keep the
        # current grouping behaviour and avoid the pandas FutureWarning about
        # the changing default.
        grouped = (
            series.groupby('bin', observed=False)['is_worstcase']
            .agg(['count','mean'])
            .reset_index()
        )
        grouped['feature'] = col
        grouped['lift'] = grouped['mean'] / max(overall_rate, 1e-9)
        rows.append(grouped)
    if rows:
        topk_rate_df = pd.concat(rows, ignore_index=True)
        topk_rate_df.to_csv(out_tables / 'worstcase_topk_bins.csv', index=False)

topk_rate_df.head(10)
