# 01 Basic Tradeoff Plots

Recall/latency and IO/latency tradeoffs.


# 說明

本 notebook 產生 recall/latency 與系統指標的 tradeoff 圖表與摘要表，並輸出 Pareto frontier。

## 主要輸入
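- `tables/filtered_stats.csv`（存在時優先）
- `collected_stats_*.csv`、`collected_topk_*.csv`（位於 `COLLECT_DIR`，取檔名排序最後的一份；一律讀取並用於 raw 圖，`filtered_stats.csv` 不存在時也作為分析基底）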

## 主要輸出
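- `figures/recall_vs_*`、`figures/<feature>_vs_<latency>_{raw,filtered}.png`
- `tables/pareto_recall_latency.csv`
- `tables/tradeoff_by_params.csv`
- `tables/tradeoff_recall_bins_*.csv`
- `tables/latency_correlation_*.csv`
- `tables/latency_topk_correlation_*.csv`（有符合的 top-k 欄位時才輸出）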

## 合理性檢查建議
- Pareto frontier 點數應合理（不為 0）
- 不同 K 的分析需用環境變數 `FILTER_SEARCH_K`（預設 `10`；設為空字串則不篩選）控制一致性


In [1]:
from pathlib import Path
import os
import re
import pandas as pd
import numpy as np

# --- Locations ---------------------------------------------------------------
# Root of the analysis outputs, resolved against the current working directory.
ANALYZE_DIR = Path('../outputFiles/analyze').resolve()

# Sub-directory names come from the environment; the collection prefix falls
# back to the report prefix when it is not set separately.
REPORT_PREFIX = os.getenv('REPORT_PREFIX', 'analysis_reports')
COLLECT_PREFIX = os.getenv('COLLECT_PREFIX', REPORT_PREFIX)
REPORT_DIR = ANALYZE_DIR.joinpath(REPORT_PREFIX)
COLLECT_DIR = ANALYZE_DIR.joinpath(COLLECT_PREFIX)

# Explicit input files; set either one to a specific file path if needed.
STATS_CSV = None
TOPK_CSV = None

# --- Plot settings (overridable through the environment) ---------------------
PLOT_MAX_POINTS = int(os.getenv('PLOT_MAX_POINTS', '20000'))
PLOT_ALPHA = float(os.getenv('PLOT_ALPHA', '0.6'))
# Log-scaled latency axes are on unless the variable is exactly '0'.
PLOT_LOG_LATENCY = not (os.getenv('PLOT_LOG_LATENCY', '1') == '0')

# --- Column names ------------------------------------------------------------
RECALL_COL = 'recall_mean'

# Latency columns in priority order.
LATENCY_PRI_COLS = [
    'latency_p99_us',
    'latency_p95_us',
    'latency_p50_us',
    'mean_latency_us',
    'latency_p0_us',
    'latency_p999_us',
]

# Value of search_K to keep, as a string.
FILTER_SEARCH_K = os.getenv('FILTER_SEARCH_K', '10')

# Candidate columns for coloring scatter points.
COLOR_CANDIDATES = ['search_W', 'search_L', 'search_T', 'search_K']

# Feature columns plotted against latency and recall.
TRADEOFF_FEATURES = [
    'ios_p99',
    'queue_depth_p99',
    'io_us_p99',
    'cpu_us_p99',
    'thread_util_p99',
    'expanded_revisit_ratio',
    'node_counts_top10_share',
    'out_degree_p99',
    'iostat_aqu-sz_mean',
]

def pick_latest(pattern):
    """Return the last path, in sorted order, matching *pattern* in COLLECT_DIR.

    Args:
        pattern: glob pattern evaluated relative to ``COLLECT_DIR``.

    Returns:
        The greatest matching path under normal path ordering.

    Raises:
        FileNotFoundError: if the pattern matches nothing.
    """
    candidates = list(COLLECT_DIR.glob(pattern))
    if candidates:
        # max() picks the same element that sorted(...)[-1] would.
        return max(candidates)
    raise FileNotFoundError(f'No files matched: {pattern}')

def downsample_df(df, max_points=PLOT_MAX_POINTS, seed=42):
    """Cap *df* at *max_points* rows by seeded random sampling.

    Args:
        df: DataFrame to limit.
        max_points: maximum number of rows to keep.
        seed: random state passed to ``DataFrame.sample``.

    Returns:
        *df* itself when it already fits, otherwise a sample of exactly
        *max_points* rows.
    """
    over_limit = len(df) > max_points
    return df.sample(n=max_points, random_state=seed) if over_limit else df

def safe_scatter(df, x, y, color=None, title=None, out_path=None, cmap='viridis'):
    """Draw a scatter plot of ``df[y]`` against ``df[x]`` and optionally save it.

    The call prints a notice and does nothing when either axis column is
    missing, or when no row has values in all selected columns.

    Args:
        df: DataFrame holding the data.
        x: column name for the horizontal axis.
        y: column name for the vertical axis; names ending in ``_us`` get a
            log-scaled axis when ``PLOT_LOG_LATENCY`` is set.
        color: optional column used to color the points; ignored when it is
            not a column of *df*.
        title: optional axes title.
        out_path: optional destination; the figure is saved at 150 dpi.
        cmap: colormap name forwarded to the pandas scatter call.

    Returns:
        None. The figure is always closed before returning.
    """
    if x not in df.columns or y not in df.columns:
        print('skip scatter missing columns:', x, y)
        return
    use_color = bool(color) and color in df.columns
    # dict.fromkeys de-duplicates while keeping order, so a color column that
    # is also an axis column is not selected twice.
    cols = list(dict.fromkeys([x, y] + ([color] if use_color else [])))
    plot_df = df[cols].dropna()
    if plot_df.empty:
        print('skip scatter empty data:', x, y)
        return
    plot_df = downsample_df(plot_df)

    import matplotlib.pyplot as plt

    ax = plot_df.plot.scatter(
        x=x,
        y=y,
        c=color if use_color else None,
        cmap=cmap,
        alpha=PLOT_ALPHA,
        figsize=(6, 4),
    )
    # Work on the figure this call created rather than pyplot's implicit
    # "current figure", and close it even if layout or saving fails so that
    # looping over many plots cannot accumulate open figures.
    fig = ax.get_figure()
    try:
        if title:
            ax.set_title(title)
        if PLOT_LOG_LATENCY and y.endswith('_us'):
            ax.set_yscale('log')
        fig.tight_layout()
        if out_path:
            fig.savefig(out_path, dpi=150)
    finally:
        plt.close(fig)

def pareto_frontier(df, recall_col, latency_col):
    """Return the rows that are Pareto-optimal for high recall and low latency.

    A row is kept when no other row has recall at least as high together with
    latency at least as low. Among rows that tie on recall only the one with
    the lowest latency can be kept.

    Args:
        df: DataFrame with the measurements.
        recall_col: name of the column to maximise.
        latency_col: name of the column to minimise.

    Returns:
        A DataFrame with ``recall_col``, ``latency_col`` and, when present in
        *df*, ``run_prefix``, sorted by ascending recall. An empty DataFrame
        is returned when a metric column is missing or no complete row exists.
    """
    if recall_col not in df.columns or latency_col not in df.columns:
        return pd.DataFrame()
    cols = [recall_col, latency_col]
    # run_prefix is carried along only as a label, so its absence must not
    # prevent computing the frontier.
    if 'run_prefix' in df.columns:
        cols.append('run_prefix')
    subset = df[cols].dropna().copy()
    if subset.empty:
        return pd.DataFrame()
    # Secondary sort on latency: within a recall tie the cheapest row comes
    # first, so the slower duplicates can never pass the strict test below.
    subset = subset.sort_values(
        [recall_col, latency_col], ascending=[False, True], kind='mergesort'
    )
    latency = subset[latency_col].to_numpy(dtype=float)
    # Best (lowest) latency among all rows with higher-or-equal recall that
    # precede each row; +inf for the first row.
    best_before = np.concatenate(([np.inf], np.minimum.accumulate(latency)[:-1]))
    on_frontier = latency < best_before
    return subset[on_frontier].sort_values(recall_col)

# Resolve the two input CSVs: an explicit override wins, otherwise the last
# matching collected file (in sorted order) is used.
if STATS_CSV:
    stats_path = Path(STATS_CSV)
else:
    stats_path = pick_latest('collected_stats_*.csv')

if TOPK_CSV:
    topk_path = Path(TOPK_CSV)
else:
    topk_path = pick_latest('collected_topk_*.csv')

for _label, _chosen in (('stats:', stats_path), ('topk :', topk_path)):
    print(_label, _chosen)

stats_df = pd.read_csv(stats_path)
topk_df = pd.read_csv(topk_path)


stats: /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/collected_stats_sift01_20260107_195000.csv
topk : /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/collected_topk_sift01_20260107_195000.csv


In [2]:
# Attach the top-k columns to each stats row; both tables must expose the
# run_prefix key, otherwise the stats are used on their own.
_both_keyed = all('run_prefix' in frame.columns for frame in (stats_df, topk_df))
if _both_keyed:
    merged = stats_df.merge(
        topk_df,
        on='run_prefix',
        how='left',
        suffixes=('', '_topk'),
    )
else:
    print('skip merge: run_prefix missing')
    merged = stats_df.copy()

merged.head()


Unnamed: 0,id,dataset_name,data_type,build_R,build_L,build_B,build_M,search_K,search_L,search_W,...,topk_degree_p25,topk_degree_p50,topk_degree_p75,topk_degree_p90,topk_degree_p95,topk_degree_p99,topk_cover_ratio,topk_neighbors_path,topk_nodes_path,summary_stats_path
0,1,sift,float,128,256,2.0,2.0,10,10,2,...,128.0,128.0,128.0,128.0,128.0,128.0,0.203047,/home/gt/research/DiskANN/scripts/paramAnalysi...,/home/gt/research/DiskANN/scripts/paramAnalysi...,/home/gt/research/DiskANN/scripts/paramAnalysi...
1,2,sift,float,128,256,2.0,2.0,10,10,2,...,128.0,128.0,128.0,128.0,128.0,128.0,0.203047,/home/gt/research/DiskANN/scripts/paramAnalysi...,/home/gt/research/DiskANN/scripts/paramAnalysi...,/home/gt/research/DiskANN/scripts/paramAnalysi...
2,3,sift,float,128,256,2.0,2.0,10,10,2,...,128.0,128.0,128.0,128.0,128.0,128.0,0.203047,/home/gt/research/DiskANN/scripts/paramAnalysi...,/home/gt/research/DiskANN/scripts/paramAnalysi...,/home/gt/research/DiskANN/scripts/paramAnalysi...
3,4,sift,float,128,256,2.0,2.0,10,10,2,...,128.0,128.0,128.0,128.0,128.0,128.0,0.203047,/home/gt/research/DiskANN/scripts/paramAnalysi...,/home/gt/research/DiskANN/scripts/paramAnalysi...,/home/gt/research/DiskANN/scripts/paramAnalysi...
4,5,sift,float,128,256,2.0,2.0,10,10,2,...,128.0,128.0,128.0,128.0,128.0,128.0,0.203047,/home/gt/research/DiskANN/scripts/paramAnalysi...,/home/gt/research/DiskANN/scripts/paramAnalysi...,/home/gt/research/DiskANN/scripts/paramAnalysi...


In [3]:
# Use the filtered stats table when it exists on disk.
filtered_path = REPORT_DIR.joinpath('tables', 'filtered_stats.csv')
if filtered_path.exists():
    filtered_df = pd.read_csv(filtered_path)
    print('filtered:', filtered_path)
else:
    filtered_df = None
    print('filtered not found:', filtered_path)

# Without a filtered table the merged stats/top-k frame is the analysis base.
base_df = merged if filtered_df is None else filtered_df

def apply_search_k_filter(df, value):
    """Keep only the rows whose ``search_K`` equals ``int(value)``.

    Args:
        df: DataFrame to filter.
        value: requested K, typically a string taken from the environment.
            A falsy value (``None``, ``''``) disables the filter.

    Returns:
        *df* unchanged when the filter is disabled, when *df* has no
        ``search_K`` column, or when *value* is not an integer; otherwise a
        copy holding only the matching rows.
    """
    if not value or 'search_K' not in df.columns:
        return df
    try:
        target = int(value)
    except (TypeError, ValueError):
        # Keep going rather than failing on a bad setting, but say so:
        # silently skipping the filter would mix different K values into
        # every downstream plot and table.
        print('skip search_K filter: not an integer:', repr(value))
        return df
    return df[df['search_K'] == target].copy()

# Restrict every frame used downstream to the configured search_K.
base_df = apply_search_k_filter(base_df, FILTER_SEARCH_K)
merged = apply_search_k_filter(merged, FILTER_SEARCH_K)
if filtered_df is not None:
    filtered_df = apply_search_k_filter(filtered_df, FILTER_SEARCH_K)

# Preferred latency columns first; if none of them exist, fall back to any
# column named latency_*_us.
latency_cols = [c for c in LATENCY_PRI_COLS if c in base_df.columns] or [
    c for c in base_df.columns
    if c.startswith('latency_') and c.endswith('_us')
]

# The first available latency column drives the frontier and correlations.
latency_primary = next(iter(latency_cols), None)

print('base_df rows:', len(base_df))
print('latency cols:', latency_cols)

# First candidate present in the data is used to color the scatter plots.
color_col = None
for _candidate in COLOR_CANDIDATES:
    if _candidate in base_df.columns:
        color_col = _candidate
        break
print('color column:', color_col)


filtered: /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/tables/filtered_stats.csv
base_df rows: 1380
latency cols: ['latency_p99_us', 'latency_p95_us', 'latency_p50_us', 'mean_latency_us', 'latency_p0_us', 'latency_p999_us']
color column: search_W


In [4]:
import matplotlib.pyplot as plt

fig_dir = REPORT_DIR.joinpath('figures')
fig_dir.mkdir(parents=True, exist_ok=True)

# (label, frame) pairs to plot; the filtered table is optional.
scatter_inputs = [('raw', merged)]
if filtered_df is not None:
    scatter_inputs.append(('filtered', filtered_df))

# One recall-vs-latency scatter per latency column and per input frame.
for lat in latency_cols:
    for variant, frame in scatter_inputs:
        safe_scatter(
            frame,
            RECALL_COL,
            lat,
            color=color_col,
            title=f'Recall vs {lat} ({variant})',
            out_path=fig_dir / f'recall_vs_{lat}_{variant}.png',
        )

print('Saved figures to', fig_dir)


Saved figures to /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/figures


In [5]:
import matplotlib.pyplot as plt
fig_dir = REPORT_DIR.joinpath('figures')
fig_dir.mkdir(parents=True, exist_ok=True)

# (label, frame) pairs to plot; the filtered table is optional.
feature_inputs = [('raw', merged)]
if filtered_df is not None:
    feature_inputs.append(('filtered', filtered_df))

# Feature-vs-latency scatters for every feature/latency pair and input frame.
for feat in TRADEOFF_FEATURES:
    for lat in latency_cols:
        for variant, frame in feature_inputs:
            safe_scatter(
                frame,
                feat,
                lat,
                color=color_col,
                title=f'{feat} vs {lat} ({variant})',
                out_path=fig_dir / f'{feat}_vs_{lat}_{variant}.png',
                cmap='plasma',
            )

# Recall-vs-feature scatters on the analysis base frame.
for feat in TRADEOFF_FEATURES:
    safe_scatter(
        base_df,
        RECALL_COL,
        feat,
        color=color_col,
        title=f'Recall vs {feat}',
        out_path=fig_dir / f'recall_vs_{feat}.png',
        cmap='magma',
    )

print('Saved figures to', fig_dir)


Saved figures to /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/figures


In [6]:
# Pareto frontier on recall vs latency
out_tables = (REPORT_DIR / 'tables')
out_tables.mkdir(parents=True, exist_ok=True)

if latency_primary is None:
    print('No latency column available for Pareto frontier')
    frontier_df = pd.DataFrame()
else:
    frontier_df = pareto_frontier(base_df, RECALL_COL, latency_primary)

if not frontier_df.empty:
    frontier_df.to_csv(out_tables / 'pareto_recall_latency.csv', index=False)
    plot_df = base_df[[RECALL_COL, latency_primary]].dropna()
    plot_df = downsample_df(plot_df)
    import matplotlib.pyplot as plt
    # Create the figures directory here too, so this cell does not depend on
    # an earlier plotting cell having been run first.
    fig_dir = (REPORT_DIR / 'figures')
    fig_dir.mkdir(parents=True, exist_ok=True)
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        # All points as background, frontier drawn on top as a red line.
        ax.scatter(plot_df[RECALL_COL], plot_df[latency_primary], s=10, alpha=PLOT_ALPHA)
        ax.plot(frontier_df[RECALL_COL], frontier_df[latency_primary], color='#d62728', linewidth=2)
        ax.set_title('Recall vs Latency with Pareto Frontier')
        ax.set_xlabel(RECALL_COL)
        ax.set_ylabel(latency_primary)
        if PLOT_LOG_LATENCY:
            ax.set_yscale('log')
        fig.tight_layout()
        fig.savefig(fig_dir / f'recall_vs_{latency_primary}_with_frontier.png', dpi=150)
    finally:
        # Close the figure even if drawing or saving failed.
        plt.close(fig)
else:
    print('Pareto frontier not available')


In [7]:
# Aggregated tradeoff tables
param_cols = [
    'build_R','build_L','build_B','build_M',
    'search_K','search_L','search_W','search_T','cache_size',
]
param_cols = [c for c in param_cols if c in base_df.columns]
metrics = [c for c in [RECALL_COL] + latency_cols if c in base_df.columns]

# Per-parameter-combination summary of recall and every latency column.
tradeoff_by_params = pd.DataFrame()
if param_cols and metrics:
    agg_map = {m: ['median', 'mean', 'min', 'max'] for m in metrics}
    grouped = base_df.groupby(param_cols)
    tradeoff_by_params = grouped.agg(agg_map)
    # Flatten the (metric, statistic) column pairs into 'metric_statistic'.
    tradeoff_by_params.columns = ['_'.join(col).strip() for col in tradeoff_by_params.columns]
    # Same grouping object, assigned by index: counts are matched to their
    # group by key instead of relying on two groupbys sharing an order.
    tradeoff_by_params['count'] = grouped.size()
    tradeoff_by_params = tradeoff_by_params.reset_index()
    tradeoff_by_params = tradeoff_by_params.sort_values('count', ascending=False)
    tradeoff_by_params.to_csv(out_tables / 'tradeoff_by_params.csv', index=False)

# Primary-latency summary over ten equal-width recall bins.
if RECALL_COL in base_df.columns and latency_primary is not None:
    recall_min = base_df[RECALL_COL].min()
    recall_max = base_df[RECALL_COL].max()
    if pd.isna(recall_min) or recall_min == recall_max:
        # No rows, or a single recall value: the bin edges would be NaN or
        # all identical and pd.cut would raise.
        print('skip recall bins: recall has no spread')
    else:
        bins = np.linspace(recall_min, recall_max, 11)
        recall_bins = base_df.copy()
        recall_bins['recall_bin'] = pd.cut(recall_bins[RECALL_COL], bins=bins, include_lowest=True)
        recall_bin_summary = (
            # observed=False is spelled out: it keeps empty bins in the
            # table and avoids the pandas warning about the changing default.
            recall_bins.groupby('recall_bin', observed=False)[[latency_primary]]
            .agg(['median','mean','min','max','count'])
            .reset_index()
        )
        recall_bin_summary.columns = [
            'recall_bin',
            'latency_median','latency_mean','latency_min','latency_max','count',
        ]
        recall_bin_summary.to_csv(out_tables / f'tradeoff_recall_bins_{latency_primary}.csv', index=False)

tradeoff_by_params.head(5)


  recall_bins.groupby('recall_bin')[[latency_primary]]


Unnamed: 0,build_R,build_L,build_B,build_M,search_K,search_L,search_W,search_T,recall_mean_median,recall_mean_mean,...,mean_latency_us_max,latency_p0_us_median,latency_p0_us_mean,latency_p0_us_min,latency_p0_us_max,latency_p999_us_median,latency_p999_us_mean,latency_p999_us_min,latency_p999_us_max,count
56,32,64,2.0,2.0,10,10,2,16,0.7965,0.7965,...,852.0319,31.199,111.570333,27.732,454.982,1216.649,1506.907533,1109.488,3930.761,15
86,32,128,2.0,2.0,10,10,1,16,0.7856,0.7856,...,842.8429,29.395,119.635533,26.61,499.586,1657.886,1764.67918,1229.7729,4437.1318,15
85,32,128,2.0,2.0,10,10,1,4,0.7856,0.7856,...,829.3928,26.71,110.818867,25.658,465.492,1867.73,1796.912333,1424.969,1990.049,15
52,32,64,2.0,2.0,10,10,1,4,0.7767,0.7767,...,906.8202,26.089,116.497533,25.037,505.758,1771.778,1678.233667,1343.887,1909.066,15
51,32,64,2.0,2.0,10,10,1,1,0.7767,0.7767,...,2687.9921,27.321,244.496667,25.789,1207.972,5353.6958,5366.940953,4933.4292,5707.0869,15


In [8]:
# Correlation analysis vs latency
corr_df = pd.DataFrame()
if latency_primary is not None and latency_primary in base_df.columns:
    num_cols = [c for c in base_df.columns if pd.api.types.is_numeric_dtype(base_df[c])]
    rows = []
    for col in num_cols:
        if col == latency_primary:
            continue
        # Pairwise-complete rows only.
        series = base_df[[col, latency_primary]].dropna()
        if series.empty:
            continue
        if series[col].nunique() < 2 or series[latency_primary].nunique() < 2:
            # A constant column has zero variance, so both coefficients are
            # undefined. Record NaN directly instead of calling corr(), which
            # yields the same NaN but emits a runtime warning per column.
            pearson = spearman = np.nan
        else:
            pearson = series[col].corr(series[latency_primary], method='pearson')
            spearman = series[col].corr(series[latency_primary], method='spearman')
        rows.append({
            'feature': col,
            'pearson': pearson,
            'spearman': spearman,
            'count': len(series),
        })
    # Explicit columns keep the sort valid when no feature produced a row.
    corr_df = pd.DataFrame(
        rows, columns=['feature', 'pearson', 'spearman', 'count']
    ).sort_values('spearman', ascending=False)
    corr_df.to_csv(out_tables / f'latency_correlation_{latency_primary}.csv', index=False)

corr_df.head(10)


  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]


  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[None, :]
  return spearmanr(a, b)[0]
  c /= stddev[:, None]
  c /= stddev[N

Unnamed: 0,feature,pearson,spearman,count
37,latency_p95_us,0.992646,0.994345,1380
36,latency_p90_us,0.989202,0.990474,1380
62,io_us_p99,0.994453,0.988764,1380
35,latency_p75_us,0.983195,0.982779,1380
61,io_us_p95,0.987619,0.980968,1380
27,mean_latency_us,0.980885,0.980903,1380
60,io_us_p90,0.984547,0.97643,1380
28,log_mean_latency_us,0.97312,0.974948,1380
34,latency_p50_us,0.972633,0.973524,1380
59,io_us_p75,0.979023,0.9645,1380


In [9]:
# Top-k signal correlations (if present)
topk_corr_df = pd.DataFrame()
if latency_primary is not None and latency_primary in merged.columns:
    # The merge only appends '_topk' to names that collide with a stats
    # column; the top-k signals themselves are named 'topk_*', so match both.
    # Non-numeric columns (e.g. file paths) cannot be correlated and are
    # left out.
    topk_cols = [
        c for c in merged.columns
        if c != latency_primary
        and (c.startswith('topk_') or c.endswith('_topk'))
        and pd.api.types.is_numeric_dtype(merged[c])
    ]
    rows = []
    for col in topk_cols:
        series = merged[[col, latency_primary]].dropna()
        if series.empty:
            continue
        if series[col].nunique() < 2 or series[latency_primary].nunique() < 2:
            # Rank correlation is undefined for a constant column; record NaN
            # without triggering a constant-input warning.
            spearman = np.nan
        else:
            spearman = series[col].corr(series[latency_primary], method='spearman')
        rows.append({
            'feature': col,
            'spearman': spearman,
            'count': len(series),
        })
    if rows:
        topk_corr_df = pd.DataFrame(rows).sort_values('spearman', ascending=False)
        topk_corr_df.to_csv(out_tables / f'latency_topk_correlation_{latency_primary}.csv', index=False)

topk_corr_df.head(10)
