# 02 Bottleneck Attribution

Compute IO/CPU/Sort shares and visualize IO-bound vs CPU-bound behavior.


# 說明

本 notebook 以 IO/CPU/Sort 佔比分析瓶頸，並連結延遲與系統指標。

## 主要輸入
- `tables/filtered_stats.csv`（若不存在，則改用最新的 `collected_stats_*.csv`）

## 主要輸出
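- `tables/bottleneck_summary_*.csv`
- `tables/bottleneck_worst_runs_*.csv`
- `tables/bottleneck_by_params.csv`
- `tables/component_balance.csv`
- `tables/bottleneck_flags.csv`
- `tables/bottleneck_correlation_*.csv`
- `figures/*share*`、`figures/queue_vs_*`、`figures/*_vs_latency.png`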

## 合理性檢查建議
- `component_balance.csv` 的 residual 不應長期為負或過大
- `bottleneck_flags.csv` 過高的飽和比例需解釋


In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np

# --- Locations ---------------------------------------------------------------
# Root of the analysis artefacts, resolved against the current working directory.
ANALYZE_DIR = Path('../outputFiles/analyze').resolve()

# Sub-directory names come from the environment; the collection prefix falls
# back to the report prefix when it is not set explicitly.
REPORT_PREFIX = os.environ.get('REPORT_PREFIX', 'analysis_reports')
COLLECT_PREFIX = os.environ.get('COLLECT_PREFIX', REPORT_PREFIX)
REPORT_DIR = ANALYZE_DIR.joinpath(REPORT_PREFIX)
COLLECT_DIR = ANALYZE_DIR.joinpath(COLLECT_PREFIX)

# Optional explicit input files; None means "pick the latest match in COLLECT_DIR".
STATS_CSV = None  # set to a specific file path if needed
TOPK_CSV = None   # set to a specific file path if needed

# --- Plot / filter knobs (all overridable through environment variables) ------
PLOT_MAX_POINTS = int(os.environ.get('PLOT_MAX_POINTS', '20000'))
PLOT_ALPHA = float(os.environ.get('PLOT_ALPHA', '0.6'))
# Log-scaled latency axes are on unless the variable is exactly '0'.
PLOT_LOG_LATENCY = not (os.environ.get('PLOT_LOG_LATENCY', '1') == '0')
# Kept as a string here; apply_search_k_filter converts it to int.
FILTER_SEARCH_K = os.environ.get('FILTER_SEARCH_K', '27')
BOTTLENECK_SHARE_THRESHOLD = float(os.environ.get('BOTTLENECK_SHARE_THRESHOLD', '0.5'))

# --- Column naming tables -----------------------------------------------------
# (tag, latency column) pairs, listed in the order latency columns are preferred.
_LATENCY_BY_TAG = (
    ('p99', 'latency_p99_us'),
    ('p95', 'latency_p95_us'),
    ('p50', 'latency_p50_us'),
    ('mean', 'mean_latency_us'),
)

# Latency columns in priority order.
LATENCY_PRIORITY = [latency for _tag, latency in _LATENCY_BY_TAG]

# One (suffix, latency, io, cpu, sort) column tuple per statistic.
SHARE_SPECS = [
    (tag, latency, f'io_us_{tag}', f'cpu_us_{tag}', f'sort_us_{tag}')
    for tag, latency in _LATENCY_BY_TAG
]

def pick_latest(pattern):
    """Return the match of *pattern* under COLLECT_DIR that sorts last.

    "Latest" is decided purely by path ordering, not by modification time.

    Raises:
        FileNotFoundError: when the glob matches nothing.
    """
    candidates = list(COLLECT_DIR.glob(pattern))
    if candidates:
        return max(candidates)
    raise FileNotFoundError(f'No files matched: {pattern}')

def downsample_df(df, max_points=PLOT_MAX_POINTS, seed=42):
    """Return *df* unchanged, or a seeded random sample of *max_points* rows.

    Sampling only happens when the frame has more than *max_points* rows; the
    fixed *seed* keeps the selection reproducible between runs.
    """
    too_large = len(df) > max_points
    return df.sample(n=max_points, random_state=seed) if too_large else df

def safe_scatter(df, x, y, color=None, title=None, out_path=None, cmap='viridis'):
    """Draw a scatter plot of *y* against *x*, skipping quietly when impossible.

    Args:
        df: frame holding the data.
        x, y: column names for the two axes.
        color: optional column used to colour the points. Numeric columns are
            used as-is; anything else is mapped to category codes. Ignored
            when the column is not present in *df*.
        title: optional axes title.
        out_path: optional destination passed to ``savefig``.
        cmap: colormap name, only applied when colour data is available.

    Returns:
        None. A message is printed and nothing is drawn when a required column
        is missing or no rows survive ``dropna``.
    """
    if x not in df.columns or y not in df.columns:
        print('skip scatter missing columns:', x, y)
        return
    use_color = bool(color) and color in df.columns
    # dict.fromkeys de-duplicates while keeping order, so colouring by the x
    # or y column does not select that column twice.
    cols = list(dict.fromkeys([x, y] + ([color] if use_color else [])))
    plot_df = df[cols].dropna()
    if plot_df.empty:
        print('skip scatter empty data:', x, y)
        return
    plot_df = downsample_df(plot_df)

    scatter_kwargs = {'x': x, 'y': y, 'alpha': PLOT_ALPHA, 'figsize': (6, 4)}
    if use_color:
        color_series = plot_df[color]
        if not pd.api.types.is_numeric_dtype(color_series):
            color_series = color_series.astype('category').cat.codes
        # Pass the colormap only together with colour data; supplying cmap
        # without c just produces a "cmap will be ignored" warning per plot.
        scatter_kwargs['c'] = color_series
        scatter_kwargs['cmap'] = cmap

    import matplotlib.pyplot as plt

    ax = plot_df.plot.scatter(**scatter_kwargs)
    # Work on the figure that owns these axes rather than pyplot's "current"
    # figure, and always release it, even when saving fails.
    fig = ax.get_figure()
    try:
        if title:
            ax.set_title(title)
        if PLOT_LOG_LATENCY and y.endswith('_us'):
            ax.set_yscale('log')
        fig.tight_layout()
        if out_path:
            fig.savefig(out_path, dpi=150)
    finally:
        plt.close(fig)

def apply_search_k_filter(df, value):
    """Keep only the rows whose ``search_K`` equals ``int(value)``.

    The input frame is returned untouched (same object) when *value* is empty,
    when the frame has no ``search_K`` column, or when *value* cannot be parsed
    as an integer. Otherwise a filtered copy is returned.
    """
    if value and 'search_K' in df.columns:
        try:
            wanted_k = int(value)
        except ValueError:
            # Unparseable filter value: fall through and return df as-is.
            pass
        else:
            matches = df['search_K'] == wanted_k
            return df[matches].copy()
    return df

def add_share_columns(df, suffix, latency_col, io_col, cpu_col, sort_col):
    """Add ``io/cpu/sort_share_<suffix>`` columns to *df* in place and return it.

    Each share is the component column divided by the latency column, with the
    latency clipped to a minimum of 1 so the division never hits zero.
    Components that are absent are skipped; if the latency column itself is
    absent the frame is returned without changes.
    """
    if latency_col in df.columns:
        floored_latency = df[latency_col].clip(lower=1)
        component_sources = (('io', io_col), ('cpu', cpu_col), ('sort', sort_col))
        for label, source in component_sources:
            if source in df.columns:
                df[f'{label}_share_{suffix}'] = df[source] / floored_latency
    return df

# Resolve the two input files: an explicit override wins, otherwise the match
# that sorts last in COLLECT_DIR is used.
if STATS_CSV:
    stats_path = Path(STATS_CSV)
else:
    stats_path = pick_latest('collected_stats_*.csv')

if TOPK_CSV:
    topk_path = Path(TOPK_CSV)
else:
    topk_path = pick_latest('collected_topk_*.csv')

print('stats:', stats_path)
print('topk :', topk_path)

# Read both tables into memory.
stats_df, topk_df = pd.read_csv(stats_path), pd.read_csv(topk_path)


stats: /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/collected_stats_sift01_20260107_195000.csv
topk : /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/collected_topk_sift01_20260107_195000.csv


In [2]:
# Use the pre-filtered table when a previous step has written one.
filtered_path = REPORT_DIR / 'tables' / 'filtered_stats.csv'
if filtered_path.exists():
    filtered_df = pd.read_csv(filtered_path)
    print('filtered:', filtered_path)
else:
    filtered_df = None
    print('filtered not found:', filtered_path)

# The working frame prefers the filtered table and falls back to raw stats.
# Each frame is filtered by its own call so none of them share an object.
base_df = apply_search_k_filter(
    stats_df if filtered_df is None else filtered_df,
    FILTER_SEARCH_K,
)
stats_df = apply_search_k_filter(stats_df, FILTER_SEARCH_K)
if filtered_df is not None:
    filtered_df = apply_search_k_filter(filtered_df, FILTER_SEARCH_K)

# Latency columns that exist, in priority order; the first one is primary.
latency_cols = [name for name in LATENCY_PRIORITY if name in base_df.columns]
latency_primary = next(iter(latency_cols), None)

print('base_df rows:', len(base_df))
print('latency cols:', latency_cols)


filtered: /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/tables/filtered_stats.csv
base_df rows: 1380
latency cols: ['latency_p99_us', 'latency_p95_us', 'latency_p50_us', 'mean_latency_us']


In [3]:
def _add_component_balance(frame, suffix, lat_col, component_cols):
    """Add ``component_sum_<suffix>`` / ``residual_<suffix>`` to *frame* in place.

    The sum covers whichever of *component_cols* exist in the frame; the
    residual is the latency column minus that sum. Nothing is added when the
    latency column or all component columns are missing.
    """
    if lat_col not in frame.columns:
        return
    present = [c for c in component_cols if c in frame.columns]
    if not present:
        return
    sum_col = f'component_sum_{suffix}'
    frame[sum_col] = frame[present].sum(axis=1)
    frame[f'residual_{suffix}'] = frame[lat_col] - frame[sum_col]


# Work on a copy so base_df itself is left untouched.
df = base_df.copy()

# Pass 1: share columns for every statistic. Kept as a separate pass so the
# resulting column order matches what downstream tables already contain.
for spec in SHARE_SPECS:
    df = add_share_columns(df, *spec)
    if filtered_df is not None:
        filtered_df = add_share_columns(filtered_df, *spec)

# Pass 2: component sums and residuals. Each frame is checked on its own, so a
# column missing from one frame cannot raise a KeyError for the other.
for suffix, lat_col, io_col, cpu_col, sort_col in SHARE_SPECS:
    component_cols = (io_col, cpu_col, sort_col)
    _add_component_balance(df, suffix, lat_col, component_cols)
    if filtered_df is not None:
        _add_component_balance(filtered_df, suffix, lat_col, component_cols)

def classify_bottleneck(row, suffix):
    """Label a row by its largest latency share for the given *suffix*.

    Returns:
        'unknown' when no io/cpu/sort share is present and non-null,
        'mixed' when the largest share is below BOTTLENECK_SHARE_THRESHOLD,
        otherwise 'io', 'cpu' or 'sort' (the first of these wins on a tie).
    """
    top_name = None
    top_share = None
    for name in ('io', 'cpu', 'sort'):
        column = f'{name}_share_{suffix}'
        if column not in row.index:
            continue
        share = row.get(column)
        if pd.isna(share):
            continue
        # Strict comparison keeps the earliest component when shares are equal.
        if top_name is None or share > top_share:
            top_name, top_share = name, share

    if top_name is None:
        return 'unknown'
    return 'mixed' if top_share < BOTTLENECK_SHARE_THRESHOLD else top_name

# Label every row with its p99 bottleneck as soon as at least one p99 share
# column is available in the working frame.
if {'io_share_p99', 'cpu_share_p99', 'sort_share_p99'} & set(df.columns):
    df['bottleneck_p99'] = df.apply(classify_bottleneck, axis=1, args=('p99',))
    if filtered_df is not None:
        filtered_df['bottleneck_p99'] = filtered_df.apply(
            classify_bottleneck, axis=1, args=('p99',)
        )


In [4]:
import matplotlib.pyplot as plt

fig_dir = REPORT_DIR / 'figures'
fig_dir.mkdir(parents=True, exist_ok=True)

# IO vs CPU share at p99: once for the working frame, once for the filtered one.
_p99_frames = [('base', df)]
if filtered_df is not None:
    _p99_frames.append(('filtered', filtered_df))
for _label, _frame in _p99_frames:
    safe_scatter(
        _frame,
        'io_share_p99',
        'cpu_share_p99',
        title=f'IO vs CPU share (p99) ({_label})',
        out_path=fig_dir / f'io_cpu_share_p99_{_label}.png',
    )

# IO and CPU share against Sort share, for every statistic.
for suffix in ('p99', 'p95', 'p50', 'mean'):
    _sort_share = f'sort_share_{suffix}'
    for _prefix, _pretty in (('io', 'IO'), ('cpu', 'CPU')):
        safe_scatter(
            df,
            f'{_prefix}_share_{suffix}',
            _sort_share,
            title=f'{_pretty} vs Sort share ({suffix})',
            out_path=fig_dir / f'{_prefix}_sort_share_{suffix}.png',
        )

print('Saved figures to', fig_dir)


  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(


  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(


Saved figures to /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/figures


  scatter = ax.scatter(


In [5]:
import matplotlib.pyplot as plt
fig_dir = REPORT_DIR / 'figures'
fig_dir.mkdir(parents=True, exist_ok=True)

# Queue depth vs the primary latency column, coloured by bottleneck label
# when that label exists on the frame being plotted.
if latency_primary is not None:
    _queue_frames = [('base', df)]
    if filtered_df is not None:
        _queue_frames.append(('filtered', filtered_df))
    for _label, _frame in _queue_frames:
        safe_scatter(
            _frame,
            'queue_depth_p99',
            latency_primary,
            color='bottleneck_p99' if 'bottleneck_p99' in _frame.columns else None,
            title=f'Queue Depth vs {latency_primary} ({_label})',
            out_path=fig_dir / f'queue_vs_{latency_primary}_{_label}.png',
        )

# Remaining plots use the primary latency column, or p99 when none was found.
_latency_axis = latency_primary or 'latency_p99_us'

# System-level metrics against latency.
for col in ('iostat_%util_mean', 'iostat_aqu-sz_mean', 'thread_util_p99'):
    safe_scatter(
        df,
        col,
        _latency_axis,
        title=f'{col} vs latency',
        out_path=fig_dir / f'{col}_vs_latency.png',
        cmap='magma',
    )

# IO and CPU shares against latency, for every statistic.
for suffix in ('p99', 'p95', 'p50', 'mean'):
    for _prefix, _pretty in (('io', 'IO'), ('cpu', 'CPU')):
        safe_scatter(
            df,
            f'{_prefix}_share_{suffix}',
            _latency_axis,
            title=f'{_pretty} share {suffix} vs latency',
            out_path=fig_dir / f'{_prefix}_share_{suffix}_vs_latency.png',
        )

print('Saved figures to', fig_dir)


  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(


  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(


  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(


Saved figures to /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/figures


  scatter = ax.scatter(
  scatter = ax.scatter(


In [6]:
# Bottleneck summaries
out_tables = REPORT_DIR / 'tables'
out_tables.mkdir(parents=True, exist_ok=True)

summary_rows = []
bottleneck_runs = pd.DataFrame()
_has_labels = 'bottleneck_p99' in df.columns

if _has_labels and latency_primary is not None:
    # Latency statistics per bottleneck class.
    grouped = df.groupby('bottleneck_p99')
    summary = (
        grouped[latency_primary]
        .agg(['count', 'median', 'mean', 'min', 'max'])
        .reset_index()
    )
    summary.to_csv(out_tables / f'bottleneck_summary_{latency_primary}.csv', index=False)
    summary_rows.append(summary)

    # The 50 rows with the highest primary latency.
    bottleneck_runs = df.sort_values(latency_primary, ascending=False).head(50)
    bottleneck_runs.to_csv(out_tables / f'bottleneck_worst_runs_{latency_primary}.csv', index=False)

# Parameter columns that are actually present in the working frame.
_known_params = (
    'build_R', 'build_L', 'build_B', 'build_M',
    'search_K', 'search_L', 'search_W', 'search_T', 'cache_size',
)
param_cols = [name for name in _known_params if name in df.columns]

if _has_labels and param_cols:
    # Row count per (parameter combination, bottleneck class), largest first.
    _group_sizes = df.groupby([*param_cols, 'bottleneck_p99']).size()
    bottleneck_by_params = _group_sizes.reset_index(name='count').sort_values(
        'count', ascending=False
    )
    bottleneck_by_params.to_csv(out_tables / 'bottleneck_by_params.csv', index=False)

summary_rows[0].head(10) if summary_rows else None


Unnamed: 0,bottleneck_p99,count,median,mean,min,max
0,cpu,15,1235.885,1479.305667,950.75,2308.845
1,io,1365,1487.9969,2174.682599,538.499,7518.395


In [7]:
# Component balance and residual analysis
component_rows = []
for suffix, lat_col, io_col, cpu_col, sort_col in SHARE_SPECS:
    comp_sum_col, res_col = f'component_sum_{suffix}', f'residual_{suffix}'
    # A statistic is reported only when its latency, sum and residual columns
    # all exist in the working frame.
    if not {lat_col, comp_sum_col, res_col}.issubset(df.columns):
        continue
    _sums, _residuals = df[comp_sum_col], df[res_col]
    component_rows.append(dict(
        suffix=suffix,
        latency_col=lat_col,
        component_sum_median=_sums.median(),
        component_sum_mean=_sums.mean(),
        residual_median=_residuals.median(),
        residual_mean=_residuals.mean(),
        residual_p99=_residuals.quantile(0.99),
    ))

component_balance_df = pd.DataFrame(component_rows)
if len(component_balance_df.index) and len(component_balance_df.columns):
    component_balance_df.to_csv(out_tables / 'component_balance.csv', index=False)

component_balance_df


Unnamed: 0,suffix,latency_col,component_sum_median,component_sum_mean,residual_median,residual_mean,residual_p99
0,p99,latency_p99_us,1488.6804,2178.561486,-2.884,-11.437333,28.7961
1,p95,latency_p95_us,1292.5889,1798.27139,7.54295,6.845821,49.866866
2,p50,latency_p50_us,923.37355,1313.657276,15.5175,22.288069,78.95746
3,mean,mean_latency_us,956.57415,1300.632792,14.2452,20.838601,66.836405


In [8]:
# Bottleneck flags for saturation
flag_rows = []
if latency_primary is not None:
    temp = df.copy()

    # (flag column, metric column, fixed cutoff). A cutoff of None means
    # "use the metric's own 95th percentile".
    _flag_rules = (
        ('flag_queue_depth_high', 'queue_depth_p99', None),
        ('flag_iostat_util_high', 'iostat_%util_mean', 90),
        ('flag_thread_util_high', 'thread_util_p99', 0.95),
    )
    for _flag, _metric, _cutoff in _flag_rules:
        if _metric not in temp.columns:
            continue
        if _cutoff is None:
            _cutoff = temp[_metric].quantile(0.95)
        temp[_flag] = temp[_metric] >= _cutoff

    # Report every flag column that ended up on the frame.
    flag_rows.extend(
        {
            'flag': _rule[0],
            'count': int(temp[_rule[0]].sum()),
            'rate': float(temp[_rule[0]].mean()),
        }
        for _rule in _flag_rules
        if _rule[0] in temp.columns
    )

flag_df = pd.DataFrame(flag_rows)
if len(flag_df.index) and len(flag_df.columns):
    flag_df.to_csv(out_tables / 'bottleneck_flags.csv', index=False)

flag_df


Unnamed: 0,flag,count,rate
0,flag_queue_depth_high,90,0.065217
1,flag_iostat_util_high,351,0.254348
2,flag_thread_util_high,1377,0.997826


In [9]:
# Correlations with latency and shares
# Spearman rank correlation of every numeric column against the primary
# latency column, computed on the rows where both values are present.
corr_rows = []
if latency_primary is not None:
    num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    for col in num_cols:
        if col == latency_primary:
            continue
        series = df[[col, latency_primary]].dropna()
        if series.empty:
            continue
        if series[col].nunique() < 2 or series[latency_primary].nunique() < 2:
            # Rank correlation is undefined when either side is constant
            # (e.g. a parameter pinned by the search_K filter). Record NaN
            # directly instead of triggering one warning per such column.
            rho = np.nan
        else:
            rho = series[col].corr(series[latency_primary], method='spearman')
        corr_rows.append({
            'feature': col,
            'spearman': rho,
            'count': len(series),
        })

corr_df = pd.DataFrame(corr_rows)
if not corr_df.empty:
    # Highest positive correlation first; NaN entries sort to the end.
    corr_df = corr_df.sort_values('spearman', ascending=False)
    corr_df.to_csv(out_tables / f'bottleneck_correlation_{latency_primary}.csv', index=False)

corr_df.head(10)


  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]
  return spearmanr(a

Unnamed: 0,feature,spearman,count
275,component_sum_p99,0.999618,1380
277,component_sum_p95,0.994358,1380
37,latency_p95_us,0.994345,1380
36,latency_p90_us,0.990474,1380
62,io_us_p99,0.988764,1380
35,latency_p75_us,0.982779,1380
61,io_us_p95,0.980968,1380
27,mean_latency_us,0.980903,1380
281,component_sum_mean,0.980701,1380
60,io_us_p90,0.97643,1380
