# 03 Graph Structure Analysis

Relate out-degree, node hotness, and revisit ratio to tail latency.


# 說明

本 notebook 聚焦圖結構特徵（out-degree、revisit、hot nodes）與延遲的關係。

## 主要輸入
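- `tables/filtered_stats.csv`（若此檔不存在，則改用 `collected_stats_*.csv` 作為分析基礎）
- `collected_stats_*.csv`、`collected_topk_*.csv`（自動選取 `COLLECT_DIR` 下依檔名排序的最後一份）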

## 主要輸出
- `tables/graph_corr_*.csv`
- `tables/graph_feature_summary.csv`
- `tables/graph_interactions_*.csv`

## 合理性檢查建議
- 相關性應符合直覺（例如 revisit ratio 上升時延遲增加）
- 交互作用範圍若極大，需檢查資料分布


In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np

# --- Directory layout -------------------------------------------------------
# ANALYZE_DIR is resolved relative to the current working directory.
# REPORT_PREFIX / COLLECT_PREFIX select sub-directories beneath it and can be
# overridden through environment variables of the same name.
ANALYZE_DIR = Path('../outputFiles/analyze').resolve()
REPORT_PREFIX = os.getenv('REPORT_PREFIX', 'analysis_reports')
COLLECT_PREFIX = os.getenv('COLLECT_PREFIX', REPORT_PREFIX)
REPORT_DIR = ANALYZE_DIR / REPORT_PREFIX
COLLECT_DIR = ANALYZE_DIR / COLLECT_PREFIX

# Explicit input overrides; when left as None the newest matching file under
# COLLECT_DIR is picked instead.
STATS_CSV = None  # set to a specific file path if needed
TOPK_CSV = None   # set to a specific file path if needed

# --- Plot / filter knobs (all overridable via environment variables) --------
PLOT_MAX_POINTS = int(os.getenv('PLOT_MAX_POINTS', '20000'))
PLOT_ALPHA = float(os.getenv('PLOT_ALPHA', '0.6'))
# Any value other than the literal string '0' enables the log scale.
PLOT_LOG_LATENCY = os.getenv('PLOT_LOG_LATENCY', '1') != '0'
# Kept as a string here; it is converted to int when the filter is applied.
FILTER_SEARCH_K = os.getenv('FILTER_SEARCH_K', '10')

# Latency columns in order of preference; the first one present in the data
# is used as the primary latency metric.
LATENCY_PRIORITY = [
    'latency_p99_us', 'latency_p95_us', 'latency_p50_us', 'mean_latency_us',
]

# Graph-structure columns that are plotted against latency when present.
GRAPH_FEATURES = [
    # out-degree statistics
    'out_degree_mean',
    'out_degree_p50',
    'out_degree_p90',
    'out_degree_p95',
    'out_degree_p99',
    # expansion / revisit statistics
    'expanded_revisit_ratio',
    'expanded_per_query_mean',
    'expanded_per_query_p50',
    'expanded_per_query_p90',
    'expanded_steps_mean',
    'expanded_steps_p50',
    'expanded_steps_p90',
    # node visit-count concentration
    'node_counts_top1_share',
    'node_counts_top10_share',
    'node_counts_top100_share',
    'node_counts_unique',
    'node_counts_total',
]

def pick_latest(pattern):
    """Return the last path (in sorted order) under COLLECT_DIR matching *pattern*.

    "Latest" here means greatest by path ordering, not by modification time;
    presumably the file names embed a sortable timestamp -- confirm against
    the producer of these files.

    Raises:
        FileNotFoundError: if no file under COLLECT_DIR matches *pattern*.
    """
    newest = max(COLLECT_DIR.glob(pattern), default=None)
    if newest is None:
        raise FileNotFoundError(f'No files matched: {pattern}')
    return newest

def downsample_df(df, max_points=PLOT_MAX_POINTS, seed=42):
    """Cap *df* at *max_points* rows for plotting.

    Frames that already fit are returned as-is (same object); larger frames
    are reduced to a random sample of exactly *max_points* rows, made
    reproducible through *seed*.
    """
    too_large = len(df) > max_points
    return df.sample(n=max_points, random_state=seed) if too_large else df

def safe_scatter(df, x, y, color=None, title=None, out_path=None, cmap='viridis'):
    """Draw a scatter plot of column *y* against column *x*, skipping bad input.

    Args:
        df: DataFrame holding the data.
        x: Name of the column for the horizontal axis.
        y: Name of the column for the vertical axis.
        color: Optional column name used to colour the points. Ignored when
            it is falsy or not a column of *df*.
        title: Optional axes title.
        out_path: Optional destination; when given the figure is saved there
            at 150 dpi.
        cmap: Colormap name, only applied when a colour column is in use.

    Returns:
        None. The function prints a message and returns early when *x* or *y*
        is missing from *df*, or when no rows remain after dropping NaNs.
    """
    if x not in df.columns or y not in df.columns:
        print('skip scatter missing columns:', x, y)
        return

    use_color = bool(color) and color in df.columns
    # dict.fromkeys de-duplicates while keeping order, so a colour column that
    # coincides with x or y does not select the same column twice.
    cols = list(dict.fromkeys([x, y] + ([color] if use_color else [])))
    plot_df = df[cols].dropna()
    if plot_df.empty:
        print('skip scatter empty data:', x, y)
        return
    plot_df = downsample_df(plot_df)

    scatter_kwargs = {'x': x, 'y': y, 'alpha': PLOT_ALPHA, 'figsize': (6, 4)}
    if use_color:
        # Passing cmap without colour data makes matplotlib emit
        # "No data for colormapping provided via 'c'" on every call, so the
        # colormap is only forwarded together with the colour column.
        scatter_kwargs['c'] = color
        scatter_kwargs['cmap'] = cmap

    import matplotlib.pyplot as plt

    ax = plot_df.plot.scatter(**scatter_kwargs)
    fig = ax.get_figure()
    try:
        if title:
            ax.set_title(title)
        if PLOT_LOG_LATENCY and y.endswith('_us'):
            ax.set_yscale('log')
        fig.tight_layout()
        if out_path:
            fig.savefig(out_path, dpi=150)
    finally:
        # Close this specific figure (not merely the "current" one) even if
        # saving fails, so figures do not accumulate across many calls.
        plt.close(fig)

def apply_search_k_filter(df, value):
    """Keep only the rows of *df* whose ``search_K`` equals ``int(value)``.

    The input frame is returned untouched (same object) when *value* is
    falsy, when *df* has no ``search_K`` column, or when *value* cannot be
    parsed as an integer. Otherwise a filtered copy is returned.
    """
    can_filter = bool(value) and 'search_K' in df.columns
    if not can_filter:
        return df

    try:
        wanted_k = int(value)
    except ValueError:
        # Best effort: an unparsable filter value leaves the data unfiltered.
        return df

    matches = df['search_K'] == wanted_k
    return df.loc[matches].copy()

# Resolve the two input CSVs: an explicit override wins, otherwise take the
# last matching collected file under COLLECT_DIR.
if STATS_CSV:
    stats_path = Path(STATS_CSV)
else:
    stats_path = pick_latest('collected_stats_*.csv')

if TOPK_CSV:
    topk_path = Path(TOPK_CSV)
else:
    topk_path = pick_latest('collected_topk_*.csv')

# Report which files were chosen before loading them.
for _tag, _chosen in (('stats:', stats_path), ('topk :', topk_path)):
    print(_tag, _chosen)

stats_df = pd.read_csv(stats_path)
topk_df = pd.read_csv(topk_path)


stats: /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/collected_stats_sift01_20260107_195000.csv
topk : /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/collected_topk_sift01_20260107_195000.csv


In [2]:
# Prefer the pre-filtered table under REPORT_DIR/tables; when it is absent the
# raw collected stats act as the analysis base.
filtered_path = REPORT_DIR / 'tables' / 'filtered_stats.csv'
if filtered_path.exists():
    filtered_df = pd.read_csv(filtered_path)
    print('filtered:', filtered_path)
else:
    filtered_df = None
    print('filtered not found:', filtered_path)

# NOTE: when the filtered file exists, base_df and filtered_df are both derived
# from the same frame through the same search_K filter, so they hold the same
# rows.
base_df = stats_df if filtered_df is None else filtered_df
base_df = apply_search_k_filter(base_df, FILTER_SEARCH_K)
stats_df = apply_search_k_filter(stats_df, FILTER_SEARCH_K)
if filtered_df is not None:
    filtered_df = apply_search_k_filter(filtered_df, FILTER_SEARCH_K)

# Keep only the columns that actually exist in the data, preserving the
# preference order of the configured lists.
latency_cols = [name for name in LATENCY_PRIORITY if name in base_df.columns]
latency_primary = next(iter(latency_cols), None)

graph_cols = [name for name in GRAPH_FEATURES if name in base_df.columns]

print('base_df rows:', len(base_df))
print('latency cols:', latency_cols)
print('graph cols:', graph_cols)


filtered: /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/tables/filtered_stats.csv
base_df rows: 1380
latency cols: ['latency_p99_us', 'latency_p95_us', 'latency_p50_us', 'mean_latency_us']
graph cols: ['out_degree_mean', 'out_degree_p50', 'out_degree_p90', 'out_degree_p95', 'out_degree_p99', 'expanded_revisit_ratio', 'expanded_per_query_mean', 'expanded_per_query_p50', 'expanded_per_query_p90', 'expanded_steps_mean', 'expanded_steps_p50', 'expanded_steps_p90', 'node_counts_top1_share', 'node_counts_top10_share', 'node_counts_top100_share', 'node_counts_unique', 'node_counts_total']


In [3]:
# Work on a copy so that changes made to df do not write through to base_df.
df = base_df.copy()


In [4]:
import matplotlib.pyplot as plt

# One scatter per graph feature against the primary latency column, written
# to REPORT_DIR/figures for the base frame and, if loaded, the filtered frame.
fig_dir = REPORT_DIR / 'figures'
fig_dir.mkdir(parents=True, exist_ok=True)

if latency_primary is not None:
    # (label, frame) pairs; the label appears in both the title and file name.
    datasets = [('base', df)]
    if filtered_df is not None:
        datasets.append(('filtered', filtered_df))

    for feat in graph_cols:
        for label, frame in datasets:
            safe_scatter(
                frame,
                feat,
                latency_primary,
                title=f'{feat} vs {latency_primary} ({label})',
                out_path=fig_dir / f'{feat}_vs_{latency_primary}_{label}.png',
            )

print('Saved figures to', fig_dir)


  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(


  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(


  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(


  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(


  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(


  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(


  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(


  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(


  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(


  scatter = ax.scatter(
  scatter = ax.scatter(
  scatter = ax.scatter(


Saved figures to /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/figures


  scatter = ax.scatter(
  scatter = ax.scatter(


In [5]:
import matplotlib.pyplot as plt

# Scatter plots of hot-node share columns and of the revisit ratio against the
# primary latency column, for the base frame and, if loaded, the filtered one.
fig_dir = REPORT_DIR / 'figures'
fig_dir.mkdir(parents=True, exist_ok=True)

if latency_primary is not None:
    # (label, frame) pairs; the label appears in both the title and file name.
    datasets = [('base', df)]
    if filtered_df is not None:
        datasets.append(('filtered', filtered_df))

    hot_candidates = (
        'node_counts_top1_share',
        'node_counts_top10_share',
        'node_counts_top100_share',
    )
    hot_cols = [name for name in hot_candidates if name in df.columns]
    for col in hot_cols:
        for label, frame in datasets:
            safe_scatter(
                frame,
                col,
                latency_primary,
                title=f'{col} vs {latency_primary} ({label})',
                out_path=fig_dir / f'{col}_vs_{latency_primary}_{label}.png',
            )

    # The revisit ratio gets a friendlier title and a shorter file name.
    for label, frame in datasets:
        safe_scatter(
            frame,
            'expanded_revisit_ratio',
            latency_primary,
            title=f'Revisit ratio vs {latency_primary} ({label})',
            out_path=fig_dir / f'revisit_ratio_vs_{latency_primary}_{label}.png',
        )

print('Saved figures to', fig_dir)


  scatter = ax.scatter(


  scatter = ax.scatter(
  scatter = ax.scatter(


  scatter = ax.scatter(


  scatter = ax.scatter(
  scatter = ax.scatter(


  scatter = ax.scatter(


  scatter = ax.scatter(


Saved figures to /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/figures
