# 04 Surrogate Model (XGBoost / RandomForest)

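Train a simple surrogate model per latency target (`latency_p99_us`, `latency_p95_us`, `latency_p50_us`, `mean_latency_us`), predicting the log of each target. XGBoost is used when available; otherwise a RandomForest is used as a fallback.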


# 說明

本 notebook 訓練 surrogate model（XGBoost/RandomForest）預測延遲指標，輸出重要特徵與殘差分析。

## 主要輸入
- `tables/filtered_stats.csv`

## 主要輸出
- `tables/model_metrics.csv`
- `tables/model_feature_importance_*.csv`
- `tables/model_residual_summary.csv`

## 合理性檢查建議
- `r2` 不應為明顯負值
- `baseline_mae_raw` 應高於模型的 `mae_raw`


In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np

# ---- Locations -------------------------------------------------------------
# The analyze directory is resolved against the current working directory.
ANALYZE_DIR = Path('../outputFiles/analyze').resolve()

# Sub-directory names can be overridden through the environment; the collect
# prefix falls back to the report prefix when it is not set.
REPORT_PREFIX = os.getenv('REPORT_PREFIX', 'analysis_reports')
COLLECT_PREFIX = os.getenv('COLLECT_PREFIX', REPORT_PREFIX)
REPORT_DIR = ANALYZE_DIR / REPORT_PREFIX
COLLECT_DIR = ANALYZE_DIR / COLLECT_PREFIX

# Leave as None to auto-discover the input files; set to a path to pin one.
STATS_CSV = None
TOPK_CSV = None

# ---- Modelling options -----------------------------------------------------
# Kept as a string here; it is converted to int where the filter is applied.
FILTER_SEARCH_K = os.getenv('FILTER_SEARCH_K', '27')

# Columns the models are trained to predict (one model per column).
TARGET_COLS = ['latency_p99_us', 'latency_p95_us', 'latency_p50_us', 'mean_latency_us']

LOG_TARGETS = True
TEST_SIZE = float(os.getenv('MODEL_TEST_SIZE', '0.2'))
RANDOM_STATE = int(os.getenv('MODEL_RANDOM_STATE', '42'))

# Candidate input columns; only those present in the loaded table are used.
FEATURE_CANDIDATES = [
    # build_* columns
    'build_R',
    'build_L',
    'build_B',
    'build_M',
    # search_* columns and cache size
    'search_K',
    'search_L',
    'search_W',
    'search_T',
    'cache_size',
    # dataset description
    'vector_dim',
    'dataset_size',
    # out_degree_* columns
    'out_degree_p99',
    'out_degree_p95',
    'out_degree_p50',
    'out_degree_mean',
    # expanded_* columns
    'expanded_revisit_ratio',
    'expanded_per_query_mean',
    'expanded_steps_mean',
    # node_counts_* columns
    'node_counts_top10_share',
    'node_counts_top100_share',
    # iostat / queue columns
    'iostat_aqu-sz_mean',
    'iostat_%util_mean',
    'queue_depth_p99',
    # *_p99 timing / utilisation columns
    'io_us_p99',
    'cpu_us_p99',
    'thread_util_p99',
]

def pick_latest(pattern):
    """Return the last path, in sorted order, under COLLECT_DIR matching *pattern*.

    "Latest" here means greatest in path sort order, not newest by
    modification time.

    Raises:
        FileNotFoundError: if nothing under COLLECT_DIR matches the glob.
    """
    candidates = list(COLLECT_DIR.glob(pattern))
    if len(candidates) == 0:
        raise FileNotFoundError(f'No files matched: {pattern}')
    return max(candidates)

def apply_search_k_filter(df, value):
    """Keep only the rows whose ``search_K`` equals ``int(value)``.

    The input frame is returned as-is (same object, no copy) when *value* is
    falsy, when the frame has no ``search_K`` column, or when *value* cannot
    be parsed as an integer. Otherwise a filtered copy is returned.
    """
    if value and 'search_K' in df.columns:
        try:
            wanted_k = int(value)
        except ValueError:
            # Unparseable filter value: fall through and return df unfiltered.
            pass
        else:
            keep = df['search_K'] == wanted_k
            return df[keep].copy()
    return df

# Resolve the two input CSVs: an explicitly configured path wins, otherwise
# the last matching file in the collect directory is used.
if STATS_CSV:
    stats_path = Path(STATS_CSV)
else:
    stats_path = pick_latest('collected_stats_*.csv')

if TOPK_CSV:
    topk_path = Path(TOPK_CSV)
else:
    topk_path = pick_latest('collected_topk_*.csv')

# Echo the chosen files so the notebook output records what was analysed.
print('stats:', stats_path)
print('topk :', topk_path)

stats_df, topk_df = (pd.read_csv(csv_path) for csv_path in (stats_path, topk_path))


stats: /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/collected_stats_sift01_20260107_195000.csv
topk : /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/collected_topk_sift01_20260107_195000.csv


In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance

# Surrogate-model training cell.
#
# For every target column that is present, this cell:
#   1. drops rows with missing / infinite features or target,
#   2. fits a regressor on a train split,
#   3. scores it on the test split against a constant-median baseline, both in
#      model space (log space for log-transformed targets) and in raw units,
#   4. writes feature importances, permutation importances and per-row
#      predictions under REPORT_DIR / 'tables'.
# A summary of all targets is written to 'model_metrics.csv'.


def _build_regressor():
    """Return an unfitted XGBoost regressor, or a RandomForest as a fallback."""
    try:
        import xgboost as xgb
        return xgb.XGBRegressor(
            n_estimators=300,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=RANDOM_STATE,
        )
    except Exception as e:
        # Deliberately broad: any failure to load or construct XGBoost falls
        # back to scikit-learn instead of aborting the notebook.
        print('xgboost not available:', e)

    from sklearn.ensemble import RandomForestRegressor
    return RandomForestRegressor(n_estimators=300, random_state=RANDOM_STATE)


def _score(y_true, y_pred, baseline_value):
    """Return ``(mae, rmse, r2, baseline_mae)`` for one set of predictions.

    ``baseline_mae`` is the MAE of predicting ``baseline_value`` for every row.
    """
    # Explicit float dtype: np.full_like would inherit y_true's dtype and
    # truncate a fractional baseline when the target column is integer-typed.
    constant_pred = np.full(len(y_true), baseline_value, dtype=float)
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, constant_pred),
    )


# Load filtered dataset if available
filtered_path = (REPORT_DIR / 'tables' / 'filtered_stats.csv')
filtered_df = None
if filtered_path.exists():
    filtered_df = pd.read_csv(filtered_path)
    print('filtered:', filtered_path)
else:
    print('filtered not found:', filtered_path)

# Fall back to the collected stats table when no filtered table exists.
base_df = filtered_df if filtered_df is not None else stats_df
base_df = apply_search_k_filter(base_df, FILTER_SEARCH_K)

# Only use candidate columns that actually exist in the loaded table.
feature_cols = [c for c in FEATURE_CANDIDATES if c in base_df.columns]
target_cols = [c for c in TARGET_COLS if c in base_df.columns]

print('rows:', len(base_df))
print('features:', feature_cols)
print('targets:', target_cols)

out_tables = (REPORT_DIR / 'tables')
out_tables.mkdir(parents=True, exist_ok=True)

metrics_rows = []
all_predictions = []

if not feature_cols or not target_cols:
    print('Skip model training: missing features or targets')
else:
    # Loop-invariant: turn +/-inf into NaN once so dropna() below removes them.
    finite_df = base_df.replace([np.inf, -np.inf], np.nan)

    for target in target_cols:
        df = finite_df.dropna(subset=feature_cols + [target]).copy()
        if len(df) < 20:
            print('Skip target due to insufficient rows:', target)
            continue

        # Only targets named '*_us' are log-transformed, and only if enabled.
        log_target = bool(LOG_TARGETS and target.endswith('_us'))

        X = df[feature_cols]
        y_raw = df[target]
        if log_target:
            # Clip at 1 so the log is never taken of a value below 1.
            y = np.log(y_raw.clip(lower=1))
        else:
            y = y_raw

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
        )

        # A fresh model per target so nothing carries over between fits.
        model = _build_regressor()
        model.fit(X_train, y_train)
        pred = model.predict(X_test)

        # Metrics in model space; the baseline is the training-set median.
        baseline = np.median(y_train)
        mae, rmse, r2, baseline_mae = _score(y_test, pred, baseline)

        # Map back to raw units for the second set of metrics.
        if log_target:
            y_test_raw = np.exp(y_test)
            pred_raw = np.exp(pred)
            baseline_raw = np.median(np.exp(y_train))
        else:
            y_test_raw = y_test
            pred_raw = pred
            baseline_raw = np.median(y_train)

        mae_raw, rmse_raw, r2_raw, baseline_mae_raw = _score(
            y_test_raw, pred_raw, baseline_raw
        )

        metrics_rows.append({
            'target': target,
            'rows': len(df),
            'mae': float(mae),
            'rmse': float(rmse),
            'r2': float(r2),
            'baseline_mae': float(baseline_mae),
            'mae_raw': float(mae_raw),
            'rmse_raw': float(rmse_raw),
            'r2_raw': float(r2_raw),
            'baseline_mae_raw': float(baseline_mae_raw),
            'log_target': log_target,
            'feature_count': len(feature_cols),
        })

        # Built-in importances, when the fitted model exposes them.
        if hasattr(model, 'feature_importances_'):
            fi = pd.DataFrame({
                'feature': feature_cols,
                'importance': model.feature_importances_,
            }).sort_values('importance', ascending=False)
            fi.to_csv(out_tables / f'model_feature_importance_{target}.csv', index=False)

        # Permutation importance on the held-out split; best-effort only.
        try:
            perm = permutation_importance(model, X_test, y_test, n_repeats=8, random_state=RANDOM_STATE)
            perm_df = pd.DataFrame({
                'feature': feature_cols,
                'importance_mean': perm.importances_mean,
                'importance_std': perm.importances_std,
            }).sort_values('importance_mean', ascending=False)
            perm_df.to_csv(out_tables / f'model_permutation_importance_{target}.csv', index=False)
        except Exception as e:
            print('Permutation importance failed:', e)

        # Predictions for every row (train and test alike), so anything
        # computed from this file is partly in-sample.
        full_pred = model.predict(X)
        full_pred_raw = np.exp(full_pred) if log_target else full_pred
        # Reuse the already-transformed target instead of recomputing the log.
        y_true_model = y if log_target else y_raw.values
        pred_df = pd.DataFrame({
            'run_prefix': df['run_prefix'] if 'run_prefix' in df.columns else df.index,
            'target': target,
            'y_true_raw': y_raw.values,
            'y_pred_raw': full_pred_raw,
            'y_true_model': y_true_model,
            'y_pred_model': full_pred,
        })
        pred_df.to_csv(out_tables / f'model_predictions_{target}.csv', index=False)
        all_predictions.append(pred_df)

metrics_df = pd.DataFrame(metrics_rows)
if not metrics_df.empty:
    metrics_df.to_csv(out_tables / 'model_metrics.csv', index=False)

# Bare expression so the notebook renders the metrics table.
metrics_df


filtered: /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/sift01/tables/filtered_stats.csv
rows: 1380
features: ['build_R', 'build_L', 'build_B', 'build_M', 'search_K', 'search_L', 'search_W', 'search_T', 'vector_dim', 'dataset_size', 'out_degree_p99', 'out_degree_p95', 'out_degree_p50', 'out_degree_mean', 'expanded_revisit_ratio', 'expanded_per_query_mean', 'expanded_steps_mean', 'node_counts_top10_share', 'node_counts_top100_share', 'iostat_aqu-sz_mean', 'iostat_%util_mean', 'queue_depth_p99', 'io_us_p99', 'cpu_us_p99', 'thread_util_p99']
targets: ['latency_p99_us', 'latency_p95_us', 'latency_p50_us', 'mean_latency_us']


Unnamed: 0,target,rows,mae,rmse,r2,baseline_mae,mae_raw,rmse_raw,r2_raw,baseline_mae_raw,log_target,feature_count
0,latency_p99_us,1380,0.015051,0.02599,0.998586,0.60887,27.434358,46.656382,0.999001,1164.723936,True,25
1,latency_p95_us,1380,0.019122,0.027884,0.998292,0.597401,35.024135,59.758135,0.997632,961.152484,True,25
2,latency_p50_us,1380,0.035708,0.080076,0.986423,0.605555,49.149794,120.200864,0.983874,716.351357,True,25
3,mean_latency_us,1380,0.027634,0.04446,0.995524,0.591246,36.085281,67.502916,0.994205,692.583709,True,25


In [3]:
# Residual analysis
# Summarises absolute prediction errors per target from the saved
# 'model_predictions_<target>.csv' files and records the 20 largest errors.
# Those files hold predictions for all rows (train and test), so the numbers
# here are not directly comparable to the held-out metrics in model_metrics.csv.
import pandas as pd

residual_rows = []
worst_rows = []

# Nothing to analyse if the training cell did not run or produced no metrics.
metrics_ready = 'metrics_df' in globals() and not metrics_df.empty
targets_to_check = list(metrics_df['target']) if metrics_ready else []

for target in targets_to_check:
    pred_path = out_tables / f'model_predictions_{target}.csv'
    if not pred_path.exists():
        continue
    pred_df = pd.read_csv(pred_path)

    # Prefer errors in raw units; fall back to model-space columns otherwise.
    if 'y_true_raw' in pred_df.columns and 'y_pred_raw' in pred_df.columns:
        true_col, pred_col, error_label = 'y_true_raw', 'y_pred_raw', 'raw'
    else:
        true_col, pred_col, error_label = 'y_true_model', 'y_pred_model', 'model'
    pred_df['abs_error'] = (pred_df[true_col] - pred_df[pred_col]).abs()

    abs_error = pred_df['abs_error']
    residual_rows.append({
        'target': target,
        'mae': abs_error.mean(),
        'p50_abs_error': abs_error.median(),
        'p90_abs_error': abs_error.quantile(0.9),
        'p99_abs_error': abs_error.quantile(0.99),
        'error_space': error_label,
    })

    ranked = pred_df.sort_values('abs_error', ascending=False)
    worst_rows.append(ranked.head(20).assign(target=target))

residual_df = pd.DataFrame(residual_rows)
if not residual_df.empty:
    residual_df.to_csv(out_tables / 'model_residual_summary.csv', index=False)

if worst_rows:
    worst_df = pd.concat(worst_rows, ignore_index=True)
    worst_df.to_csv(out_tables / 'model_worst_residuals.csv', index=False)

# Bare expression so the notebook renders the summary table.
residual_df


Unnamed: 0,target,mae,p50_abs_error,p90_abs_error,p99_abs_error,error_space
0,latency_p99_us,10.600035,4.94838,24.80414,91.527895,raw
1,latency_p95_us,14.20407,5.865815,32.11756,141.989962,raw
2,latency_p50_us,16.953303,5.93814,36.00894,165.058717,raw
3,mean_latency_us,13.730007,5.9999,29.9464,133.13104,raw
