# 04 Surrogate Model (XGBoost / LightGBM)

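Train a simple regression model on the collected stats to predict `log(latency_p99_us)`, and report the mean absolute error (in log space) on a held-out 20% test split.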


In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np

# --- Configuration -----------------------------------------------------------
# Absolute path of the analysis output directory; the relative path is
# resolved against the kernel's current working directory.
ANALYZE_DIR = Path('../outputFiles/analyze').resolve()

# Name of the report sub-directory, overridable via the REPORT_PREFIX
# environment variable.
REPORT_PREFIX = os.getenv('REPORT_PREFIX', 'analysis_reports')
REPORT_DIR = ANALYZE_DIR / REPORT_PREFIX

# Optional explicit input files. Leave as None to auto-select a file from
# ANALYZE_DIR, or set to a specific file path if needed.
STATS_CSV = TOPK_CSV = None

def pick_latest(pattern):
    """Return the last path in ANALYZE_DIR matching ``pattern``, in sorted order.

    "Latest" here means the greatest path under ordinary path sorting, not
    the most recently modified file.

    Args:
        pattern: Glob pattern evaluated relative to ANALYZE_DIR.

    Returns:
        The matching path that sorts last.

    Raises:
        FileNotFoundError: If no file matches ``pattern``.
    """
    matches = list(ANALYZE_DIR.glob(pattern))
    if len(matches) == 0:
        raise FileNotFoundError(f'No files matched: {pattern}')
    matches.sort()
    return matches[-1]

# Choose the input CSVs: an explicit STATS_CSV / TOPK_CSV override wins,
# otherwise fall back to pick_latest() on the matching filename pattern.
# NOTE(review): the two files are selected independently of each other;
# nothing here checks that they come from the same run.
if STATS_CSV:
    stats_path = Path(STATS_CSV)
else:
    stats_path = pick_latest('collected_stats_*.csv')

if TOPK_CSV:
    topk_path = Path(TOPK_CSV)
else:
    topk_path = pick_latest('collected_topk_*.csv')

# Echo the chosen files so the cell output records which inputs were used.
print('stats:', stats_path)
print('topk :', topk_path)

# Load both tables into DataFrames.
stats_df = pd.read_csv(stats_path)
topk_df = pd.read_csv(topk_path)


stats: /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/collected_stats_exp01_20260101_235226.csv
topk : /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/collected_topk_exp01_20260101_235226.csv


In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Build the modelling frame from a copy so stats_df itself is left untouched.
df = stats_df.copy()
# Treat +/-inf as missing everywhere, then drop rows that have no target value.
df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=['latency_p99_us'])
# Target is log(p99 latency). Values below 1 are clipped to 1 first, so the
# log is always defined and never negative.
df['log_latency_p99'] = np.log(df['latency_p99_us'].clip(lower=1))

# Candidate predictor columns; only those actually present in the CSV are used.
candidate_features = [
    'build_R','build_L','build_B','build_M',
    'search_K','search_L','search_W','search_T',
    'vector_dim','dataset_size',
    'out_degree_p99','expanded_revisit_ratio','node_counts_top10_share',
    'iostat_aqu-sz_mean',
]
features = [f for f in candidate_features if f in df.columns]

# Make the column filtering visible instead of silently training on a subset,
# and fail early with a clear message when nothing is left to train on.
missing_features = [f for f in candidate_features if f not in df.columns]
if missing_features:
    print('features missing from stats data (skipped):', missing_features)
if not features:
    raise ValueError(
        'None of the candidate feature columns are present in the stats data; '
        'cannot train the surrogate model.'
    )

X = df[features]
y = df['log_latency_p99']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prefer XGBoost when it can be imported and constructed. It is an optional
# dependency, so any failure here is reported and the fallback below is used.
model = None
try:
    import xgboost as xgb
    model = xgb.XGBRegressor(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    )
except Exception as e:
    print('xgboost not available:', e)

# Fallback: scikit-learn random forest when no XGBoost model was created.
if model is None:
    from sklearn.ensemble import RandomForestRegressor
    model = RandomForestRegressor(n_estimators=200, random_state=42)

# Fit on the training split and report mean absolute error on the held-out
# split. The error is in log-latency units because the target is log-transformed.
model.fit(X_train, y_train)
pred = model.predict(X_test)
mae = mean_absolute_error(y_test, pred)
print('MAE (log latency p99):', mae)


MAE (log latency p99): 0.09281692950447448
