# 00 Load and QC

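Load the latest collected stats and top-k CSVs, check `run_prefix` uniqueness and per-run search-parameter consistency, flag and filter degenerate and low-recall runs, and save the QC summary and the filtered stats table.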


In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np

# Root of the analysis outputs, resolved to an absolute path relative to the
# kernel's current working directory.
ANALYZE_DIR = Path('../outputFiles/analyze').resolve()

# Sub-directory names may be overridden through environment variables; the
# collection prefix falls back to the report prefix when it is not set.
REPORT_PREFIX = os.getenv('REPORT_PREFIX', 'analysis_reports')
COLLECT_PREFIX = os.getenv('COLLECT_PREFIX', REPORT_PREFIX)

# Where reports are written and where the collected CSVs are looked up.
REPORT_DIR = ANALYZE_DIR.joinpath(REPORT_PREFIX)
COLLECT_DIR = ANALYZE_DIR.joinpath(COLLECT_PREFIX)

# Optional explicit inputs: leave as None to auto-select a file from
# COLLECT_DIR, or set to a specific file path to pin the input.
STATS_CSV = None
TOPK_CSV = None

def pick_latest(pattern, directory=None):
    """Return the path matching ``pattern`` that sorts last.

    Matches are ordered by path (lexicographically), not by modification
    time, so the result is the "latest" file only when file names sort
    chronologically (for example, a fixed-width timestamp suffix).

    Args:
        pattern: Glob pattern relative to the search directory,
            e.g. ``'collected_stats_*.csv'``.
        directory: Directory to search (path-like). Defaults to the
            notebook-level ``COLLECT_DIR`` when omitted.

    Returns:
        The ``Path`` of the last match in sorted order.

    Raises:
        FileNotFoundError: If no path in the directory matches ``pattern``.
    """
    # COLLECT_DIR is only looked up when no directory is given, so the helper
    # can be pointed at any folder without touching the notebook globals.
    search_dir = COLLECT_DIR if directory is None else Path(directory)
    matches = sorted(search_dir.glob(pattern))
    if not matches:
        # Name the searched directory: a wrong prefix / working directory is
        # otherwise indistinguishable from a wrong pattern.
        raise FileNotFoundError(f'No files matched: {pattern} (searched {search_dir})')
    return matches[-1]

# An explicitly configured path wins; otherwise fall back to whatever
# pick_latest() selects for the corresponding file-name pattern.
# NOTE(review): the stats and top-k files are selected independently; nothing
# here verifies that both come from the same collection run.
if STATS_CSV:
    stats_path = Path(STATS_CSV)
else:
    stats_path = pick_latest('collected_stats_*.csv')

if TOPK_CSV:
    topk_path = Path(TOPK_CSV)
else:
    topk_path = pick_latest('collected_topk_*.csv')

# Echo the chosen inputs so the executed notebook records its provenance.
print('stats:', stats_path)
print('topk :', topk_path)

# Load both tables with pandas' default CSV parsing.
stats_df = pd.read_csv(stats_path)
topk_df = pd.read_csv(topk_path)


stats: /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/siftsmall01/collected_stats_siftsmall01_20260102_190313.csv
topk : /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/siftsmall01/collected_topk_siftsmall01_20260102_190313.csv


In [2]:
# Basic integrity checks. These only report counts; nothing here raises or
# filters when a check comes back non-zero.
print('rows stats:', len(stats_df))
print('rows topk :', len(topk_df))

# Number of rows whose run_prefix repeats that of an earlier row.
dup = stats_df.duplicated(subset='run_prefix').sum()
print('duplicate run_prefix in stats:', dup)

# Distinct values of each search parameter within every run_prefix group;
# a run is inconsistent when any parameter takes more than one value.
params_per_run = (
    stats_df
    .groupby('run_prefix')[['search_L', 'search_W', 'search_K', 'search_T']]
    .nunique()
)
inconsistent = params_per_run.gt(1).any(axis='columns').sum()
print('runs with inconsistent search params:', inconsistent)

# Number of distinct num_queries values across all rows.
num_queries_unique = stats_df['num_queries'].nunique()
print('num_queries unique values:', num_queries_unique)


rows stats: 3150
rows topk : 3150
duplicate run_prefix in stats: 0
runs with inconsistent search params: 0
num_queries unique values: 1


In [3]:
# Degenerate filters and recall thresholds
recall_threshold = 0.7

# A run is flagged degenerate when search_L is smaller than search_K or
# smaller than twice search_W.
degenerate = (stats_df['search_L'] < stats_df['search_K']) | (stats_df['search_L'] < 2 * stats_df['search_W'])

# A run is flagged low-recall when its mean recall is below the threshold.
# NOTE(review): a NaN recall_mean compares as False here, so rows with a
# missing recall are NOT flagged and stay in `filtered` — confirm intended.
low_recall = stats_df['recall_mean'] < recall_threshold

# Record both flags on the full table; `filtered` inherits these columns.
stats_df['flag_degenerate'] = degenerate
stats_df['flag_low_recall'] = low_recall

# Apply both exclusions in a single pass over stats_df. Filtering in two
# steps would index the already-reduced frame with a mask built on the full
# index, which makes pandas warn ("Boolean Series key will be reindexed") and
# depend on implicit alignment. The selected rows are the same either way.
filtered = stats_df.loc[~(degenerate | low_recall)].copy()

print('filtered rows:', len(filtered))

# One-row summary of the QC outcome. A row can be both degenerate and
# low-recall, so the two counts may overlap.
qc_summary = {
    'total_rows': len(stats_df),
    'degenerate_rows': int(degenerate.sum()),
    'low_recall_rows': int(low_recall.sum()),
    'filtered_rows': len(filtered),
}
qc_df = pd.DataFrame([qc_summary])
qc_df


filtered rows: 3150


Unnamed: 0,total_rows,degenerate_rows,low_recall_rows,filtered_rows
0,3150,0,0,3150


In [4]:
# Save QC outputs
# Both tables go under <REPORT_DIR>/tables, created on demand.
out_tables = REPORT_DIR.joinpath('tables')
out_tables.mkdir(parents=True, exist_ok=True)

qc_summary_path = out_tables / 'qc_summary.csv'
filtered_stats_path = out_tables / 'filtered_stats.csv'

# Write both files without the DataFrame index, then report the destinations.
qc_df.to_csv(qc_summary_path, index=False)
filtered.to_csv(filtered_stats_path, index=False)

print('Saved:', qc_summary_path)
print('Saved:', filtered_stats_path)


Saved: /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/siftsmall01/tables/qc_summary.csv
Saved: /home/gt/research/DiskANN/scripts/paramAnalysis/gridSearch/outputFiles/analyze/siftsmall01/tables/filtered_stats.csv
