# Evaluation — Recall@k (slice & study)

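Runs phrase-to-slice retrieval against the saved FAISS index for a fixed set of query phrases and reports Recall@{1,5,10} at both slice level and study level (best slice per study).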

In [None]:
# --- Imports & setup
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import torch

# Locate the repository root: when the working directory is named "notebooks"
# the root is its (resolved) parent, otherwise the working directory is used.
repo_root = Path.cwd().resolve().parents[0] if Path.cwd().name == "notebooks" else Path.cwd()
# Make the packages under src/ importable. The membership check keeps the cell
# idempotent, so re-running it does not pile duplicate entries onto sys.path.
_src_dir = str(repo_root / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

from pgr import encoders, index
from pgr.utils import seed_everything, get_device
from pgr_dl import io_deeplesion as io, phrases, adapters, eval_dl


In [None]:
# --- Config
# Locations of the saved index and its id table under results/kaggle_v1,
# the encoder name, the query phrases, and the cut-offs to report.
RESULTS_DIR = repo_root / "results" / "kaggle_v1"
INDEX_PATH  = RESULTS_DIR / "index.faiss"
IDS_PATH    = RESULTS_DIR / "ids.parquet"
MODEL_NAME  = "ViT-B/16"
PHRASE_SET  = ["liver lesion","renal mass","splenic lesion","lung nodule","enlarged lymph node","bone lesion"]
K_LIST      = [1, 5, 10]
K_BUILD     = 100  # retrieve more than you report

# Recall@k is computed from the top-K_BUILD results, so every reported k must
# fit inside that window.
if max(K_LIST) > K_BUILD:
    raise ValueError(f"K_BUILD ({K_BUILD}) must be >= max(K_LIST) ({max(K_LIST)})")

seed_everything(42)
ids_df = pd.read_parquet(IDS_PATH)
# Explicit raise rather than assert: assert statements are removed under
# `python -O`, which would let an empty table slip through unnoticed.
if len(ids_df) == 0:
    raise ValueError("ids.parquet is empty")


In [None]:
# --- Load index + encoder
# Restore the saved index from disk first, then build the CLIP text/image
# encoder on whatever device get_device(None) selects.
fa = index.FaissIndex.load(str(INDEX_PATH))
device_name = str(get_device(None))
enc = encoders.ClipEncoder(
    model_name=MODEL_NAME,
    device=device_name,
)
print(f"Index ready (dim={fa.dim}), encoder={MODEL_NAME}")


In [None]:
# --- Slice-level Recall@k
# For every query phrase: encode it, fetch the top-K_BUILD neighbours from the
# index, label each retrieved slice as positive/negative from its tags, and
# compute Recall@k for each k in K_LIST. The last expression shows the
# phrase x k table in the notebook.
hit_columns = ["query_phrase", "rank", "score", "study_id", "slice_idx", "is_positive"]
rows = []
for phrase in PHRASE_SET:
    q_vec = adapters.encode_phrase(enc, phrase).cpu().numpy()  # (1,D) float32, L2-normalized
    scores, I, _ = fa.search(q_vec, k=K_BUILD)

    hits = []
    for rk, idx in enumerate(I[0]):
        idx = int(idx)
        if idx < 0:
            # FAISS marks missing neighbours with label -1 (e.g. the index
            # holds fewer than K_BUILD vectors). Assuming fa.search passes
            # those labels through, iloc[-1] would silently return the LAST
            # row of ids_df and count it as a real hit, so skip them.
            continue
        row = ids_df.iloc[idx]
        is_pos = phrases.tags_match_phrase(str(row.get("body_part", "")),
                                            str(row.get("lesion_type", "")),
                                            phrase)
        hits.append({
            "query_phrase": phrase,
            "rank": rk + 1,
            "score": float(scores[0, rk]),
            "study_id": str(row["study_id"]),
            "slice_idx": int(row["slice_idx"]),
            "is_positive": bool(is_pos),
        })
    # Fixed column list keeps the frame's schema intact even when no hit survived.
    res_df = pd.DataFrame(hits, columns=hit_columns)

    for k in K_LIST:
        r = eval_dl.recall_at_k(res_df, k=k)
        rows.append({"level": "slice", "phrase": phrase, "k": k, "recall": r})

slice_table = pd.DataFrame(rows)
slice_table.pivot(index="phrase", columns="k", values="recall")


In [None]:
# --- Study-level Recall@k
# Same retrieval as the slice-level pass, but the hits are first collapsed to
# the best slice per study before Recall@k is computed for each k in K_LIST.
# The last expression shows the phrase x k table in the notebook.
hit_columns = ["query_phrase", "rank", "score", "study_id", "slice_idx", "is_positive"]
study_rows = []
for phrase in PHRASE_SET:
    q_vec = adapters.encode_phrase(enc, phrase).cpu().numpy()
    scores, I, _ = fa.search(q_vec, k=K_BUILD)

    hits = []
    for rk, idx in enumerate(I[0]):
        idx = int(idx)
        if idx < 0:
            # FAISS marks missing neighbours with label -1 (e.g. the index
            # holds fewer than K_BUILD vectors). Assuming fa.search passes
            # those labels through, iloc[-1] would silently return the LAST
            # row of ids_df and count it as a real hit, so skip them.
            continue
        row = ids_df.iloc[idx]
        is_pos = phrases.tags_match_phrase(str(row.get("body_part", "")),
                                            str(row.get("lesion_type", "")),
                                            phrase)
        hits.append({
            "query_phrase": phrase,
            "rank": rk + 1,
            "score": float(scores[0, rk]),
            "study_id": str(row["study_id"]),
            "slice_idx": int(row["slice_idx"]),
            "is_positive": bool(is_pos),
        })
    # Fixed column list keeps the frame's schema intact even when no hit survived.
    res_df = pd.DataFrame(hits, columns=hit_columns)

    # aggregate best slice per study (then recall@k)
    best = eval_dl.aggregate_best_slice_per_study(res_df)
    for k in K_LIST:
        r = eval_dl.recall_at_k(best, k=k)
        study_rows.append({"level": "study", "phrase": phrase, "k": k, "recall": r})

study_table = pd.DataFrame(study_rows)
study_table.pivot(index="phrase", columns="k", values="recall")


In [None]:
# --- Save metrics table
# Stack the slice-level and study-level results into one long table, order it
# by level / phrase / k, write it next to the other run artifacts, and show it.
combined = pd.concat([slice_table, study_table], ignore_index=True)
out = combined.sort_values(by=["level", "phrase", "k"]).reset_index(drop=True)

out_path = RESULTS_DIR / "recall_table.csv"
out.to_csv(out_path, index=False)
print("Saved:", out_path)
out
