# Ranking Ablation Study

Objective:
Quantify the impact of ranking signals beyond semantic similarity.


In [None]:
import json, faiss, numpy as np, pandas as pd
from sentence_transformers import SentenceTransformer
from intent_intelligence.intent_resolver import resolve_intent


In [None]:
catalog = json.load(open("../artifacts/shl_individual_assessments.json"))
train_df = pd.read_csv("../data/train.csv")

queries = train_df["Query"].unique()
truth = train_df.groupby("Query")["Assessment_url"].apply(list).to_dict()


In [None]:
model = SentenceTransformer("all-mpnet-base-v2")
vecs = model.encode([c["semantic_profile_text"] for c in catalog])
faiss.normalize_L2(vecs)

index = faiss.IndexFlatIP(vecs.shape[1])
index.add(vecs)


In [None]:
def candidates(query):
    qv = model.encode([query])
    faiss.normalize_L2(qv)
    scores, idxs = index.search(qv, 50)
    intent = resolve_intent(query)

    out = []
    for s, i in zip(scores[0], idxs[0]):
        c = catalog[i]
        out.append({
            "url": c["url"],
            "sim": float(s),
            "skill": len(set(c["test_types"]) & set(intent["desired_test_types"])),
            "type": any(t in c["test_types"] for t in intent["desired_test_types"])
        })
    return out


In [None]:
def score_sem(c): return c["sim"]
def score_sem_skill(c): return 0.7*c["sim"] + 0.3*c["skill"]
def score_full(c): return 0.6*c["sim"] + 0.25*c["skill"] + 0.15*c["type"]


In [None]:
def eval_variant(fn):
    scores = []
    for q in queries:
        ranked = sorted(candidates(q), key=fn, reverse=True)
        pred = [c["url"] for c in ranked[:10]]
        scores.append(recall_at_k(pred, truth[q]))
    return np.mean(scores)

results = [
    {"variant": "semantic_only", "Recall@10": eval_variant(score_sem)},
    {"variant": "semantic + skill", "Recall@10": eval_variant(score_sem_skill)},
    {"variant": "full_scoring", "Recall@10": eval_variant(score_full)}
]

pd.DataFrame(results)


## Conclusion

Adding intent-aligned signals improves Recall@10 consistently.

Final system uses:
semantic similarity + skill overlap + test-type alignment
