# Embedding Model Comparison

Objective:
Select an embedding model that maximizes Recall@10 for SHL assessment retrieval
while remaining production-safe and cost-effective.


In [None]:
import json, faiss, numpy as np, pandas as pd
from sentence_transformers import SentenceTransformer
from intent_intelligence.intent_resolver import resolve_intent


In [None]:
# Load the assessment catalog. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open("../artifacts/shl_individual_assessments.json", encoding="utf-8") as f:
    catalog = json.load(f)
train_df = pd.read_csv("../data/train.csv")

# Distinct query strings, in order of first appearance in train.csv.
queries = train_df["Query"].unique()
# Map each query to the list of Assessment_url values that appear with it.
truth = train_df.groupby("Query")["Assessment_url"].apply(list).to_dict()


In [None]:
def recall_at_k(pred, rel, k=10):
    """Return the fraction of distinct relevant items found in the top ``k``.

    Args:
        pred: Ranked sequence of predicted items, best first.
        rel: Iterable of relevant (hashable) items. Duplicates count once, so
            a repeated label cannot push a perfect retrieval below 1.0.
        k: Number of leading predictions to consider.

    Returns:
        ``len(set(pred[:k]) & set(rel)) / len(set(rel))`` as a float, or
        ``0.0`` when ``rel`` is empty.
    """
    relevant = set(rel)
    if not relevant:
        return 0.0
    hits = relevant.intersection(pred[:k])
    # Divide by the number of *distinct* relevant items so numerator and
    # denominator are counted the same way.
    return len(hits) / len(relevant)


In [None]:
def evaluate_model(model_name, k=10):
    """Return the mean Recall@k of one embedding model over ``queries``.

    The catalog's ``semantic_profile_text`` fields are embedded, L2-normalized
    and stored in an inner-product FAISS index, so the search score is cosine
    similarity. Every query is embedded the same way, its top ``k`` catalog
    URLs are retrieved, and they are scored against ``truth`` with
    ``recall_at_k``.

    Args:
        model_name: Name passed to ``SentenceTransformer`` to load the model.
        k: Number of neighbours to retrieve and score per query.

    Returns:
        The mean of the per-query recall scores (``nan`` when there are no
        queries).
    """
    model = SentenceTransformer(model_name)

    corpus_vecs = model.encode([c["semantic_profile_text"] for c in catalog])
    faiss.normalize_L2(corpus_vecs)
    index = faiss.IndexFlatIP(corpus_vecs.shape[1])
    index.add(corpus_vecs)

    query_list = list(queries)
    if not query_list:
        # Nothing to score; mirrors the value np.mean would give for no scores.
        return np.nan

    # Encode and search all queries in one batch rather than one call each.
    query_vecs = model.encode(query_list)
    faiss.normalize_L2(query_vecs)
    _, neighbours = index.search(query_vecs, k)

    scores = []
    for q, row in zip(query_list, neighbours):
        # FAISS pads with -1 when fewer than k vectors are indexed; skip those
        # so they do not wrap around to catalog[-1].
        pred = [catalog[i]["url"] for i in row if i >= 0]
        scores.append(recall_at_k(pred, truth[q], k))
    return np.mean(scores)


In [None]:
# Checkpoints under comparison: (label shown in the table, model name).
candidates = (
    ("MiniLM-L6", "all-MiniLM-L6-v2"),
    ("MPNet", "all-mpnet-base-v2"),
)

# Evaluate each candidate in the listed order, keyed by its label.
results = {label: evaluate_model(name) for label, name in candidates}

# One row per model; as the cell's last expression the table is displayed.
rows = [{"model": label, "Recall@10": score} for label, score in results.items()]
pd.DataFrame(rows)


## Conclusion

MPNet outperforms MiniLM on Recall@10.

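Given its higher semantic fidelity (as measured by Recall@10 above) and acceptable latency,
MPNet is selected as the production embedding model.
Note: latency is not benchmarked in this notebook and should be confirmed separately.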
