# Enhanced FAQ Retrieval Evaluation

This notebook evaluates the **enhanced performance** of the FAQ Retrieval Assistant
after introducing **multilingual question variants** mapped to canonical answers.

The system now supports language-agnostic retrieval with normalized output.

The evaluation measures:
- **Top-1 accuracy** and **Top-3 accuracy**
- **Confidence behavior** of the retrieval results
- **Performance differences between English and Macedonian queries**

In [1]:
import json
import os
from datetime import datetime, timezone

import pandas as pd

from app.retrieval import search

In [2]:
# Location of the evaluation set, relative to the notebook's working directory.
EVAL_QUERIES_PATH = "queries.json"

# Read the evaluation entries; explicit UTF-8 so decoding does not depend on
# the platform's default encoding.
with open(EVAL_QUERIES_PATH, encoding="utf-8") as queries_file:
    eval_queries = json.load(queries_file)

# Show how many evaluation entries were loaded.
len(eval_queries)

32

In [3]:
# Run every evaluation query through `search` and record, per query, which
# FAQ ids came back and whether the expected one was among them.
results = []

for entry in eval_queries:
    query = entry["query"]
    expected_id = entry["expected_faq_id"]
    language = entry["language"]

    response = search(query, top_k=3)

    retrieved_ids = [r["id"] for r in response["results"]]
    confidence = response["confidence"]

    # 1-based position of the expected FAQ among the retrieved ids, or None
    # when it was not retrieved at all (which includes an empty result list).
    if expected_id in retrieved_ids:
        rank_of_expected = retrieved_ids.index(expected_id) + 1
    else:
        rank_of_expected = None

    # Derive both flags from the rank so that an empty result list is scored
    # as a miss instead of raising IndexError on `retrieved_ids[0]`.
    top1_correct = rank_of_expected == 1
    top3_correct = rank_of_expected is not None

    results.append({
        "query": query,
        "language": language,
        "expected_faq_id": expected_id,
        "retrieved_ids": retrieved_ids,
        "rank_of_expected": rank_of_expected,
        "top1_correct": top1_correct,
        "top3_correct": top3_correct,
        "confidence": confidence
    })

In [4]:
# One row per evaluated query, built in a single step from the collected records.
df = pd.DataFrame(data=results)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,query,language,expected_faq_id,retrieved_ids,rank_of_expected,top1_correct,top3_correct,confidence
0,"I can't log into my account, how do I reset my...",en,reset_password_q_en,"[reset_password_q_en, forgot_password_login_q_...",1,True,True,0.681
1,Forgot my password and now I'm locked out,en,forgot_password_login_q_en,"[forgot_password_login_q_en, reset_password_q_...",1,True,True,0.648
2,The password reset email never arrives,en,password_reset_email_missing_q_en,"[password_reset_email_missing_q_en, reset_pass...",1,True,True,0.59
3,How can I update the email linked to my account?,en,change_email_q_en,"[change_email_q_en, update_billing_q_en, upgra...",1,True,True,0.485
4,Where can I change my credit card details?,en,update_billing_q_en,"[change_email_q_en, update_billing_q_en, upgra...",2,False,True,0.151


In [5]:
# The mean of a boolean column is the fraction of True values, i.e. the accuracy.
top1_accuracy, top3_accuracy = (
    df[flag_column].mean() for flag_column in ("top1_correct", "top3_correct")
)

(top1_accuracy, top3_accuracy)

(np.float64(0.90625), np.float64(1.0))

In [6]:
# Top-1 / top-3 accuracy broken down by query language.
(
    df[["language", "top1_correct", "top3_correct"]]
    .groupby("language")
    .mean()
)

Unnamed: 0_level_0,top1_correct,top3_correct
language,Unnamed: 1_level_1,Unnamed: 2_level_1
en,0.95,1.0
mk,0.833333,1.0


In [7]:
# Distribution of the confidence score, split by whether the top-1 hit was correct.
df["confidence"].groupby(df["top1_correct"]).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
top1_correct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,3.0,0.583667,0.374834,0.151,0.4705,0.79,0.8,0.81
True,29.0,0.696207,0.138395,0.413,0.633,0.683,0.767,1.0


In [8]:
# Queries whose expected FAQ was not among the retrieved ids at all.
missed_top3 = ~df["top3_correct"]
failure_columns = ["query", "language", "expected_faq_id", "retrieved_ids", "confidence"]

df.loc[missed_top3, failure_columns]

Unnamed: 0,query,language,expected_faq_id,retrieved_ids,confidence


In [9]:
# Directory that receives this run's artifacts, relative to the working directory.
RESULTS_DIR = "results"

# Create it on first use; exist_ok keeps re-running this cell harmless.
os.makedirs(name=RESULTS_DIR, exist_ok=True)

RESULTS_DIR

'results'

In [10]:
# Descriptive metadata for this evaluation run; it is embedded in the metrics
# dict and its run_name is used to build the output file names.
RUN_METADATA = {
    "run_name": "enhanced",
    # Timezone-aware UTC timestamp. The key promises UTC, whereas a bare
    # datetime.now() would record naive local time with no offset.
    "timestamp_utc": datetime.now(timezone.utc).isoformat(),
    "embedding_model": "text-embedding-3-small",
    # Recorded by hand: keep in sync with the top_k passed to search() in the
    # evaluation loop.
    "top_k": 3,
    "confidence_formula": "0.7*similarity + 0.3*margin",
    "notes": "Evaluation on enhanced multilingual FAQ augmentation"
}

RUN_METADATA

{'run_name': 'enhanced',
 'timestamp_utc': '2025-12-26T00:34:03.932010',
 'embedding_model': 'text-embedding-3-small',
 'top_k': 3,
 'confidence_formula': '0.7*similarity + 0.3*margin',
 'notes': 'Evaluation on enhanced multilingual FAQ augmentation'}

In [11]:
# Per-query results are written to <RESULTS_DIR>/<run_name>_results.csv.
per_query_filename = f"{RUN_METADATA['run_name']}_results.csv"
PER_QUERY_PATH = os.path.join(RESULTS_DIR, per_query_filename)

# index=False: the positional row index is not written to the file.
df.to_csv(PER_QUERY_PATH, index=False)

PER_QUERY_PATH

'results/enhanced_results.csv'

In [12]:
def _confidence_summary(confidences):
    """Summarise a confidence Series as a JSON-safe dict.

    ``describe()`` yields NaN for statistics that are undefined, e.g. ``std``
    of a single value, or everything except ``count`` for an empty selection.
    ``json.dump`` would serialise those as the non-standard token ``NaN``, so
    they are mapped to ``None`` (JSON ``null``) here. All other values are
    passed through unchanged.
    """
    return {
        stat: (None if pd.isna(value) else value)
        for stat, value in confidences.describe().to_dict().items()
    }


# Aggregate metrics for the run: overall and per-language accuracy, plus the
# confidence distribution for correct vs. incorrect top-1 predictions.
metrics = {
    "run_metadata": RUN_METADATA,
    "num_queries": len(df),
    "top1_accuracy": float(df["top1_correct"].mean()),
    "top3_accuracy": float(df["top3_correct"].mean()),
    "accuracy_by_language": df.groupby("language")[
        ["top1_correct", "top3_correct"]
    ].mean().to_dict(),
    "confidence_stats": {
        # .loc with a boolean mask selects rows and column in a single step.
        "correct": _confidence_summary(df.loc[df["top1_correct"], "confidence"]),
        "incorrect": _confidence_summary(df.loc[~df["top1_correct"], "confidence"])
    }
}

metrics

{'run_metadata': {'run_name': 'enhanced',
  'timestamp_utc': '2025-12-26T00:34:03.932010',
  'embedding_model': 'text-embedding-3-small',
  'top_k': 3,
  'confidence_formula': '0.7*similarity + 0.3*margin',
  'notes': 'Evaluation on enhanced multilingual FAQ augmentation'},
 'num_queries': 32,
 'top1_accuracy': 0.90625,
 'top3_accuracy': 1.0,
 'accuracy_by_language': {'top1_correct': {'en': 0.95,
   'mk': 0.8333333333333334},
  'top3_correct': {'en': 1.0, 'mk': 1.0}},
 'confidence_stats': {'correct': {'count': 29.0,
   'mean': 0.6962068965517241,
   'std': 0.13839471999381872,
   'min': 0.413,
   '25%': 0.633,
   '50%': 0.683,
   '75%': 0.767,
   'max': 1.0},
  'incorrect': {'count': 3.0,
   'mean': 0.5836666666666667,
   'std': 0.3748337409216696,
   'min': 0.151,
   '25%': 0.47050000000000003,
   '50%': 0.79,
   '75%': 0.8,
   'max': 0.81}}}

In [13]:
# Aggregate metrics are written to <RESULTS_DIR>/<run_name>_metrics.json.
metrics_filename = f"{RUN_METADATA['run_name']}_metrics.json"
METRICS_PATH = os.path.join(RESULTS_DIR, metrics_filename)

# Write the metrics as indented JSON, encoded as UTF-8.
with open(METRICS_PATH, "w", encoding="utf-8") as metrics_file:
    json.dump(metrics, metrics_file, indent=2)

METRICS_PATH

'results/enhanced_metrics.json'