# Baseline FAQ Retrieval Evaluation

This notebook evaluates the **baseline performance** of the FAQ Retrieval Assistant
*before* introducing multilingual FAQ variants or translation-based enhancements.

The evaluation measures:
- **Top-1 accuracy** and **Top-3 accuracy**
- **Confidence behavior**: how the confidence score is distributed for correct versus incorrect Top-1 results
- **Performance differences between English and Macedonian queries**

The results from this notebook serve as a **reference baseline** for comparing future
improvements such as multilingual augmentation, threshold tuning, and retrieval refinements.

In [41]:
import json
import os
from datetime import datetime, timezone

import pandas as pd

from app.retrieval import search

In [42]:
# Smoke test: send a single informal query through the retriever before the full evaluation.
result = search("Man whats my password")
# The response is indexed by "results" here and by "confidence" in the next cell.
faq_results = result["results"]
result

{'results': [{'id': 1,
   'question': 'How do I reset my password?',
   'answer': 'You can reset your password by clicking the Forgot password link on the login page and following the instructions sent to your email.',
   'similarity': 0.5578154921531677},
  {'id': 2,
   'question': 'I forgot my password and cannot log in',
   'answer': 'If you forgot your password, use the Forgot password option on the login page to receive a reset link via email.',
   'similarity': 0.46157339215278625},
  {'id': 3,
   'question': 'Why am I not receiving the password reset email?',
   'answer': 'Please check your spam folder and ensure your email address is correct. If the issue persists, contact customer support.',
   'similarity': 0.3673616349697113}],
 'confidence': 0.273}

In [43]:
# Summarize the first hit of the smoke-test query: matched question, raw similarity,
# and the response-level confidence score.
print(f"result: {faq_results[0]['question']}")
print(f"similarity: {faq_results[0]['similarity']}")
print(f"confidence: {result['confidence']}")

result: How do I reset my password?
similarity: 0.5578154921531677
confidence: 0.273


In [44]:
# Labelled evaluation set; each entry is read below via the keys
# "query", "expected_faq_id" and "language".
EVAL_QUERIES_PATH = "queries.json"

with open(EVAL_QUERIES_PATH, encoding="utf-8") as queries_file:
    eval_queries = json.loads(queries_file.read())

# Number of evaluation queries loaded.
len(eval_queries)

32

In [45]:
# Run every evaluation query through the retriever and collect one record per query.
# The list is rebuilt from scratch, so re-running this cell does not duplicate rows.
results = []

for entry in eval_queries:
    query = entry["query"]
    expected_id = entry["expected_faq_id"]
    language = entry["language"]

    response = search(query, top_k=3)

    retrieved_ids = [r["id"] for r in response["results"]]
    confidence = response["confidence"]

    # 1-based position of the expected FAQ among the retrieved ids, or None when it
    # was not retrieved at all.
    if expected_id in retrieved_ids:
        rank_of_expected = retrieved_ids.index(expected_id) + 1
    else:
        rank_of_expected = None

    # Derive both flags from the rank instead of indexing retrieved_ids[0] directly:
    # an empty result list then counts as a miss rather than raising IndexError and
    # aborting the whole evaluation.
    top1_correct = rank_of_expected == 1
    top3_correct = rank_of_expected is not None

    results.append({
        "query": query,
        "language": language,
        "expected_faq_id": expected_id,
        "retrieved_ids": retrieved_ids,
        "rank_of_expected": rank_of_expected,
        "top1_correct": top1_correct,
        "top3_correct": top3_correct,
        "confidence": confidence
    })

In [46]:
# One row per evaluation query; columns follow the keys of the per-query records.
df = pd.DataFrame(results)
# Preview the first rows only rather than dumping the whole frame.
df.head()

Unnamed: 0,query,language,expected_faq_id,retrieved_ids,rank_of_expected,top1_correct,top3_correct,confidence
0,"I can't log into my account, how do I reset my...",en,1,"[1, 2, 13]",1.0,True,True,0.681
1,Forgot my password and now I'm locked out,en,2,"[2, 1, 3]",1.0,True,True,0.648
2,The password reset email never arrives,en,3,"[3, 1, 2]",1.0,True,True,0.59
3,How can I update the email linked to my account?,en,4,"[4, 5, 16]",1.0,True,True,0.485
4,Where can I change my credit card details?,en,5,"[4, 5, 16]",2.0,False,True,0.151


In [47]:
# Overall accuracy: the mean of a boolean column is the fraction of True values.
top1_accuracy, top3_accuracy = (
    df[flag].mean() for flag in ("top1_correct", "top3_correct")
)

top1_accuracy, top3_accuracy

(np.float64(0.84375), np.float64(0.9375))

In [48]:
# Per-language Top-1 / Top-3 accuracy: mean of the correctness flags within each language.
(
    df
    .groupby("language")[["top1_correct", "top3_correct"]]
    .mean()
)

Unnamed: 0_level_0,top1_correct,top3_correct
language,Unnamed: 1_level_1,Unnamed: 2_level_1
en,0.95,1.0
mk,0.666667,0.833333


In [49]:
# Distribution of the confidence score, split by whether the Top-1 result was correct.
(
    df
    .groupby("top1_correct")["confidence"]
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
top1_correct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,5.0,0.0992,0.063385,0.032,0.042,0.097,0.151,0.174
True,27.0,0.53463,0.2207,0.08,0.381,0.648,0.688,0.807


In [50]:
# Queries whose expected FAQ is absent from the retrieved ids (Top-3 misses).
df.loc[
    ~df["top3_correct"],
    ["query", "language", "expected_faq_id", "retrieved_ids", "confidence"],
]

Unnamed: 0,query,language,expected_faq_id,retrieved_ids,confidence
20,"Не можам да се најавам, ја заборавив лозинката",mk,1,"[2, 4, 7]",0.097
25,Сакам да ја откажам претплатата,mk,7,"[10, 5, 8]",0.042


In [51]:
# Output directory for the artifacts written by this run.
RESULTS_DIR = "results"

# exist_ok=True makes re-running this cell harmless when the directory already exists.
os.makedirs(name=RESULTS_DIR, exist_ok=True)

RESULTS_DIR

'results'

In [52]:
# Provenance for this evaluation run; embedded in the metrics file and used to name
# the output files.
RUN_METADATA = {
    "run_name": "baseline",
    # The key promises UTC, so take an explicitly UTC-aware timestamp. A bare
    # datetime.now() returns naive local time, which would be mislabelled as UTC.
    "timestamp_utc": datetime.now(timezone.utc).isoformat(),
    "embedding_model": "text-embedding-3-small",
    "top_k": 3,
    "confidence_formula": "0.7*similarity + 0.3*margin",
    "notes": "Baseline evaluation before multilingual FAQ augmentation"
}

RUN_METADATA

{'run_name': 'baseline',
 'timestamp_utc': '2025-12-25T21:27:11.249760',
 'embedding_model': 'text-embedding-3-small',
 'top_k': 3,
 'confidence_formula': '0.7*similarity + 0.3*margin',
 'notes': 'Baseline evaluation before multilingual FAQ augmentation'}

In [53]:
# Per-query results are written to <RESULTS_DIR>/<run_name>_results.csv.
PER_QUERY_PATH = os.path.join(RESULTS_DIR, RUN_METADATA["run_name"] + "_results.csv")

# index=False leaves the DataFrame's row index out of the file.
df.to_csv(path_or_buf=PER_QUERY_PATH, index=False)

PER_QUERY_PATH

'results/baseline_results.csv'

In [54]:
# Aggregate summary of the run, assembled key by key in the order it will be serialized.
metrics = {"run_metadata": RUN_METADATA, "num_queries": df.shape[0]}

# Overall accuracy; float() turns the NumPy scalar into a plain Python float.
metrics["top1_accuracy"] = float(df["top1_correct"].mean())
metrics["top3_accuracy"] = float(df["top3_correct"].mean())

# Nested as {metric: {language: accuracy}}.
metrics["accuracy_by_language"] = (
    df.groupby("language")[["top1_correct", "top3_correct"]].mean().to_dict()
)

# describe() statistics of the confidence score for correct vs. incorrect Top-1 results.
metrics["confidence_stats"] = {
    "correct": df.loc[df["top1_correct"], "confidence"].describe().to_dict(),
    "incorrect": df.loc[~df["top1_correct"], "confidence"].describe().to_dict(),
}

metrics

{'run_metadata': {'run_name': 'baseline',
  'timestamp_utc': '2025-12-25T21:27:11.249760',
  'embedding_model': 'text-embedding-3-small',
  'top_k': 3,
  'confidence_formula': '0.7*similarity + 0.3*margin',
  'notes': 'Baseline evaluation before multilingual FAQ augmentation'},
 'num_queries': 32,
 'top1_accuracy': 0.84375,
 'top3_accuracy': 0.9375,
 'accuracy_by_language': {'top1_correct': {'en': 0.95,
   'mk': 0.6666666666666666},
  'top3_correct': {'en': 1.0, 'mk': 0.8333333333333334}},
 'confidence_stats': {'correct': {'count': 27.0,
   'mean': 0.5346296296296297,
   'std': 0.22070030987886444,
   'min': 0.08,
   '25%': 0.381,
   '50%': 0.648,
   '75%': 0.688,
   'max': 0.807},
  'incorrect': {'count': 5.0,
   'mean': 0.0992,
   'std': 0.06338532953294476,
   'min': 0.032,
   '25%': 0.042,
   '50%': 0.097,
   '75%': 0.151,
   'max': 0.174}}}

In [55]:
# Aggregate metrics are written to <RESULTS_DIR>/<run_name>_metrics.json.
METRICS_PATH = os.path.join(RESULTS_DIR, RUN_METADATA["run_name"] + "_metrics.json")

# Serialize first, then write the text in one go.
with open(METRICS_PATH, mode="w", encoding="utf-8") as metrics_file:
    metrics_file.write(json.dumps(metrics, indent=2))

METRICS_PATH

'results/baseline_metrics.json'