**Setup**

In [1]:
from pathlib import Path
import sys
import csv

from dotenv import load_dotenv

project_root = Path().resolve().parents[0]
src_path = project_root / "src"
sys.path.append(str(src_path))

load_dotenv(dotenv_path=project_root / ".env")

from rag_pipeline import answer_question

**Load evaluation dataset**

In [2]:
docs_dir = project_root / "docs"
dataset_path = docs_dir / "week2_eval_dataset.csv"

questions = []
with dataset_path.open("r", encoding="utf-8", newline="") as f:
    reader = csv.DictReader(f)
    for row in reader:
        questions.append(row)

len(questions), questions[0]

(30,
 {'id': 'Q01',
  'question': 'What city and country is Cubic Engineering Consultancy based in?',
  'expected_source': 'cec_overview',
  'notes': 'basic company location'})

**Run evaluation**

In [3]:
results = []

for row in questions:
    qid = row["id"]
    question = row["question"]
    expected = row["expected_source"]

    print(f"Running {qid}: {question}")
    result = answer_question(question)

    results.append(
        {
            "id": qid,
            "question": question,
            "expected_source": expected,
            "ok": result["ok"],
            "reason": result["reason"],
            "sources": result.get("sources", []),
        }
    )

Running Q01: What city and country is Cubic Engineering Consultancy based in?
Running Q02: Name two stages of the project lifecycle that CEC supports.
Running Q03: What is the main goal of the internal RAG system at Cubic?
Running Q04: How many main steps does the internal RAG pipeline use
Running Q05: Which embedding model is currently used in the RAG prototype?
Running Q06: Which generation model is used to produce answers in the RAG system?
Running Q07: How many top documents are retrieved by default for each question?
Running Q08: What happens if none of the retrieved documents meet the minimum similarity threshold?
Running Q09: What is the main role of LLM assistants at CEC with respect to final engineering decisions?
Running Q10: Which values does CEC emphasize as core values in its work?
Running Q11: What are two example tasks the AI assistant supports engineers with?
Running Q12: Why is it important that the assistant uses only the retrieved context when answering?
Running Q13:

**Metrics**

In [4]:
total = len(results)
correct = 0
safe_refusal = 0
failures = []

for r in results:
    if not r["ok"] and r["reason"] == "no_relevant_context":
        safe_refusal += 1
    elif r["expected_source"] in r["sources"]:
        correct += 1
    else:
        failures.append(r)

print(f"Total questions: {total}")
print(f"Correctly grounded answers: {correct}")
print(f"Safe refusals: {safe_refusal}")
print(f"Failures: {len(failures)}")


Total questions: 30
Correctly grounded answers: 17
Safe refusals: 3
Failures: 10


**Failure examples**

In [5]:
failures[:3]

[{'id': 'Q04',
  'question': 'How many main steps does the internal RAG pipeline use',
  'expected_source': ' according to the documentation?',
  'ok': True,
  'reason': 'answered',
  'sources': ['rag_overview']},
 {'id': 'Q12',
  'question': 'Why is it important that the assistant uses only the retrieved context when answering?',
  'expected_source': 'llm_policies',
  'ok': True,
  'reason': 'answered',
  'sources': ['rag_overview']},
 {'id': 'Q13',
  'question': 'What should the assistant do when it does not have enough information to answer a question?',
  'expected_source': 'llm_policies',
  'ok': True,
  'reason': 'answered',
  'sources': ['rag_overview']}]

**Demo refusal**

In [6]:
answer_question("What is the capital of France?")

{'ok': False,
 'reason': 'no_relevant_context',
 'answer': 'I’m not able to answer this question based on the available documents. Please consult a human engineer or update the knowledge base.',
 'sources': [],
 'retrieved': [{'doc_id': 'cec_overview',
   'text': '# Cubic Engineering Consultancy – Overview\n\nCubic Engineering Consultancy (CEC) is an engineering and architectural consultancy based in Dubai, United Arab Emirates. CEC supports clients across the full project lifecycle, including design, feasibility studies, planning, project management and construction supervision.\n\nCEC focuses on solving complex engineering challenges, maintaining high quality standards, and building long-term partnerships with its clients. The company emphasizes honesty, responsibility, transparency and customer care as core values.\n\nIn internal LLM and RAG experiments, CEC uses AI assistants to help engineers navigate documentation, summarize technical notes and speed up access to project informat