In [2]:
#!/usr/bin/env python3
"""
Evaluate an OpenWebUI-powered compliance chatbot against a Q&A gold set.

What it does:
1) Extracts (question, gold answer) pairs from the provided PDF into a two-column table.
2) Sends each question to OpenWebUI Chat Completions endpoint and stores model answers.
3) Uses an evaluation model (via the same OpenWebUI endpoint, or any OpenAI-compatible endpoint)
   to score:
   - fidelity_score (0-5): faithfulness to the gold answer (no contradictions, covers key points)
   - quality_score  (0-5): clarity, completeness, usefulness (structure, actionable, concise)
4) Writes a results CSV and a JSONL log for auditability.

Requirements:
  pip install pdfplumber pandas requests python-dotenv

Environment variables (read with os.getenv; this script does not load a .env file itself, so export them or load the .env beforehand):
  # OpenWebUI target (the model you're evaluating)
  OWUI_BASE_URL=http://localhost:3000
  OWUI_API_KEY=            # optional depending on your setup
  OWUI_MODEL=llama3.1:8b   # any model visible to OpenWebUI
  OWUI_ENDPOINT=/api/chat/completions  # default; could also be /v1/chat/completions on newer setups

  # Evaluator (can be the same OpenWebUI or a different OpenAI-compatible server)
  EVAL_BASE_URL=http://localhost:3000
  EVAL_API_KEY=
  EVAL_MODEL=qwen2.5:14b-instruct
  EVAL_ENDPOINT=/api/chat/completions

Usage:
  python eval_chatbot.py \
    --pdf "Intelligence_artificielle_-_Questions_et_r_ponses_.pdf" \
    --out results.csv

Notes:
- This script assumes the PDF is a Commission "Questions et réponses" format, where questions are headings.
- The extraction heuristic may need minor tuning if your PDF formatting differs.
"""

import argparse
import json
import os
import re
import time
from typing import Dict, List, Tuple, Optional

import pandas as pd
import pdfplumber
import requests

# French interrogative prefixes that mark a line as the start of a question
# heading. The tuple is passed directly to str.startswith() in
# extract_qa_from_pdf, so matching is case-sensitive and prefix-based.
# NOTE(review): under prefix matching some entries are redundant ("Quel" already
# covers "Quelles"/"Quels", "Qu'" covers "Qu'est-ce"); harmless, kept as-is.
INTERROG_STARTS = (
    "Pourquoi", "À qui", "Quelles", "Quels", "Comment", "Quand",
    "En quoi", "Quel", "Qu'est-ce", "Qu’", "Qu'", "À quoi",
)

def extract_qa_from_pdf(pdf_path: str) -> pd.DataFrame:
    """Build a (question, answer) table from a Q&A-style PDF.

    A line beginning with one of INTERROG_STARTS opens a question; following
    lines are appended to it until one ends with "?" (or the joined question
    exceeds 350 characters). Every subsequent line, up to the next line that
    opens a question, is collected as the answer.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        DataFrame with the columns "question" and "answer".

    Raises:
        RuntimeError: If no question line was found at all.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Flatten the pages into one list of stripped, non-blank lines.
    lines = [
        raw.strip()
        for text in page_texts
        for raw in text.splitlines()
        if raw and raw.strip()
    ]
    total = len(lines)

    records: List[Dict[str, str]] = []
    pos = 0
    while pos < total:
        if not lines[pos].startswith(INTERROG_STARTS):
            pos += 1
            continue

        # Gather the (possibly multi-line) question heading.
        question_parts = [lines[pos]]
        while pos + 1 < total and not question_parts[-1].endswith("?"):
            pos += 1
            question_parts.append(lines[pos].strip())
            if len(" ".join(question_parts)) > 350:
                break
        question = " ".join(question_parts).replace("  ", " ").strip()

        # Everything up to the next question heading belongs to the answer.
        pos += 1
        answer_parts: List[str] = []
        while pos < total and not lines[pos].startswith(INTERROG_STARTS):
            answer_parts.append(lines[pos])
            pos += 1
        answer = " ".join(answer_parts).replace("  ", " ").strip()

        records.append({"question": question, "answer": answer})
        # pos now sits on the next question heading (or at the end of input).

    if not records:
        raise RuntimeError("No Q&A pairs extracted. Check PDF format or tweak INTERROG_STARTS.")
    return pd.DataFrame(records)


def _headers(api_key: str) -> Dict[str, str]:
    h = {"Content-Type": "application/json"}
    if api_key:
        h["Authorization"] = f"Bearer {api_key}"
    return h


def call_chat_completions(
    base_url: str,
    endpoint: str,
    api_key: str,
    model: str,
    messages: List[Dict[str, str]],
    temperature: float = 0.2,
    timeout_s: int = 120,
) -> Dict:
    """POST a non-streaming chat-completions request and return the decoded JSON.

    Args:
        base_url: Server root; a trailing "/" is removed before joining.
        endpoint: Path appended verbatim to the base URL.
        api_key: Bearer token; no Authorization header is sent when empty.
        model: Model identifier placed in the payload.
        messages: Chat messages, each a dict with "role" and "content".
        temperature: Sampling temperature placed in the payload.
        timeout_s: Timeout in seconds handed to requests.post.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.HTTPError: When the server answers with an error status.
    """
    response = requests.post(
        "".join((base_url.rstrip("/"), endpoint)),
        headers=_headers(api_key),
        json={
            "model": model,
            "messages": messages,
            "temperature": temperature,
            "stream": False,
        },
        timeout=timeout_s,
    )
    response.raise_for_status()
    return response.json()


def extract_text_from_response(resp: Dict) -> str:
    """Pull the assistant text out of a chat-completions style response.

    Shapes tried, in order:
      1. OpenAI chat:        resp["choices"][0]["message"]["content"]
      2. Legacy completions: resp["choices"][0]["text"]
      3. Ollama native chat: resp["message"]["content"]

    Args:
        resp: Decoded JSON body of the response.

    Returns:
        The first non-empty string found, or "" when the response is not a
        dict, has none of the shapes above, or carries non-string content.
    """
    if not isinstance(resp, dict):
        return ""

    candidates = []

    choices = resp.get("choices")
    first_choice = choices[0] if isinstance(choices, list) and choices else None
    if isinstance(first_choice, dict):
        message = first_choice.get("message")
        if isinstance(message, dict):
            candidates.append(message.get("content"))
        candidates.append(first_choice.get("text"))

    top_level_message = resp.get("message")
    if isinstance(top_level_message, dict):
        candidates.append(top_level_message.get("content"))

    # Only genuine strings are returned so callers can safely call .strip().
    for candidate in candidates:
        if isinstance(candidate, str) and candidate:
            return candidate
    return ""


# System prompt sent to the evaluator model in main(). It asks for a JSON-only
# reply with the keys fidelity_score, quality_score, fidelity_rationale and
# quality_rationale; main() reads exactly those four keys from the parsed reply.
EVAL_SYSTEM = """You are a strict evaluator for an EU AI compliance chatbot.
You will be given:
- QUESTION
- GOLD_ANSWER (reference)
- MODEL_ANSWER (candidate)

Return ONLY valid JSON with:
{
  "fidelity_score": <0-5 integer>,
  "quality_score": <0-5 integer>,
  "fidelity_rationale": "<brief>",
  "quality_rationale": "<brief>"
}

Scoring:
- fidelity_score:
  5 = fully aligned with gold; no contradictions; covers the key obligations/definitions.
  3 = mostly aligned; minor omissions or slight overreach without contradiction.
  1 = significant mismatch or misleading.
  0 = unrelated or clearly wrong.

- quality_score:
  5 = clear, well-structured, directly answers, actionable, appropriately cautious.
  3 = understandable but could be clearer/structured; some verbosity/ambiguity.
  1 = confusing, poorly structured, or unhelpful.
  0 = unusable.

Rules:
- Do not invent facts. Evaluate only based on GOLD_ANSWER.
- Penalize MODEL_ANSWER if it adds claims not supported by GOLD_ANSWER (hallucination risk).
"""


def safe_parse_json(text: str) -> Optional[Dict]:
    """Return the first JSON object embedded in text, or None.

    The evaluator may wrap its JSON in prose or a Markdown fence, so the text
    is scanned for each "{" and decoding is attempted from that position.
    Unlike a greedy "first { to last }" match, this still succeeds when
    trailing text contains stray braces.

    Args:
        text: Raw model output.

    Returns:
        The first value that decodes to a dict; None when text is not a
        string or contains no decodable JSON object. Top-level lists and
        scalars are never returned, so callers can always use .get().
    """
    if not isinstance(text, str):
        return None

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass  # not valid JSON from this brace; try the next one
        else:
            if isinstance(candidate, dict):
                return candidate
        start = text.find("{", start + 1)
    return None


def _normalize_base_url(base_url: str, endpoint: str) -> str:
    """Return base_url without a trailing slash or an already-appended endpoint path."""
    base = base_url.rstrip("/")
    path = endpoint.rstrip("/")
    if path and base.endswith(path):
        base = base[: -len(path)].rstrip("/")
    return base


def _coerce_score(value) -> Optional[int]:
    """Convert an evaluator score to an int clamped to 0..5, or None if not numeric."""
    try:
        # float() first so that "4.0" is accepted as well as 4 and "4".
        return max(0, min(5, int(float(value))))
    except (TypeError, ValueError, OverflowError):
        return None


def main():
    """Run the end-to-end evaluation: extract Q&A, query the chatbot, score, save.

    Command-line arguments: --pdf (required), --out, --jsonl, --sleep.
    Endpoint settings are read from the OWUI_* and EVAL_* environment
    variables; every EVAL_* value falls back to the matching OWUI_* value.

    Writes a results CSV (with a trailing "__SUMMARY__" row holding the mean
    scores) and a JSONL audit log with one line per question.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--pdf", required=True, help="Path to the Q&A PDF.")
    ap.add_argument("--out", default="results.csv", help="Output CSV path.")
    ap.add_argument("--jsonl", default="results.jsonl", help="Audit log JSONL path.")
    ap.add_argument("--sleep", type=float, default=0.2, help="Sleep seconds between calls.")
    args = ap.parse_args()

    # Load config. The base URL must NOT contain the endpoint path, because
    # call_chat_completions appends the endpoint itself; a base that already
    # ends with the endpoint is trimmed so the path is not duplicated.
    owui_ep = os.getenv("OWUI_ENDPOINT", "/api/chat/completions")
    owui_base = _normalize_base_url(
        os.getenv("OWUI_BASE_URL", "https://k2vm-74.mde.epf.fr"), owui_ep
    )
    owui_key = os.getenv("OWUI_API_KEY", "")
    owui_model = os.getenv("OWUI_MODEL", "n8n")

    eval_ep = os.getenv("EVAL_ENDPOINT", owui_ep)
    eval_base = _normalize_base_url(os.getenv("EVAL_BASE_URL", owui_base), eval_ep)
    eval_key = os.getenv("EVAL_API_KEY", owui_key)
    eval_model = os.getenv("EVAL_MODEL", owui_model)

    df = extract_qa_from_pdf(args.pdf)
    df["model_answer"] = ""
    df["fidelity_score"] = None
    df["quality_score"] = None
    df["fidelity_rationale"] = ""
    df["quality_rationale"] = ""

    os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
    os.makedirs(os.path.dirname(args.jsonl) or ".", exist_ok=True)

    with open(args.jsonl, "w", encoding="utf-8") as fjsonl:
        for idx, row in df.iterrows():
            q = row["question"]
            gold = row["answer"]

            # 1) Call your chatbot (OpenWebUI)
            resp = call_chat_completions(
                base_url=owui_base,
                endpoint=owui_ep,
                api_key=owui_key,
                model=owui_model,
                messages=[{"role": "user", "content": q}],
                temperature=0.2,
            )
            model_answer = extract_text_from_response(resp).strip()
            df.at[idx, "model_answer"] = model_answer

            time.sleep(args.sleep)

            # 2) Evaluate with an evaluator model
            eval_prompt = f"""QUESTION:
{q}

GOLD_ANSWER:
{gold}

MODEL_ANSWER:
{model_answer}
"""
            eresp = call_chat_completions(
                base_url=eval_base,
                endpoint=eval_ep,
                api_key=eval_key,
                model=eval_model,
                messages=[
                    {"role": "system", "content": EVAL_SYSTEM},
                    {"role": "user", "content": eval_prompt},
                ],
                temperature=0.0,
            )
            etext = extract_text_from_response(eresp).strip()

            # The evaluator may reply with something that is not a JSON
            # object; treat that as "no scores" rather than crashing.
            parsed = safe_parse_json(etext)
            ej = parsed if isinstance(parsed, dict) else {}

            fidelity = _coerce_score(ej.get("fidelity_score"))
            quality = _coerce_score(ej.get("quality_score"))
            # str() guards against a rationale that is not a string.
            fidelity_rationale = str(ej.get("fidelity_rationale") or "").strip()
            quality_rationale = str(ej.get("quality_rationale") or "").strip()

            df.at[idx, "fidelity_score"] = fidelity
            df.at[idx, "quality_score"] = quality
            df.at[idx, "fidelity_rationale"] = fidelity_rationale
            df.at[idx, "quality_rationale"] = quality_rationale

            # Audit log line
            fjsonl.write(json.dumps({
                "i": int(idx),
                "question": q,
                "gold_answer": gold,
                "model_answer": model_answer,
                "model_call": {"base_url": owui_base, "endpoint": owui_ep, "model": owui_model},
                "eval_call": {"base_url": eval_base, "endpoint": eval_ep, "model": eval_model},
                "eval_raw_text": etext,
                "eval_parsed": {
                    "fidelity_score": fidelity,
                    "quality_score": quality,
                    "fidelity_rationale": fidelity_rationale,
                    "quality_rationale": quality_rationale,
                }
            }, ensure_ascii=False) + "\n")
            # Flush per row so the audit trail survives a failure on a later row.
            fjsonl.flush()

            time.sleep(args.sleep)

    # Aggregate metrics (unscored rows become NaN and are ignored by mean()).
    fidelity_mean = pd.to_numeric(df["fidelity_score"], errors="coerce").mean()
    quality_mean = pd.to_numeric(df["quality_score"], errors="coerce").mean()

    # Add summary row (optional)
    summary = {
        "question": "__SUMMARY__",
        "answer": "",
        "model_answer": "",
        "fidelity_score": fidelity_mean,
        "quality_score": quality_mean,
        "fidelity_rationale": "mean over evaluated rows",
        "quality_rationale": "mean over evaluated rows",
    }
    df_out = pd.concat([df, pd.DataFrame([summary])], ignore_index=True)

    df_out.to_csv(args.out, index=False, encoding="utf-8")
    print(f"Wrote: {args.out}")
    print(f"Wrote: {args.jsonl}")
    print(f"Mean fidelity_score: {fidelity_mean:.2f} / 5")
    print(f"Mean quality_score : {quality_mean:.2f} / 5")


# Script entry point: run the evaluation only when executed directly.
# NOTE: main() parses sys.argv and requires --pdf, so invoking it from a
# notebook kernel without arguments exits with an argparse usage error.
if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] --pdf PDF [--out OUT] [--jsonl JSONL]
                             [--sleep SLEEP]
ipykernel_launcher.py: error: the following arguments are required: --pdf


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
import os
import time
import json
import re
from typing import Dict, Any, Tuple, Optional

import pandas as pd
import requests


# =========================
# 0) CONFIG
# =========================

# OpenWebUI base URL (e.g. http://localhost:3000 or http://openwebui:8080).
# A trailing "/" is stripped before CHAT_COMPLETIONS_PATH is appended.
OPENWEBUI_BASE_URL = os.getenv("OPENWEBUI_BASE_URL", "http://localhost:3000")

# Path appended to the base URL; OpenWebUI often supports OpenAI-compatible
# routes under /v1.
CHAT_COMPLETIONS_PATH = os.getenv("OPENWEBUI_CHAT_PATH", "/v1/chat/completions")

# Model used to generate answers (your compliance chatbot model)
MODEL_NAME = os.getenv("MODEL_NAME", "ollama:llama3.1")  # example

# Validation/judge model (can be different from MODEL_NAME)
JUDGE_MODEL_NAME = os.getenv("JUDGE_MODEL_NAME", "ollama:qwen2.5")  # example

# Bearer token, sent only when non-empty (needed if OpenWebUI is protected)
OPENWEBUI_API_KEY = os.getenv("OPENWEBUI_API_KEY", "")  # optional

# Input CSV (must contain "question" and "reference_answer") and output CSV
INPUT_CSV = os.getenv("INPUT_CSV", "eu_ai_act_qna_dataset.csv")
OUTPUT_CSV = os.getenv("OUTPUT_CSV", "eu_ai_act_qna_scored.csv")

# Throttling / reliability
REQUEST_TIMEOUT_S = int(os.getenv("REQUEST_TIMEOUT_S", "120"))  # per HTTP request, seconds
SLEEP_BETWEEN_CALLS_S = float(os.getenv("SLEEP_BETWEEN_CALLS_S", "0.5"))  # pause after each row
MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3"))  # total attempts per API call


# =========================
# 1) LOW-LEVEL API CALL
# =========================

def _headers() -> Dict[str, str]:
    """Return JSON request headers, with a bearer token when OPENWEBUI_API_KEY is set."""
    if not OPENWEBUI_API_KEY:
        return {"Content-Type": "application/json"}
    return {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENWEBUI_API_KEY}",
    }


def call_openwebui_chat(model: str, user_message: str, system_message: Optional[str] = None) -> str:
    """Send one chat turn to OpenWebUI and return the assistant's text.

    Uses an OpenAI-like chat.completions endpoint. Failed attempts are retried
    with a linearly growing pause (1s, 2s, ...).

    Args:
        model: Model identifier placed in the payload.
        user_message: Content of the user message.
        system_message: Optional system prompt, sent first when non-empty.

    Returns:
        The stripped content of the first choice's message.

    Raises:
        RuntimeError: When every attempt failed; the last error is chained.
    """
    url = OPENWEBUI_BASE_URL.rstrip("/") + CHAT_COMPLETIONS_PATH

    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    messages.append({"role": "user", "content": user_message})

    payload = {
        "model": model,
        "messages": messages,
        "temperature": 0.2,
    }

    # Always try at least once: with MAX_RETRIES <= 0 the loop would otherwise
    # be skipped and the function would silently return None.
    attempts = max(1, MAX_RETRIES)
    last_err = None
    for attempt in range(1, attempts + 1):
        try:
            r = requests.post(url, headers=_headers(), json=payload, timeout=REQUEST_TIMEOUT_S)
            r.raise_for_status()
            data = r.json()

            # OpenAI-like shape:
            # data["choices"][0]["message"]["content"]
            content = data["choices"][0]["message"]["content"]
            return content.strip()

        except (
            requests.RequestException,  # network errors, timeouts, HTTP error status
            ValueError,                 # body is not valid JSON
            KeyError, IndexError, TypeError, AttributeError,  # unexpected response shape
        ) as e:
            last_err = e
            if attempt < attempts:
                time.sleep(1.0 * attempt)

    raise RuntimeError(f"OpenWebUI call failed after {attempts} tries: {last_err}") from last_err


# =========================
# 2) JUDGE PROMPT + PARSING
# =========================

# French-language system prompt for the judge model. It defines the two 0-5
# scores and asks for a reply that is exclusively single-line JSON with the
# keys fidelity_score, quality_score and comment (the keys judge_pair reads).
JUDGE_SYSTEM = (
    "Vous êtes un évaluateur strict d'un chatbot de compliance (EU AI Act). "
    "Vous devez noter la réponse du modèle par rapport à la réponse de référence.\n\n"
    "Définitions des scores (0 à 5):\n"
    "- fidelity_score: fidélité factuelle et normative par rapport à la référence (0 = faux, 5 = parfaitement fidèle)\n"
    "- quality_score: clarté, structure, actionnabilité, absence d'hallucinations, bonne mise en garde (0 = mauvais, 5 = excellent)\n\n"
    "Répondez EXCLUSIVEMENT en JSON compact sur une seule ligne, au format:\n"
    "{\"fidelity_score\":<0-5>,\"quality_score\":<0-5>,\"comment\":\"...\"}\n"
)

def parse_judge_json(text: str) -> Dict[str, Any]:
    """Extract the first JSON object from the judge's output.

    Robust to extra text around the JSON: decoding is attempted from each "{"
    in turn, so prose, Markdown fences, or stray braces after the object do
    not break parsing.

    Args:
        text: Raw judge output.

    Returns:
        The first value that decodes to a dict.

    Raises:
        ValueError: When text is not a string or contains no JSON object.
    """
    if not isinstance(text, str):
        raise ValueError(f"Judge output is not JSON: {text!r}")

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass  # not valid JSON from this brace; try the next one
        else:
            if isinstance(candidate, dict):
                return candidate
        start = text.find("{", start + 1)

    raise ValueError(f"Judge output is not JSON: {text[:200]}")


def judge_pair(reference_answer: str, model_answer: str, question: str) -> Tuple[float, float, str]:
    """Ask the judge model to score a model answer against the reference.

    Args:
        reference_answer: Gold answer for the question.
        model_answer: Answer produced by the chatbot under test.
        question: The question both answers respond to.

    Returns:
        (fidelity_score, quality_score, comment); both scores are floats
        clamped to the 0..5 range that the judge prompt defines.

    Raises:
        ValueError: When the judge output has no JSON object, or a score is
            missing, non-numeric, or not finite.
        RuntimeError: Propagated from call_openwebui_chat when the call fails.
    """
    import math  # local import: only needed for the finite-number check

    user_msg = (
        "Évaluez la réponse du modèle.\n\n"
        f"QUESTION:\n{question}\n\n"
        f"RÉPONSE DE RÉFÉRENCE:\n{reference_answer}\n\n"
        f"RÉPONSE DU MODÈLE:\n{model_answer}\n"
    )
    raw = call_openwebui_chat(JUDGE_MODEL_NAME, user_msg, system_message=JUDGE_SYSTEM)
    obj = parse_judge_json(raw)
    if not isinstance(obj, dict):
        raise ValueError(f"Judge output is not a JSON object: {raw[:200]}")

    def _score(key: str) -> float:
        # A missing key would make float(None) raise an opaque TypeError;
        # report it as a ValueError that shows what the judge actually said.
        try:
            value = float(obj.get(key))
        except (TypeError, ValueError):
            raise ValueError(f"Judge output has no numeric {key!r}: {raw[:200]}") from None
        if not math.isfinite(value):
            raise ValueError(f"Judge output has a non-finite {key!r}: {raw[:200]}")
        return max(0.0, min(5.0, value))

    fidelity = _score("fidelity_score")
    quality = _score("quality_score")
    # `or ""` keeps a JSON null comment from becoming the string "None".
    comment = str(obj.get("comment") or "").strip()
    return fidelity, quality, comment


# =========================
# 3) MAIN LOOP
# =========================

def main():
    """Answer and score every unprocessed row of INPUT_CSV, saving to OUTPUT_CSV.

    For each row, the chatbot model answers the question and the judge model
    scores that answer against the reference. Rows that already carry a
    non-empty model_response are skipped, and the CSV is rewritten after each
    processed row so that progress survives a failure.

    Raises:
        ValueError: When the input lacks the question/reference_answer columns.
    """
    df = pd.read_csv(INPUT_CSV)

    # Ensure columns exist
    if "question" not in df.columns or "reference_answer" not in df.columns:
        raise ValueError("CSV must contain columns: question, reference_answer")

    # Prepare output columns (order preserved: response, scores, comment).
    # An output column that already exists but is empty is loaded by read_csv
    # as float64 NaN; cast to object so strings/floats can be stored in it,
    # and turn missing text into "".
    for col, default, is_text in (
        ("model_response", "", True),
        ("fidelity_score", None, False),
        ("quality_score", None, False),
        ("judge_comment", "", True),
    ):
        if col not in df.columns:
            df[col] = default
        elif is_text:
            cleaned = ["" if pd.isna(v) else v for v in df[col]]
            df[col] = pd.Series(cleaned, index=df.index, dtype=object)
        else:
            df[col] = df[col].astype(object)

    # Optional: a short system prompt for your chatbot behavior
    chatbot_system = (
        "Vous êtes un assistant de compliance centré sur l'EU AI Act. "
        "Répondez clairement, structurez la réponse, et si vous n'êtes pas certain, dites-le."
    )

    for i, row in df.iterrows():
        # A missing cell would otherwise be sent to the models as the text "nan".
        if pd.isna(row["question"]) or pd.isna(row["reference_answer"]):
            print(f"Skipping row {i}: missing question or reference_answer")
            continue

        q = str(row["question"]).strip()
        ref = str(row["reference_answer"]).strip()

        # Skip if already processed (useful for resume)
        existing = row["model_response"]
        if isinstance(existing, str) and existing.strip():
            continue

        # 1) Model answer
        model_ans = call_openwebui_chat(MODEL_NAME, q, system_message=chatbot_system)
        df.at[i, "model_response"] = model_ans

        # 2) Judge scores
        fidelity, quality, comment = judge_pair(ref, model_ans, q)
        df.at[i, "fidelity_score"] = fidelity
        df.at[i, "quality_score"] = quality
        df.at[i, "judge_comment"] = comment

        # Persist at each step (safe)
        df.to_csv(OUTPUT_CSV, index=False)

        time.sleep(SLEEP_BETWEEN_CALLS_S)

    # Write once more so the file exists even when every row was skipped.
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"Done. Scored CSV saved to: {OUTPUT_CSV}")


# Script entry point: run the scoring loop only when executed directly.
if __name__ == "__main__":
    main()
