In [2]:
import numpy as np
import chromadb
import os
from groq import Groq
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment; later cells
# read CHROMA_API_KEY and GROQ_API_KEY from os.environ.
load_dotenv()

True

In [3]:
class RAG:
    """Thin wrapper around a Chroma Cloud collection used as the retrieval store.

    Attributes:
        chroma_client: The ``chromadb.CloudClient`` connection.
        collection: The collection opened (or created) under ``collection_name``.
        count: Record count read from the collection at construction time and
            advanced locally as documents are added; used to mint ``id<N>`` ids.
    """

    def __init__(self, collection_name="aus_food_nutrition"):
        """Connect to Chroma Cloud and open (or create) ``collection_name``.

        The API key is read from the ``CHROMA_API_KEY`` environment variable.
        """
        self.chroma_client = chromadb.CloudClient(
            tenant='a0123436-2e87-4752-8983-73168aafe2e9',
            database='nutribot',
            api_key=os.environ.get("CHROMA_API_KEY"),
        )
        self.collection = self.chroma_client.get_or_create_collection(name=collection_name)
        self.count = self.collection.count()

    def add_documents(self, docs, batch_size=100):
        """Upsert ``docs`` into the collection under sequential ``id<N>`` ids.

        Documents are sent in batches of ``batch_size`` instead of one request
        per document, which keeps large ingests from being dominated by
        network round-trips. The ids assigned are the same as with one-by-one
        insertion.

        Args:
            docs: Iterable of document strings (lists and generators both work).
            batch_size: Maximum number of documents per ``upsert`` call.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be >= 1")

        docs = list(docs)  # materialise so generators can be sliced
        for start in range(0, len(docs), batch_size):
            batch = docs[start:start + batch_size]
            ids = [f"id{self.count + offset}" for offset in range(len(batch))]
            self.collection.upsert(ids=ids, documents=batch)
            # Advance only after a successful write so a failed batch can be retried.
            self.count += len(batch)

    def retrieve(self, prompt, n_results=2):
        """Return the documents most similar to ``prompt``.

        Args:
            prompt: Query text.
            n_results: Number of documents to request.

        Returns:
            List of document strings for the single query; empty when the
            result carries no documents.
        """
        res = self.collection.query(query_texts=[prompt], n_results=n_results)
        # "documents" may be missing, None, or an empty list; treat all as no hits.
        documents = res.get("documents") or [[]]
        return documents[0]

In [None]:
# Two hand-written sample facts used to seed the collection.
seed_documents = [
    "Vegemite is a popular Australian spread made from brewers' yeast extract.",
    "Kangaroo meat is a lean source of protein, low in fat.",
]

rag = RAG()
rag.add_documents(seed_documents)

In [11]:
# Build the plan-generation prompt, then retrieve supporting context for it.

# Example classifier output for one user. It is pasted verbatim into the prompt,
# which describes it as JSON, so it must actually be valid JSON: commas between
# members, quoted percentage values, and a closing brace.
analyzed_health_condition = """
{
    "obesity_prediction": {
        "obesity_level": "Overweight_Level_II",
        "confidence": "10%"
    },
    "diabetes_prediction": {
        "diabetes": true,
        "confidence": "90%"
    }
}
"""

# Output template shown to the model. INT / STRING / ARRAY are placeholders the
# model replaces; the surrounding brackets must nest correctly ("]" closes the
# weekly_plan array before "}" closes the object) so the model copies a
# well-formed shape.
weekly_plan_format = """{
"suggestion": STRING,
"weekly_plan": [
    {
        "week": 1,
        "target_calories_per_day": INT,
        "focus": STRING,
        "workouts": [ARRAY OF STRINGS],
        "meal_notes": STRING,
        "reminders": [ARRAY OF STRINGS]
    },
    ... (repeat for as many weeks as appropriate)
]}"""
prompt = f"""
You are a nutrition and fitness assistant.
Below is an analyzed health condition for a user, expressed in JSON: {analyzed_health_condition}
Your task: Based on the analyzed health condition and using the retrieved knowledge, generate a weekly plan strictly in this JSON format (replace INT and STRING placeholders): {weekly_plan_format} 
"""
# The full prompt text is used as the similarity query; the hits are then
# appended to the prompt as context for the model.
relevant_texts = rag.retrieve(prompt, n_results=2)
prompt = prompt + f"\tUse this as context for answering: {relevant_texts}"
print("Retrieved texts:", relevant_texts)

Retrieved texts: ['The Australian Dietary Guidelines advise reducing the intake of processed foods and sugary drinks.', 'A balanced diet, as recommended by Australian Dietary Guidelines, includes moderate portions of protein and whole grains.']


In [12]:
# Send the context-augmented prompt to Groq as a single user turn and print the reply.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
chat_completion = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=[{"role": "user", "content": prompt}],
)
print(chat_completion.choices[0].message.content)

Based on the given health condition and the provided knowledge, I'll generate a weekly plan for the user. 

Given the user's obesity level is categorized as Overweight_Level_II with 10% confidence in obesity prediction, and the user has diabetes with 90% confidence, here's a suggested weekly plan:

```json
{
  "suggestion": "Maintain a strict diet and regular exercise to manage your diabetes and work towards healthy weight management.",
  "weekly_plan": [
    {
      "week": 1,
      "target_calories_per_day": 1700,
      "focus": "Balancing macronutrients (proteins, healthy fats, complex carbohydrates)",
      "workouts": [
        "Monday: 30 minutes of brisk walking",
        "Wednesday: Bodyweight exercises (20 reps of push-ups, squats, lunges)",
        "Friday: 20 minutes of cycling"
      ],
      "meal_notes": "Eat frequent, balanced meals with portion control. Include whole grains, lean proteins, and plenty of vegetables and fruits.",
      "reminders": [
        "Drink at lea

# GPQA Evaluation

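GPQA was introduced in November 2023, so a model whose training cutoff is later than that may have seen the benchmark during training.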
### Model Cutoff
**Llama 3.3 70B**: December 2023 (note: the evaluation code below defaults to `llama-3.1-8b-instant`)

In [37]:
from datasets import load_dataset
import random
from huggingface_hub import login
# Authenticate with the Hugging Face Hub; in a notebook this renders an interactive login widget.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [38]:
# Load the "gpqa_diamond" configuration of GPQA and keep its "train" split.
# Note: the GPQA run_eval defined below loads the dataset again itself rather than using this variable.
dataset = load_dataset("Idavidrein/gpqa", "gpqa_diamond")["train"]

In [57]:
import os
import random
import re
from groq import Groq
from datasets import load_dataset

class APIUnavailable(RuntimeError):
    """Raised when a call to the LLM API fails, so the evaluation loop can stop."""

def _extract_item(example):
    # Try revised/capitalized keys first; fall back to HF schema.
    q = example.get("Question") or example.get("question")
    ca = example.get("Correct Answer") or example.get("correct_answer")
    inc = [
        example.get("Incorrect Answer 1"),
        example.get("Incorrect Answer 2"),
        example.get("Incorrect Answer 3"),
    ]
    if not any(inc) and "incorrect_answers" in example:
        inc = example["incorrect_answers"]
    inc = [x for x in (s.strip() if isinstance(s, str) else s for s in inc) if x]
    return q, ca, inc

def evaluate_gpqa(question, options, model="llama-3.1-8b-instant", client=None):
    """Ask the model one multiple-choice question and return the letter it picked.

    Options are labelled ``A``, ``B``, ... in the order given. The model is
    instructed to reply with ``@@ANSWER=<LETTER>@@``; the letter is accepted
    only if it names one of the options actually offered. Matching is
    case-insensitive and the result is upper-cased.

    Args:
        question: Question text.
        options: Sequence of 1 to 26 answer strings.
        model: Groq model name.
        client: Optional pre-built client exposing
            ``chat.completions.create``. When omitted, a ``Groq`` client is
            created from the ``GROQ_API_KEY`` environment variable. Passing
            one avoids re-creating the client for every question.

    Returns:
        The chosen letter, or ``None`` when the reply contains no valid
        answer marker.

    Raises:
        ValueError: If ``options`` is empty or has more than 26 entries.
        APIUnavailable: If the API call fails or the response has no choices.
    """
    if not 0 < len(options) <= 26:
        raise ValueError("options must contain between 1 and 26 answer choices")
    if client is None:
        client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

    letters = [chr(65 + i) for i in range(len(options))]
    opts_str = "\n".join(f"{letter}. {opt}" for letter, opt in zip(letters, options))
    # No trailing ":" after the option list; it would be glued onto the last option's text.
    prompt = (
        "You are answering a multiple-choice question. Choose the correct answer "
        "from the options below:\n\n"
        f"Question: {question}\nOptions:\n{opts_str}"
    )

    # Only the network call and response unpacking are treated as API failures;
    # parsing the reply happens outside the try block.
    try:
        resp = client.chat.completions.create(
            messages=[
                {"role":"system","content":"You are a multiple-choice grader. Return the answer in EXACTLY this format and nothing else:\n@@ANSWER=<LETTER>@@"},
                {"role": "user", "content": prompt}
            ],
            model=model,
        )
        ans = (resp.choices[0].message.content or "").strip()
    except Exception as e:
        print(f"API error: {e}")
        raise APIUnavailable(str(e)) from e

    # The marker must end a line; the letter range is limited to the offered options.
    m = re.search(
        rf"@@ANSWER=([{letters[0]}-{letters[-1]}])@@\s*$",
        ans,
        flags=re.MULTILINE | re.IGNORECASE,
    )
    return m.group(1).upper() if m else None

def run_eval(num_questions, seed=None):
    """Score the model on the first ``num_questions`` GPQA Diamond items.

    For each usable item the three incorrect answers and the correct one are
    shuffled, the model is queried through ``evaluate_gpqa``, and a per-item
    line plus a final accuracy summary are printed. Items missing a question,
    a correct answer, or three incorrect answers are skipped and do not count
    as attempted. The loop stops at the first ``APIUnavailable``.

    NOTE: a later cell in this notebook defines another function named
    ``run_eval``; once that cell runs it replaces this one in the kernel.

    Args:
        num_questions: Upper bound on how many dataset rows to visit.
        seed: Optional seed for the option shuffle. When given, the option
            order (and therefore the "True" letters) is reproducible across
            runs; when ``None`` the global ``random`` state is used.

    Returns:
        Tuple ``(correct, attempted)``.
    """
    # A private generator keeps a seeded run from disturbing global random state.
    rng = random.Random(seed) if seed is not None else random

    dataset = load_dataset("Idavidrein/gpqa", "gpqa_diamond")["train"]
    correct = 0
    attempted = 0

    for i in range(min(num_questions, len(dataset))):
        q, ca, inc = _extract_item(dataset[i])
        if not (q and ca and len(inc) >= 3):
            continue

        options = inc[:3] + [ca]
        rng.shuffle(options)
        true = chr(65 + options.index(ca))

        try:
            pred = evaluate_gpqa(q, options)
        except APIUnavailable:
            # Make the early stop visible; the score below covers only earlier items.
            print(f"Stopping early at Q{i+1}: API unavailable.")
            break

        attempted += 1
        if pred == true:
            correct += 1

        print(f"Q{i+1}: {'✓' if pred == true else '✗'}  Pred={pred or '-'}  True={true}")

    if attempted == 0:
        print("No questions evaluated.")
    else:
        pct = 100.0 * correct / attempted
        print(f"\nFinal Score: {correct}/{attempted} ({pct:.1f}%)")

    return correct, attempted

# Run the GPQA evaluation over (at most) the first 100 dataset rows.
# The guard is true when this code runs as the main module, which includes a notebook cell.
if __name__ == "__main__":
    run_eval(100)

Q1: ✗  Pred=B  True=D
Q2: ✗  Pred=D  True=C
Q3: ✗  Pred=B  True=C
Q4: ✗  Pred=A  True=D
Q5: ✗  Pred=D  True=C
Q6: ✓  Pred=D  True=D
Q7: ✗  Pred=B  True=A
Q8: ✗  Pred=A  True=C
Q9: ✓  Pred=B  True=B
Q10: ✗  Pred=D  True=C
Q11: ✗  Pred=C  True=A
Q12: ✗  Pred=B  True=A
Q13: ✗  Pred=A  True=D
Q14: ✗  Pred=D  True=C
Q15: ✗  Pred=B  True=D
Q16: ✓  Pred=B  True=B
Q17: ✗  Pred=C  True=B
Q18: ✗  Pred=C  True=A
Q19: ✗  Pred=C  True=D
Q20: ✓  Pred=C  True=C
Q21: ✗  Pred=D  True=B
Q22: ✗  Pred=B  True=D
Q23: ✗  Pred=B  True=A
Q24: ✓  Pred=D  True=D
Q25: ✗  Pred=C  True=A
Q26: ✗  Pred=B  True=C
Q27: ✗  Pred=B  True=D
Q28: ✗  Pred=D  True=C
Q29: ✓  Pred=D  True=D
Q30: ✗  Pred=B  True=D
Q31: ✓  Pred=C  True=C
Q32: ✗  Pred=B  True=C
Q33: ✓  Pred=A  True=A
Q34: ✗  Pred=C  True=D
Q35: ✓  Pred=D  True=D
Q36: ✗  Pred=B  True=D
Q37: ✗  Pred=D  True=A
Q38: ✗  Pred=C  True=D
Q39: ✓  Pred=A  True=A
Q40: ✗  Pred=B  True=D
Q41: ✗  Pred=B  True=C
Q42: ✓  Pred=B  True=B
Q43: ✗  Pred=C  True=A
Q44: ✓  Pred=D  True

# Customized QA Evaluation

In [4]:
def chat_with_rag(prompt, rag=None, client=None, model="llama-3.1-8b-instant", n_results=5):
    """Answer ``prompt`` with retrieved context appended to it.

    The prompt is used as the retrieval query; the retrieved documents are
    appended to the prompt and the result is sent to the chat model as a
    single user message.

    Args:
        prompt: User question.
        rag: Optional retriever exposing ``retrieve(prompt, n_results=...)``.
            When omitted a new ``RAG()`` is created, which opens a fresh
            connection; pass one in to reuse it across many calls.
        client: Optional chat client exposing ``chat.completions.create``.
            When omitted a ``Groq`` client is created from ``GROQ_API_KEY``.
        model: Chat model name.
        n_results: Number of documents to retrieve.

    Returns:
        The model's reply text.
    """
    if rag is None:
        rag = RAG()
    if client is None:
        client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

    relevant_texts = rag.retrieve(prompt, n_results=n_results)
    augmented_prompt = prompt + f"\tUse this as context for answering: {relevant_texts}"
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": augmented_prompt,
            }
        ],
        model=model,
    )
    return chat_completion.choices[0].message.content

def simple_chat(prompt, client=None, model="llama-3.1-8b-instant"):
    """Answer ``prompt`` with the chat model alone (no retrieval).

    Args:
        prompt: User question, sent as a single user message.
        client: Optional chat client exposing ``chat.completions.create``.
            When omitted a ``Groq`` client is created from ``GROQ_API_KEY``;
            pass one in to reuse it across many calls.
        model: Chat model name.

    Returns:
        The model's reply text.
    """
    if client is None:
        client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model=model,
    )
    return chat_completion.choices[0].message.content

In [5]:
import os, csv, json, random, time, re
from groq import Groq

# ---------- CONFIG ----------
# Input CSV; run_eval reads its "question", "reference", "correct_answer" and (optional) "id" columns.
CSV_PATH = "nutrition_eval_dataset.csv"
JUDGE_MODEL = "llama-3.3-70b-versatile"   # or "llama-3.1-8b-instant" if needed
# Sampling temperature passed to the judge call in judge_pair.
TEMPERATURE = 0
# Seconds slept before and after each judge call in run_eval.
PAUSE_BETWEEN_CALLS = 0.3
# ----------------------------

# Module-level Groq client used by judge_pair. Indexing os.environ raises KeyError if GROQ_API_KEY is unset.
client = Groq(api_key=os.environ["GROQ_API_KEY"])

# System message for the judge model: JSON-only output, judged solely from the supplied texts.
JUDGE_SYSTEM = (
    "You are a strict evaluation judge. "
    "Output ONLY valid JSON (no commentary, no code fences). "
    "Judge ONLY using the provided REFERENCE_EXCERPTS and GOLD. "
    "Do NOT use outside knowledge. If evidence is insufficient, score lower."
)

# Scoring instructions appended to the end of every judge prompt by judge_pair.
# The keys listed in the JSON schema below are the ones run_eval reads from the verdict.
JUDGE_RUBRIC = """
You will compare two candidate answers (A and B) to a nutrition QUESTION using REFERENCE_EXCERPTS (authoritative Australian sources) and a short GOLD answer.
Score each answer on integer scales (0–5):
1) accuracy: matches REFERENCE_EXCERPTS/GOLD; contradictions score low
2) relevance: directly answers the QUESTION
3) grounding: claims explicitly supported by the provided texts
4) safety: avoids misleading/harmful guidance; AU context only if supported

Tie-breakers: accuracy > grounding > relevance > safety.

Return STRICT JSON ONLY:
{
  "accuracy_A": 0-5, "accuracy_B": 0-5,
  "relevance_A": 0-5, "relevance_B": 0-5,
  "grounding_A": 0-5, "grounding_B": 0-5,
  "safety_A": 0-5, "safety_B": 0-5,
  "winner": "A"|"B"|"tie",
  "explanation": "1-2 short sentences based ONLY on the provided texts"
}
"""

def judge_pair(question, reference_excerpt, ansA, ansB, model=JUDGE_MODEL):
    """Ask the judge model to compare two candidate answers and return its verdict.

    The prompt contains the question, the reference text, both candidates and
    ``JUDGE_RUBRIC``; it is sent with ``JUDGE_SYSTEM`` as the system message
    through the module-level ``client`` at ``TEMPERATURE``.

    Args:
        question: The question both candidates answer.
        reference_excerpt: Reference text shown to the judge under
            ``REFERENCE_EXCERPTS``.
        ansA: Candidate answer shown as "CANDIDATE A".
        ansB: Candidate answer shown as "CANDIDATE B".
        model: Judge model name.

    Returns:
        The parsed verdict dict. If the reply is empty, is not valid JSON, or
        parses to something other than a JSON object, a fallback dict with all
        scores 0, ``"winner": "tie"`` and a parse-error explanation is
        returned instead, so callers can always use ``.get`` on the result.
    """
    prompt = f"""QUESTION:
{question}

REFERENCE_EXCERPTS:
{reference_excerpt}

CANDIDATE A:
{ansA}

CANDIDATE B:
{ansB}

{JUDGE_RUBRIC}
"""
    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role":"system", "content": JUDGE_SYSTEM},
            {"role":"user", "content": prompt},
        ],
        temperature=TEMPERATURE,
    )
    # The message content may be None; treat that as an empty reply.
    txt = (resp.choices[0].message.content or "").strip()

    # Robust cleanup: strip code fences and extract the JSON object
    # 1) remove leading/trailing backticks if present
    txt = txt.strip("`")
    # 2) take the span from the first "{" to the last "}" (greedy match)
    m = re.search(r"\{.*\}", txt, re.DOTALL)
    if m:
        txt = m.group(0)

    try:
        verdict = json.loads(txt)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        verdict = None

    # A bare JSON scalar or array (e.g. `"tie"`) is not a usable verdict.
    if isinstance(verdict, dict):
        return verdict

    # Neutral tie fallback if parsing fails
    return {
        "accuracy_A": 0, "accuracy_B": 0,
        "relevance_A": 0, "relevance_B": 0,
        "grounding_A": 0, "grounding_B": 0,
        "safety_A": 0, "safety_B": 0,
        "winner": "tie",
        "explanation": "Parse error; counted as tie."
    }

def run_eval(csv_path=CSV_PATH):
    """Compare RAG and baseline answers for every CSV row using the LLM judge.

    For each row the question is answered by ``chat_with_rag`` and
    ``simple_chat``; the two answers are shown to ``judge_pair`` in random
    A/B order, and the judge's winner is mapped back to "rag" / "base" /
    "tie". A per-row line and a summary are printed, and the detailed rows
    are written to ``eval_results_llm_judge.csv``.

    The per-criterion scores are stored under their A/B labels; use the
    ``blind_labelA`` / ``blind_labelB`` columns to tell which system each
    label referred to in that row.

    Args:
        csv_path: CSV file with a ``question`` column and optional
            ``reference``, ``correct_answer`` and ``id`` columns.

    Returns:
        List of per-row result dicts (the same rows written to the CSV).

    Raises:
        KeyError: If the CSV has no ``question`` column.
    """
    # Load dataset
    with open(csv_path, newline="", encoding="utf-8") as f:
        rows = list(csv.DictReader(f))

    results = []
    rag_wins = base_wins = ties = 0

    for i, r in enumerate(rows, start=1):
        # csv.DictReader fills fields missing from a short row with None, so
        # fall back to "" before stripping.
        q = (r["question"] or "").strip()
        ref = (r.get("reference") or "").strip()
        gold = (r.get("correct_answer") or "").strip()

        # Get answers from your two systems
        rag_answer = chat_with_rag(q)
        base_answer = simple_chat(q)

        # Blind order to reduce position bias
        if random.random() < 0.5:
            labelA, ansA, labelB, ansB = "rag", rag_answer, "base", base_answer
        else:
            labelA, ansA, labelB, ansB = "base", base_answer, "rag", rag_answer

        time.sleep(PAUSE_BETWEEN_CALLS)
        # The gold answer travels inside the reference block under a "GOLD:" heading.
        verdict = judge_pair(q, f"{ref}\nGOLD:\n{gold}", ansA, ansB)

        # Map judge winner back to rag/base
        w = verdict.get("winner", "tie")
        winner_sys = "tie"
        if w == "A":
            winner_sys = labelA
        elif w == "B":
            winner_sys = labelB

        if winner_sys == "rag":
            rag_wins += 1
        elif winner_sys == "base":
            base_wins += 1
        else:
            ties += 1

        results.append({
            "id": r.get("id", i),
            "question": q,
            "reference": ref,
            "gold_answer": gold,
            "rag_answer": rag_answer,
            "base_answer": base_answer,
            "winner": winner_sys,
            "accuracy_A": verdict.get("accuracy_A"),
            "accuracy_B": verdict.get("accuracy_B"),
            "relevance_A": verdict.get("relevance_A"),
            "relevance_B": verdict.get("relevance_B"),
            "grounding_A": verdict.get("grounding_A"),
            "grounding_B": verdict.get("grounding_B"),
            "safety_A": verdict.get("safety_A"),
            "safety_B": verdict.get("safety_B"),
            "judge_explanation": verdict.get("explanation", ""),
            "blind_labelA": labelA,
            "blind_labelB": labelB,
        })

        print(f"[{i}/{len(rows)}] Winner: {winner_sys}")
        time.sleep(PAUSE_BETWEEN_CALLS)

    # Summary
    n = len(rows) or 1  # avoid division by zero on an empty CSV
    print("\n=== SUMMARY ===")
    print(f"Total items: {len(rows)}")
    print(f"RAG wins : {rag_wins} ({rag_wins/n*100:.1f}%)")
    print(f"Base wins: {base_wins} ({base_wins/n*100:.1f}%)")
    print(f"Ties     : {ties} ({ties/n*100:.1f}%)")

    # Save detailed results
    out_path = "eval_results_llm_judge.csv"
    fieldnames = list(results[0].keys()) if results else []
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)
    print(f"\nSaved detailed results to: {out_path}")

    return results

# Run the RAG-vs-baseline judge evaluation on the default CSV (CSV_PATH).
# The guard is true when this code runs as the main module, which includes a notebook cell.
if __name__ == "__main__":
    run_eval()

[1/50] Winner: rag
[2/50] Winner: rag
[3/50] Winner: rag
[4/50] Winner: rag
[5/50] Winner: base
[6/50] Winner: tie
[7/50] Winner: rag
[8/50] Winner: rag
[9/50] Winner: base
[10/50] Winner: rag
[11/50] Winner: rag
[12/50] Winner: rag
[13/50] Winner: rag
[14/50] Winner: base
[15/50] Winner: base
[16/50] Winner: rag
[17/50] Winner: rag
[18/50] Winner: base
[19/50] Winner: rag
[20/50] Winner: rag
[21/50] Winner: base
[22/50] Winner: rag
[23/50] Winner: tie
[24/50] Winner: base
[25/50] Winner: rag
[26/50] Winner: tie
[27/50] Winner: rag
[28/50] Winner: tie
[29/50] Winner: rag
[30/50] Winner: base
[31/50] Winner: base
[32/50] Winner: rag
[33/50] Winner: tie
[34/50] Winner: rag
[35/50] Winner: rag
[36/50] Winner: rag
[37/50] Winner: tie
[38/50] Winner: base
[39/50] Winner: rag
[40/50] Winner: base
[41/50] Winner: base
[42/50] Winner: base
[43/50] Winner: rag
[44/50] Winner: rag
[45/50] Winner: base
[46/50] Winner: base
[47/50] Winner: rag
[48/50] Winner: rag
[49/50] Winner: rag
[50/50] Winner

# Add RAG from Crawler

In [None]:
import json

# Load crawler output (one JSON object per line) and add every non-empty
# "sentence" to the retrieval collection.
rag = RAG()

docs = []
with open("document-parser/data/sentences.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank or trailing lines would make json.loads raise.
            continue
        rec = json.loads(line)
        # "sentence" may be absent or null; both count as empty.
        sent = (rec.get("sentence") or "").strip()
        if sent:
            docs.append(sent)

# Uncomment to cap the ingest size while experimenting:
# docs = docs[:1000]

rag.add_documents(docs)