In [1]:
import numpy as np
import chromadb
import os
from groq import Groq
from dotenv import load_dotenv

# Load environment variables via python-dotenv; the cells below read
# CHROMA_API_KEY and GROQ_API_KEY from os.environ.
load_dotenv()

True

In [2]:
class RAG:
    """Small wrapper around a Chroma Cloud collection used as a retrieval store.

    Attributes set by ``__init__``:
        chroma_client: the ``chromadb.CloudClient`` connection.
        collection: the collection returned by ``get_or_create_collection``.
        count: record count read from the collection at construction time;
            ``add_documents`` advances it locally and uses it to mint ids.
    """

    def __init__(self, collection_name="aus_food_nutrition"):
        """Connect to Chroma Cloud and open (or create) ``collection_name``.

        The API key is read from the ``CHROMA_API_KEY`` environment variable.
        """
        self.chroma_client = chromadb.CloudClient(
            tenant='a0123436-2e87-4752-8983-73168aafe2e9',
            database='nutribot',
            api_key=os.environ.get("CHROMA_API_KEY"),
        )
        self.collection = self.chroma_client.get_or_create_collection(name=collection_name)
        self.count = self.collection.count()

    def add_documents(self, docs, batch_size=100):
        """Store ``docs`` in the collection under sequential ``id<N>`` ids.

        Documents are sent in batches of at most ``batch_size`` instead of one
        request per document, so large ingests need far fewer round-trips.

        Args:
            docs: iterable of document strings.
            batch_size: maximum number of documents per ``upsert`` call.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        docs = list(docs)  # allow generators; we need len() and slicing
        for start in range(0, len(docs), batch_size):
            batch = docs[start:start + batch_size]
            ids = [f"id{self.count + offset}" for offset in range(len(batch))]
            self.collection.upsert(ids=ids, documents=batch)
            # Advance only after the batch succeeded so a failed request
            # does not burn ids.
            self.count += len(batch)

    def retrieve(self, prompt, n_results=2):
        """Return the documents matched for ``prompt`` (up to ``n_results``).

        Returns an empty list when the query result carries no documents.
        """
        res = self.collection.query(query_texts=[prompt], n_results=n_results)
        # "documents" may be missing or None; fall back to one empty hit list.
        documents = res.get("documents") or [[]]
        return documents[0]

In [None]:
# Open the default collection and store two hand-written example facts.
# NOTE(review): add_documents mints ids from a running counter, so re-running
# this cell appears to store the same two sentences again under new ids.
rag = RAG()
rag.add_documents([
    "Vegemite is a popular Australian spread made from brewers' yeast extract.",
    "Kangaroo meat is a lean source of protein, low in fat.",
])

In [11]:
# Build the plan-generation prompt, retrieve supporting passages for it, and
# append those passages to the prompt as context.

# Example health analysis handed to the model. The prompt below describes it
# as JSON, so it has to be well-formed: commas between members, percentages
# quoted as strings, and a closing brace for the outer object.
analyzed_health_condition = """
{
    "obesity_prediction": {
        "obesity_level": "Overweight_Level_II",
        "confidence": "10%"
    },
    "diabetes_prediction": {
        "diabetes": true,
        "confidence": "90%"
    }
}
"""

# Output template the model must follow. INT / STRING / ARRAY OF STRINGS are
# placeholders for the model to fill in; the surrounding structure is kept
# consistent (comma after "suggestion", array closed before the object).
weekly_plan_format = """{
"suggestion": STRING,
"weekly_plan": [
    {
        "week": 1,
        "target_calories_per_day": INT,
        "focus": STRING,
        "workouts": [ARRAY OF STRINGS],
        "meal_notes": STRING,
        "reminders": [ARRAY OF STRINGS]
    },
    ... (repeat for as many weeks as appropriate)
]
}"""
prompt = f"""
You are a nutrition and fitness assistant.
Below is an analyzed health condition for a user, expressed in JSON: {analyzed_health_condition}
Your task: Based on the analyzed health condition and using the retrieved knowledge, generate a weekly plan strictly in this JSON format (replace INT and STRING placeholders): {weekly_plan_format} 
"""
# NOTE(review): the whole prompt (instructions and format template included)
# is used as the retrieval query; a shorter query built from the health
# condition alone may retrieve more relevant passages.
relevant_texts = rag.retrieve(prompt, n_results=2)
prompt = prompt + f"\tUse this as context for answering: {relevant_texts}"
print("Retrieved texts:", relevant_texts)

Retrieved texts: ['The Australian Dietary Guidelines advise reducing the intake of processed foods and sugary drinks.', 'A balanced diet, as recommended by Australian Dietary Guidelines, includes moderate portions of protein and whole grains.']


In [12]:
# Send the context-augmented prompt to Groq as a single user message and
# print the model's reply. The API key is read from GROQ_API_KEY.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model="llama-3.1-8b-instant",
)

# Only the first choice's text is shown.
print(chat_completion.choices[0].message.content)

Based on the given health condition and the provided knowledge, I'll generate a weekly plan for the user. 

Given the user's obesity level is categorized as Overweight_Level_II with 10% confidence in obesity prediction, and the user has diabetes with 90% confidence, here's a suggested weekly plan:

```json
{
  "suggestion": "Maintain a strict diet and regular exercise to manage your diabetes and work towards healthy weight management.",
  "weekly_plan": [
    {
      "week": 1,
      "target_calories_per_day": 1700,
      "focus": "Balancing macronutrients (proteins, healthy fats, complex carbohydrates)",
      "workouts": [
        "Monday: 30 minutes of brisk walking",
        "Wednesday: Bodyweight exercises (20 reps of push-ups, squats, lunges)",
        "Friday: 20 minutes of cycling"
      ],
      "meal_notes": "Eat frequent, balanced meals with portion control. Include whole grains, lean proteins, and plenty of vegetables and fruits.",
      "reminders": [
        "Drink at lea

# GPQA Evaluation

GPQA was introduced in November 2023.
### Model Cutoff
**llama 3.3 70b**: December 2023 (note: the evaluation code below defaults to `llama-3.1-8b-instant`)

In [37]:
from datasets import load_dataset
import random
from huggingface_hub import login
# Interactive Hugging Face login (renders a token widget).
# NOTE(review): presumably needed because the GPQA dataset is gated — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [38]:
# Load the "train" split of the GPQA diamond configuration for inspection.
# run_eval below loads its own copy and does not read this global.
dataset = load_dataset("Idavidrein/gpqa", "gpqa_diamond")["train"]

In [57]:
import os
import random
import re
from groq import Groq
from datasets import load_dataset

class APIUnavailable(RuntimeError):
    """Raised by evaluate_gpqa when the model request or reply handling fails."""

def _extract_item(example):
    # Try revised/capitalized keys first; fall back to HF schema.
    q = example.get("Question") or example.get("question")
    ca = example.get("Correct Answer") or example.get("correct_answer")
    inc = [
        example.get("Incorrect Answer 1"),
        example.get("Incorrect Answer 2"),
        example.get("Incorrect Answer 3"),
    ]
    if not any(inc) and "incorrect_answers" in example:
        inc = example["incorrect_answers"]
    inc = [x for x in (s.strip() if isinstance(s, str) else s for s in inc) if x]
    return q, ca, inc

def evaluate_gpqa(question, options, model="llama-3.1-8b-instant", client=None, temperature=0.0):
    """Ask the model one multiple-choice question and return the chosen letter.

    Args:
        question: question text.
        options: answer options; they are labelled A, B, C, ... in order.
        model: Groq model name.
        client: optional pre-built client exposing
            ``chat.completions.create``. When omitted, a ``Groq`` client is
            created from the ``GROQ_API_KEY`` environment variable. Passing
            one avoids building a new client for every question.
        temperature: sampling temperature sent with the request; 0 keeps
            benchmark runs as repeatable as the API allows.

    Returns:
        The upper-case letter of the selected option, or ``None`` when the
        reply does not contain ``@@ANSWER=<LETTER>@@`` at the end of a line
        or the letter does not correspond to one of ``options``.

    Raises:
        APIUnavailable: if the request or reading the reply fails.
    """
    options = list(options)
    if client is None:
        client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

    opts_str = "\n".join(f"{chr(65 + i)}. {opt}" for i, opt in enumerate(options))
    # No trailing punctuation after the options: it would be glued onto the
    # text of the last option.
    prompt = (
        "You are answering a multiple-choice question. Choose the correct answer "
        "from the options below:\n\n"
        f"Question: {question}\nOptions:\n{opts_str}"
    )
    try:
        resp = client.chat.completions.create(
            messages=[
                {"role":"system","content":"You are a multiple-choice grader. Return the answer in EXACTLY this format and nothing else:\n@@ANSWER=<LETTER>@@"},
                {"role": "user", "content": prompt}
            ],
            model=model,
            temperature=temperature,
        )
        ans = (resp.choices[0].message.content or "").strip()
    except Exception as e:
        print(f"API error: {e}")
        raise APIUnavailable(str(e)) from e

    # Case-insensitive so a lower-case letter is accepted and normalised.
    m = re.search(r"@@ANSWER=([A-Z])@@\s*$", ans, flags=re.MULTILINE | re.IGNORECASE)
    if m is None:
        return None
    letter = m.group(1).upper()
    # Reject letters that do not label any of the supplied options.
    return letter if ord(letter) - 65 < len(options) else None

def run_eval(num_questions, seed=None):
    """Evaluate the model on the first ``num_questions`` GPQA diamond items.

    For each usable item the correct answer and three incorrect answers are
    shuffled, the model is queried through ``evaluate_gpqa``, and a
    per-question line plus a final score are printed.

    Args:
        num_questions: upper bound on the number of dataset rows to visit.
            Rows lacking a question, a correct answer or three incorrect
            answers are skipped and do not count as attempted.
        seed: optional seed for the option shuffle. With a seed the option
            order (and therefore the "True" letters) is the same on every
            run; without one the global ``random`` state is used.
    """
    dataset = load_dataset("Idavidrein/gpqa", "gpqa_diamond")["train"]
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    correct = 0
    attempted = 0

    for i in range(min(num_questions, len(dataset))):
        q, ca, inc = _extract_item(dataset[i])
        if not (q and ca and len(inc) >= 3):
            continue

        options = inc[:3] + [ca]
        shuffle(options)
        true = chr(65 + options.index(ca))

        try:
            pred = evaluate_gpqa(q, options)
        except APIUnavailable:
            # Say so explicitly: the score printed below is then partial.
            print(f"Stopping early at Q{i+1}: API unavailable.")
            break

        attempted += 1
        is_correct = pred == true
        if is_correct:
            correct += 1

        print(f"Q{i+1}: {'✓' if is_correct else '✗'}  Pred={pred or '-'}  True={true}")

    if attempted == 0:
        print("No questions evaluated.")
    else:
        pct = 100.0 * correct / attempted
        print(f"\nFinal Score: {correct}/{attempted} ({pct:.1f}%)")

# Run the evaluation on up to 100 questions when this code is executed as the
# main module.
if __name__ == "__main__":
    run_eval(100)

Q1: ✗  Pred=B  True=D
Q2: ✗  Pred=D  True=C
Q3: ✗  Pred=B  True=C
Q4: ✗  Pred=A  True=D
Q5: ✗  Pred=D  True=C
Q6: ✓  Pred=D  True=D
Q7: ✗  Pred=B  True=A
Q8: ✗  Pred=A  True=C
Q9: ✓  Pred=B  True=B
Q10: ✗  Pred=D  True=C
Q11: ✗  Pred=C  True=A
Q12: ✗  Pred=B  True=A
Q13: ✗  Pred=A  True=D
Q14: ✗  Pred=D  True=C
Q15: ✗  Pred=B  True=D
Q16: ✓  Pred=B  True=B
Q17: ✗  Pred=C  True=B
Q18: ✗  Pred=C  True=A
Q19: ✗  Pred=C  True=D
Q20: ✓  Pred=C  True=C
Q21: ✗  Pred=D  True=B
Q22: ✗  Pred=B  True=D
Q23: ✗  Pred=B  True=A
Q24: ✓  Pred=D  True=D
Q25: ✗  Pred=C  True=A
Q26: ✗  Pred=B  True=C
Q27: ✗  Pred=B  True=D
Q28: ✗  Pred=D  True=C
Q29: ✓  Pred=D  True=D
Q30: ✗  Pred=B  True=D
Q31: ✓  Pred=C  True=C
Q32: ✗  Pred=B  True=C
Q33: ✓  Pred=A  True=A
Q34: ✗  Pred=C  True=D
Q35: ✓  Pred=D  True=D
Q36: ✗  Pred=B  True=D
Q37: ✗  Pred=D  True=A
Q38: ✗  Pred=C  True=D
Q39: ✓  Pred=A  True=A
Q40: ✗  Pred=B  True=D
Q41: ✗  Pred=B  True=C
Q42: ✓  Pred=B  True=B
Q43: ✗  Pred=C  True=A
Q44: ✓  Pred=D  True

# Customized QA Evaluation

In [None]:
def chat_with_rag(prompt, rag=None, client=None, n_results=5, model="llama-3.1-8b-instant"):
    """Answer ``prompt`` with retrieved passages appended as context.

    Args:
        prompt: user question.
        rag: optional retriever exposing ``retrieve(prompt, n_results=...)``.
            When omitted a new ``RAG()`` is created, which opens a cloud
            connection; pass an existing instance when calling repeatedly.
        client: optional client exposing ``chat.completions.create``. When
            omitted a ``Groq`` client is created from ``GROQ_API_KEY``.
        n_results: number of passages to retrieve.
        model: Groq model name.

    Returns:
        The reply text of the first choice, or ``""`` if the reply has no
        text (so callers can safely concatenate the result).
    """
    if rag is None:
        rag = RAG()
    if client is None:
        client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

    relevant_texts = rag.retrieve(prompt, n_results=n_results)
    augmented_prompt = prompt + f"\tUse this as context for answering: {relevant_texts}"
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": augmented_prompt,
            }
        ],
        model=model,
    )
    return chat_completion.choices[0].message.content or ""

def simple_chat(prompt, client=None, model="llama-3.1-8b-instant"):
    """Send ``prompt`` to the model as a single user message, without retrieval.

    Args:
        prompt: user question.
        client: optional client exposing ``chat.completions.create``. When
            omitted a ``Groq`` client is created from ``GROQ_API_KEY``; pass
            an existing client when calling repeatedly.
        model: Groq model name.

    Returns:
        The reply text of the first choice, or ``""`` if the reply has no
        text (so callers can safely concatenate the result).
    """
    if client is None:
        client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model=model,
    )
    return chat_completion.choices[0].message.content or ""

Q: What proportion of Australian children aged 2 to 17 do not eat enough vegetables?\
A: 94% of children aged 2 to 17 don’t eat enough vegetables.

Q: Roughly what proportion of Australian adults eat enough fruit and vegetables?\
A: Only 1 in 13 adults eat enough fruit and vegetables.

Q: About what share of Australia’s total energy intake comes from foods we don’t need (discretionary foods)?\
A: About one-third of our energy intake comes from food we don’t need.

Q: Where on the department website can I read about getting the nutrients I need, at any age?\
A: On the Food and nutrition topic page under “Eating what your body needs.”

Q: Which section covers actions to reduce harmful ingredients in processed foods?\
A: The “Reducing harmful ingredients” section in the Food and nutrition topic.

Q: Name one 2025 resource mentioned on the Food and nutrition page related to the Healthy Food Partnership.\
A: Healthy Food Partnership Program – Executive Committee communique – February 2025.

In [15]:
# Ask the same six questions with and without retrieval so the answers can be
# compared side by side. The questions live in one list so both runs are
# guaranteed to use identical wording.
qa_questions = [
    "What proportion of Australian children aged 2 to 17 do not eat enough vegetables?",
    "Roughly what proportion of Australian adults eat enough fruit and vegetables?",
    "About what share of Australia’s total energy intake comes from foods we don’t need (discretionary foods)?",
    "Where on the department website can I read about getting the nutrients I need, at any age?",
    "Which section covers actions to reduce harmful ingredients in processed foods?",
    "Name one 2025 resource mentioned on the Food and nutrition page related to the Healthy Food Partnership.",
]

for header, answer_fn in (
    ("---------- RAG response ----------", chat_with_rag),
    ("\n---------- Non RAG response ----------", simple_chat),
):
    print(header)
    for number, question in enumerate(qa_questions, start=1):
        # " Keep concise" is appended to every question to keep replies short.
        print(f"-> Q{number}:" + answer_fn(f"{question} Keep concise"))

---------- RAG response ----------
-> Q1:According to the Australian Health Survey and the context provided, 94% of children aged 2 to 17 in Australia do not eat enough vegetables.
-> Q2:According to the provided information, only 1 in 13 (approximately 7.69%) Australian adults eat enough fruit and vegetables.
-> Q3:About 25-33% of Australia's total energy intake comes from discretionary foods.
-> Q4:On the department website, you can read about getting the nutrients you need at any age by visiting the page that discusses detailed information about individual nutrients.
-> Q5:"Reducing harmful ingredients from processed and manufactured foods."
-> Q6:Unfortunately, I have no access to the 2025 resource related to the Healthy Food Partnership mentioned. However, one 2024 resource, isn't related to the question so I will look for a 2024 or 2023 resource mentioned:

However, in December 2023,  a resource related to the Healthy Food Partnership was mentioned - 

Healthy Food Partnership Pr

# Add RAG from Crawler

In [None]:
import json

# Ingest crawled sentences into the default collection.
# NOTE(review): add_documents mints ids from a running counter, so re-running
# this cell appears to store every sentence again under new ids.
rag = RAG()

# Each line of the file is parsed as one JSON object; the text is read from
# its "sentence" field (missing field -> empty string).
docs = []
with open("document-parser/data/sentences.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        sent = rec.get("sentence", "").strip()
        # Skip records whose sentence is empty after stripping whitespace.
        if sent:
            docs.append(sent)

# Uncomment to ingest only the first 1000 sentences:
# docs = docs[:1000]

rag.add_documents(docs)