In [None]:
!pip install pymupdf langchain langchain-community sentence-transformers chromadb bs4

Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting chromadb
  Downloading chromadb-1.3.7-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.3-cp3

In [None]:
# Show the installed LangChain version; the bare last expression is displayed as the cell output.
import langchain
langchain.__version__

'1.1.3'

In [None]:
pip install langchain-google-genai

Collecting langchain-google-genai
  Downloading langchain_google_genai-4.1.1-py3-none-any.whl.metadata (2.7 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading langchain_google_genai-4.1.1-py3-none-any.whl (65 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Installing collected packages: filetype, langchain-google-genai
Successfully installed filetype-1.2.0 langchain-google-genai-4.1.1


In [None]:
# Read the Gemini API key from a local file so the secret is not hardcoded in the notebook.
# The context manager closes the file handle, and strip() removes surrounding whitespace
# (typically a trailing newline) that would otherwise become part of the key.
with open("/content/gemini_key2.txt", "r") as key_file:
    api_key = key_file.read().strip()

# **STEP 1: Imports**

In [None]:
import os
import requests
import urllib3
from bs4 import BeautifulSoup
import re

from langchain_core.documents import Document
from langchain_community.document_loaders import PyMuPDFLoader
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)



# **STEP 2: Load all PDFs**

In [None]:
# Collect every PDF in the data folder. os.listdir() returns entries in arbitrary order,
# so the names are sorted to keep the document (and later chunk) order stable across runs.
pdf_folder = "data/pdfs/"
pdf_files = [
    os.path.join(pdf_folder, name)
    for name in sorted(os.listdir(pdf_folder))
    if name.endswith(".pdf")
]

pdf_docs = []

for file in pdf_files:
    loader = PyMuPDFLoader(file)
    docs = loader.load()      # each page becomes a Document
    for d in docs:
        # Store only the file name (no directory) as the document's source.
        d.metadata["source"] = os.path.basename(file)
    pdf_docs.extend(docs)

print("PDF pages loaded:", len(pdf_docs))

PDF pages loaded: 502


# **STEP 3: Scrape web pages**

In [None]:
# Web pages to scrape in addition to the PDFs: WHO fact sheets, Diabetes Canada
# healthy-eating pages, and one Harvard Nutrition Source article.
# NOTE(review): the Harvard URL ends with "/", so its last path segment is an empty string.
urls = [
    "https://www.who.int/news-room/fact-sheets/detail/healthy-diet",
    "https://www.who.int/news-room/fact-sheets/detail/food-safety",
    "https://www.diabetes.ca/nutrition-fitness/healthy-eating/healthy-eating-tips",
    "https://www.diabetes.ca/nutrition-fitness/healthy-eating/planning-healthy-meals",
    "https://www.diabetes.ca/nutrition-fitness/healthy-eating/carb-counting",
    "https://www.diabetes.ca/nutrition-fitness/healthy-eating/sugars-and-sweeteners",
    "https://nutritionsource.hsph.harvard.edu/2023/07/17/who-updated-guidelines-healthy-diets-total-fat/",
    "https://www.who.int/news-room/fact-sheets/detail/obesity-and-overweight"
]

In [None]:
# HTTP request headers: a desktop-browser User-Agent and an English Accept-Language.
# NOTE(review): presumably needed because some sites reject the default client User-Agent — confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

In [None]:
# Fetch one web page and reduce it to clean, line-oriented text.
def scrape_and_clean(url):
    """Download ``url`` and return its main textual content.

    The page is requested with the module-level ``headers``. Non-content tags
    are removed, then the text of the first of ``<main>``, ``<article>`` or
    ``<body>`` that exists (falling back to the whole document) is extracted.
    ``m:ss`` / ``mm:ss``-shaped tokens and blank lines are stripped from it.

    Args:
        url: Address of the page to scrape.

    Returns:
        The cleaned page text with one non-empty line per text fragment.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP 4xx/5xx response.
    """
    print(f"🔍 Scraping: {url}")

    # NOTE(review): verify=False turns off TLS certificate verification, so the
    # content is not protected against tampering in transit — confirm it is needed.
    response = requests.get(url, headers=headers, timeout=20, verify=False)
    # Fail on 4xx/5xx instead of treating an error page's text as guideline content.
    response.raise_for_status()
    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    # Remove noise
    for tag in soup(["script", "style", "header", "footer", "nav", "button", "form", "svg", "img"]):
        tag.decompose()

    # Try best main content
    main = soup.find("main") or soup.find("article") or soup.find("body") or soup

    text = main.get_text(separator="\n", strip=True)

    # Remove timestamps (video transcripts).
    # NOTE(review): this also removes any other "d:dd"-shaped token, e.g. a clock time.
    text = re.sub(r"\b\d{1,2}:\d{2}\b", "", text)

    # Remove blank lines
    text = "\n".join([line for line in text.split("\n") if line.strip()])

    return text


In [None]:
# Scrape every URL into a Document; a failing URL is reported and skipped.
web_docs = []

for url in urls:
    try:
        cleaned_text = scrape_and_clean(url)

        # Name the document after the last non-empty path segment. Stripping the trailing
        # slash first keeps URLs that end in "/" from all collapsing to "index".
        filename = url.rstrip("/").split("/")[-1] or "index"
        filename = filename.replace("-", "_")

        web_docs.append(
            Document(
                page_content=cleaned_text,
                metadata={"source": filename, "type": "web"}
            )
        )

        print(f"Added web document: {filename}")

    except Exception as e:
        print(f"Error scraping {url}: {e}")

# Keep a plain-text copy of each scraped page; create the target folder if it is missing.
web_dir = "data/web"
os.makedirs(web_dir, exist_ok=True)

for doc in web_docs:
    filename = doc.metadata["source"] + ".txt"
    filepath = os.path.join(web_dir, filename)

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(doc.page_content)

    print(f"📄 Saved: {filepath}")

🔍 Scraping: https://www.who.int/news-room/fact-sheets/detail/healthy-diet
Added web document: healthy_diet
🔍 Scraping: https://www.who.int/news-room/fact-sheets/detail/food-safety
Added web document: food_safety
🔍 Scraping: https://www.diabetes.ca/nutrition-fitness/healthy-eating/healthy-eating-tips
Added web document: healthy_eating_tips
🔍 Scraping: https://www.diabetes.ca/nutrition-fitness/healthy-eating/planning-healthy-meals
Added web document: planning_healthy_meals
🔍 Scraping: https://www.diabetes.ca/nutrition-fitness/healthy-eating/carb-counting
Added web document: carb_counting
🔍 Scraping: https://www.diabetes.ca/nutrition-fitness/healthy-eating/sugars-and-sweeteners
Added web document: sugars_and_sweeteners
🔍 Scraping: https://nutritionsource.hsph.harvard.edu/2023/07/17/who-updated-guidelines-healthy-diets-total-fat/
Added web document: index
🔍 Scraping: https://www.who.int/news-room/fact-sheets/detail/obesity-and-overweight
Added web document: obesity_and_overweight
📄 Saved: 

In [None]:
# Report how many web pages were scraped successfully.
print("Web documents loaded:", len(web_docs))

Web documents loaded: 8


# **Preprocessing**

In [None]:
# Merge the PDF pages and the scraped web pages into one corpus (PDF pages first).
all_docs = [*pdf_docs, *web_docs]
print("Total combined docs:", len(all_docs))

Total combined docs: 510


In [None]:
def clean_text(text):
    """Normalize whitespace in a document's text.

    Non-breaking spaces become regular spaces, every run of two or more
    consecutive newlines collapses to a single newline, and leading and
    trailing whitespace is removed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    text = text.replace("\xa0", " ")
    # A single str.replace("\n\n", "\n") pass leaves "\n\n" behind for runs of three or
    # more newlines; the regex collapses a run of any length in one step.
    text = re.sub(r"\n{2,}", "\n", text)
    return text.strip()

# Normalize the text of every document in place.
for doc in all_docs:
    doc.page_content = clean_text(doc.page_content)

In [None]:
# Chunking: split every document into overlapping pieces before embedding.
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150,
    # Separators are tried in order. The trailing " " and "" fallbacks let the splitter
    # still cut a piece that contains none of the earlier separators, so such a piece
    # cannot end up longer than chunk_size.
    separators=["\n\n", "\n", ".", ",", " ", ""]
)

chunks = splitter.split_documents(all_docs)

print("Number of chunks:", len(chunks))


Number of chunks: 2060


In [None]:
# Embeddings: sentence-transformers model used to embed both chunks and queries.
# NOTE(review): the recorded output shows a warning raised for the constructor line below —
# presumably a deprecation notice for this langchain_community class; confirm and migrate.

from langchain_community.embeddings import HuggingFaceEmbeddings

embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Sanity check: embed one query and display the length of the resulting vector.
embedding = embed_model.embed_query("What is a healthy diet?")
len(embedding)

384

In [None]:
from langchain_community.vectorstores import Chroma

# Embed every chunk and store it in a Chroma collection kept on disk under "nutrition_db".
# NOTE(review): re-running this cell against an existing "nutrition_db" folder presumably adds
# the same chunks a second time — confirm, and remove the folder before rebuilding if so.
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embed_model,
    persist_directory="nutrition_db"
)

# No explicit vectordb.persist(): with a persist_directory, current Chroma versions write to
# disk automatically and the manual call only emits a deprecation warning.
print("Vector DB created and stored!")


Vector DB created and stored!


  vectordb.persist()


In [None]:
# Retriever over the vector store: similarity search with k set to 10.
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs=dict(k=10))

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
import os

# Publish the key through the GOOGLE_API_KEY environment variable, then build the chat model.
os.environ.update({"GOOGLE_API_KEY": api_key})

llm = ChatGoogleGenerativeAI(temperature=0.5, model="gemini-2.5-flash")

In [None]:
from langchain_core.prompts import PromptTemplate

# Instruction template for the model. It has two placeholders: {context} and {question}.
# It also fixes the exact refusal sentence the model should use when the context is not relevant.
prompt_template = """
You are a Nutrition Assistant trained on verified guidelines from:
- WHO (World Health Organization)
- USDA Dietary Guidelines for Americans
- Harvard School of Public Health
- Diabetes Canada

You MUST use the information provided in the context.
However, you are allowed to combine and summarize overlapping dietary principles
(even if the text does not explicitly mention WHO or USDA).

Do NOT invent facts.
Do NOT provide disease-specific diet plans.

If the context truly contains no relevant nutrition guidance, say:
"I don't have verified guideline information about this topic in the provided documents."

--------------------
CONTEXT:
{context}
--------------------

QUESTION:
{question}

Provide a concise answer based on the verified guidelines:
"""

# Wrap the template so it can be formatted directly or composed into a chain.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template
)


In [None]:
def expand_query(q):
    """Return ``q`` followed by a fixed string of general nutrition keywords."""
    suffix = " nutrition healthy diet guidelines fruits vegetables sugar salt fat recommendations"
    return "".join([q, suffix])

In [None]:
from langchain_core.runnables import RunnableLambda
from langchain_core.output_parsers import StrOutputParser
def format_docs(docs):
    """Join the ``page_content`` of every document into one string, separated by blank lines."""
    contents = (doc.page_content for doc in docs)
    return "\n\n".join(contents)

# RAG pipeline. The same input question feeds two branches:
#   context  -> expanded query -> retriever -> documents joined into one string
#   question -> passed through unchanged
# The two results fill the prompt, which is then sent to the chat model.
rag_chain = (
    RunnableParallel(
        context=RunnableLambda(expand_query) | retriever | RunnableLambda(format_docs),
        question=RunnablePassthrough(),
    )
    | prompt
    | llm
)


In [None]:
response = rag_chain.invoke("What does WHO recommend for a healthy diet?")
# The chain ends at the chat model, so the result is a message object; print its text only.
print(response.content)

Based on the provided guidelines, the WHO recommends a healthy diet with particular attention to:

*   **Carbohydrates**
*   **Total fat, saturated fat, and trans fats**
*   **Added sugars**
*   **Sodium (salt)**
*   **Non-sugar sweeteners**

It also recommends increasing fruit and vegetable consumption, maintaining a healthy/appropriate weight, and reducing overall salt and fat intake.


In [None]:
# 1. Get retrieved docs
docs = retriever.invoke("What does WHO recommend for a healthy diet?")
print(docs)

[Document(metadata={'file_path': 'data/pdfs/2.-healthy-diet-24-04-19.pdf', 'creationDate': 'D:20230803083011Z', 'source': '2.-healthy-diet-24-04-19.pdf', 'trapped': '', 'producer': 'GPL Ghostscript 9.53.3', 'creator': 'CorelDRAW X5', 'modDate': "D:20230807162951+05'30'", 'total_pages': 25, 'page': 0, 'creationdate': '2023-08-03T08:30:11+00:00', 'title': '2. Healthy Diet-24-04-19.cdr', 'format': 'PDF 1.7', 'moddate': '2023-08-07T16:29:51+05:30', 'keywords': '', 'author': 'Ramchandra', 'subject': ''}, page_content='Healthy diet'), Document(metadata={'modDate': "D:20201227211603-05'00'", 'subject': 'Dietary Guidelines for Americans, 2020-2025', 'creator': 'Adobe InDesign 16.0 (Macintosh)', 'page': 25, 'creationDate': "D:20201223111339-05'00'", 'source': 'Dietary_Guidelines_for_Americans_2020-2025.pdf', 'keywords': 'Dietary Guidelines for Americans, 2020-2025', 'moddate': '2020-12-27T21:16:03-05:00', 'file_path': 'Dietary_Guidelines_for_Americans_2020-2025.pdf', 'total_pages': 164, 'format

In [None]:
# 2. Convert docs to text using format_docs
combined = format_docs(docs)
print("\n--- COMBINED TEXT (first 500 chars) ---\n")
print(combined)


--- COMBINED TEXT (first 500 chars) ---

Healthy diet

and beverage choices over time. More information is available at MyPlate.gov. Following a healthy dietary pattern from birth through older adulthood can have a profound impact on a person’s lifelong health. The Dietary Guidelines provides the framework for following such a pattern. However, broad and multisector collaboration is needed to help people achieve that goal. Action on many fronts is needed to ensure that healthy dietary choices at home, school, work, and play are the affordable, accessible norm. Everyone has a role to play in helping all Americans shift to a healthy dietary pattern and achieve better health. Try the MyPlate Plan A healthy eating routine is important at every stage of life and can have positive effects that add up over time. It’s important to eat a variety of fruits, vegetables, grains, dairy or fortified soy alternatives, and protein foods. When deciding what

and beverage choices over time. More inform

In [None]:
# 3. Build prompt input manually
prompt_input = prompt.format(
    question="can you recommend for a healthy diet?",
    context=combined
)

print("\n--- FINAL PROMPT SENT TO LLM (first 500 chars) ---\n")
print(prompt_input)


--- FINAL PROMPT SENT TO LLM (first 500 chars) ---


You are a helpful Nutrition Assistant trained only on WHO, USDA, Harvard and Diabetes Canada guidelines.

Use ONLY the following context to answer the question.
If the answer is not found in the context, say:
"I don't have information about this in the provided dietary guidelines."

⚠️ Do NOT provide medical or disease-specific diet plans.

Context:
Healthy diet

and beverage choices over time. More information is available at MyPlate.gov. Following a healthy dietary pattern from birth through older adulthood can have a profound impact on a person’s lifelong health. The Dietary Guidelines provides the framework for following such a pattern. However, broad and multisector collaboration is needed to help people achieve that goal. Action on many fronts is needed to ensure that healthy dietary choices at home, school, work, and play are the affordable, accessible norm. Everyone has a role to play in helping all Americans shift to a heal

In [None]:
# Send the manually built prompt straight to the model (bypassing rag_chain) and print the reply text.
raw_answer = llm.invoke(prompt_input)
print("\n--- LLM RAW ANSWER ---\n")
print(raw_answer.content)


--- LLM RAW ANSWER ---

For a healthy diet, it's important to:
*   Eat a variety of fruits, vegetables, grains, dairy or fortified soy alternatives, and protein foods.
*   Make nutrient-dense choices, such as:
    *   Plain shredded wheat
    *   Plain, low-fat yogurt with fruit
    *   Low-sodium black beans
    *   Vegetable oil
    *   Sparkling water

Following a healthy dietary pattern can have a profound impact on lifelong health, and a healthy eating routine is important at every stage of life. More information is available at MyPlate.gov.


# **Metrics**

In [None]:
# Evaluation set: each question is paired by position with the keyword list at the same index.
eval_questions = [
    "What does WHO recommend for salt intake?",
    "How much free sugar should be consumed daily?",
    "How much fruit and vegetables should people eat?",
    "What does USDA say about saturated fat intake?",
]

# Keywords expected in a good retrieval/answer for the question at the same index.
# NOTE(review): keep both lists the same length — zip() silently stops at the shorter one.
gold_keywords = [
    ["5 g", "salt", "sodium"],
    ["<10%", "sugar", "5%"],
    ["400 g", "fruits", "vegetables"],
    ["10%", "saturated fat"],
]

In [None]:
from sklearn.metrics import precision_score, recall_score

def evaluate_retrieval():
    """Print a keyword-based Recall@K for the retriever.

    For every question in ``eval_questions`` the retrieved documents are joined
    into one string. The question counts as a hit when at least one of its
    ``gold_keywords`` occurs in that string, compared case-insensitively.
    The printed score is the fraction of questions that were hits, or 0.0 when
    there are no questions.

    NOTE(review): one generic keyword (e.g. "salt") is enough for a hit, so this
    is a lenient measure. Queries are also sent to the retriever unexpanded,
    unlike in ``rag_chain`` — confirm that both are intended.
    """
    hits = []

    for question, keywords in zip(eval_questions, gold_keywords):
        docs = retriever.invoke(question)
        # Lowercase the retrieved text once per question instead of once per keyword.
        text = " ".join(d.page_content for d in docs).lower()
        hits.append(any(kw.lower() in text for kw in keywords))

    # Guard against ZeroDivisionError when the evaluation set is empty.
    recall_at_k = sum(hits) / len(hits) if hits else 0.0
    print("Recall@K:", recall_at_k)

In [None]:
# Run the retrieval check; the score is printed by the function.
evaluate_retrieval()

Recall@K: 1.0


In [None]:
def evaluate_answer_quality():
    """Print simple keyword-based quality flags for each evaluation question.

    For every question in ``eval_questions`` the RAG chain is invoked and its
    lowercased answer is checked twice:

    * Relevance: at least one of the question's ``gold_keywords`` occurs in it.
    * Faithfulness: the answer is not a refusal, i.e. it contains neither the
      opening of the refusal sentence prescribed by the prompt nor "i don't know".

    The flags and the first 200 characters of the answer are printed.
    """
    # Lowercased openings of a refusal. The first is the start of the sentence the prompt
    # tells the model to use; the previous check only looked for "i don't know", which the
    # prompt never asks for, so a refusal was still reported as faithful.
    refusal_markers = (
        "i don't have verified guideline information",
        "i don't know",
    )

    for q, keywords in zip(eval_questions, gold_keywords):
        answer = rag_chain.invoke(q).content.lower()

        relevance = any(kw.lower() in answer for kw in keywords)
        # Treat a typographic apostrophe like a plain one for matching only.
        comparable = answer.replace("\u2019", "'")
        faithfulness = not any(marker in comparable for marker in refusal_markers)

        print("\nQ:", q)
        print("Relevance:", relevance)
        print("Faithfulness:", faithfulness)
        print("Answer:", answer[:200], "...")


In [None]:
# Run the answer check; this invokes the RAG chain once per evaluation question.
evaluate_answer_quality()


Q: What does WHO recommend for salt intake?
Relevance: True
Faithfulness: True
Answer: based on the provided guidelines:

the who guideline on sodium intake for adults and children aims to provide recommendations for the consumption of sodium. it provides global, evidence-informed recom ...

Q: How much free sugar should be consumed daily?
Relevance: True
Faithfulness: True
Answer: based on verified guidelines:

adults should consume less than 10% of total energy intake from free sugars. for a person consuming 2000 calories per day, this is equivalent to about 50g (or 12 level t ...

Q: How much fruit and vegetables should people eat?
Relevance: True
Faithfulness: True
Answer: according to the dietary guidelines for americans (2020-2025) for a 2,000-calorie level, individuals should consume 2 ½ cup equivalents of vegetables and 2 cup equivalents of fruits daily. ...

Q: What does USDA say about saturated fat intake?
Relevance: True
Faithfulness: True
Answer: based on the dietary guide

In [None]:
import time

# Time one end-to-end chain call. perf_counter() is a monotonic, high-resolution clock meant
# for measuring elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
_ = rag_chain.invoke("What is a healthy diet?")
end = time.perf_counter()

print("Response time:", round(end - start, 2), "seconds")


Response time: 5.73 seconds


In [None]:
response = rag_chain.invoke("What does WHO recommend for fats?")
# Print the whole message (content, response_metadata, usage_metadata). The message object
# has no `meta` attribute, so `response.meta` fails instead of showing this.
print(response)


content="I don't have verified guideline information about specific fat recommendations from the WHO in the provided documents. The context states that WHO has released updated guidelines for defining healthy diets, with particular attention to total fat and specific types of fat such as saturated and trans fats, and that these guidelines should be used in conjunction with other nutrient guidelines related to fats. However, it does not detail the specific recommendations." additional_kwargs={} response_metadata={'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash', 'safety_ratings': [], 'model_provider': 'google_genai'} id='lc_run--019b2ed7-19c4-7423-9602-3ffd0dbce441-0' usage_metadata={'input_tokens': 1665, 'output_tokens': 2513, 'total_tokens': 4178, 'input_token_details': {'cache_read': 1011}, 'output_token_details': {'reasoning': 2434}}


In [None]:
# Archive the persisted vector store folder into a single zip file.
!zip -r nutrition_db.zip nutrition_db

  adding: nutrition_db/ (stored 0%)
  adding: nutrition_db/aecfb561-424f-4961-9944-ac37a8e9a47d/ (stored 0%)
  adding: nutrition_db/aecfb561-424f-4961-9944-ac37a8e9a47d/link_lists.bin (deflated 84%)
  adding: nutrition_db/aecfb561-424f-4961-9944-ac37a8e9a47d/data_level0.bin (deflated 12%)
  adding: nutrition_db/aecfb561-424f-4961-9944-ac37a8e9a47d/length.bin (deflated 81%)
  adding: nutrition_db/aecfb561-424f-4961-9944-ac37a8e9a47d/header.bin (deflated 58%)
  adding: nutrition_db/aecfb561-424f-4961-9944-ac37a8e9a47d/index_metadata.pickle (deflated 46%)
  adding: nutrition_db/chroma.sqlite3 (deflated 56%)


In [None]:
# Download the zipped vector store through Colab's file helper (Colab-only).
from google.colab import files
files.download("nutrition_db.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Write the pip freeze listing for this runtime to requirements_colab.txt.
!pip freeze > requirements_colab.txt


In [None]:
# Download the full pip freeze listing through Colab's file helper (Colab-only).
from google.colab import files
files.download("requirements_colab.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# importlib.metadata is the standard-library replacement for the deprecated pkg_resources.
from importlib import metadata

# Direct dependencies to record in the requirements file.
required_packages = [
    "streamlit",
    "langchain",
    "langchain-core",
    "langchain-community",
    "langchain-google-genai",
    "chromadb",
    "pymupdf",
    "sentence-transformers",
    "beautifulsoup4",
    "tiktoken",
    "requests",
]

# Listed packages that are not installed in this runtime and therefore cannot be pinned.
missing = []

with open("requirements_clean.txt", "w") as f:
    for pkg in required_packages:
        try:
            installed_version = metadata.version(pkg)
        except metadata.PackageNotFoundError:
            # Leave it out of the file, but report it below instead of dropping it silently.
            missing.append(pkg)
            continue
        f.write(f"{pkg}=={installed_version}\n")

if missing:
    print("Not installed, left out of the file:", ", ".join(missing))
print("Clean requirements file created.")


Clean requirements file created.


  import pkg_resources


In [None]:
# Download the trimmed requirements file through Colab's file helper (Colab-only).
from google.colab import files
files.download("requirements_clean.txt")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>