In [1]:
import os
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import List, Optional, Literal
from langchain.prompts import PromptTemplate
from langchain_ollama import ChatOllama
from langchain_core.output_parsers import JsonOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_nomic.embeddings import NomicEmbeddings  # local
from langchain_openai import OpenAIEmbeddings  # api
from src.services.retrival_engine import RetrivalEngine
from src.utils.document_loader import load_documents_from_urls
import json
from enum import Enum
from langchain_core.messages import HumanMessage, SystemMessage
os.environ.clear()
load_dotenv()

USER_AGENT environment variable not set, consider setting it to identify your requests.
  from .autonotebook import tqdm as notebook_tqdm


True

In [3]:
import pandas as pd
df_2 = pd.read_excel('urls.xlsx')
df_2

Unnamed: 0,urls
0,https://drive.google.com/file/d/1ntkwGeHmmkyQX...
1,https://drive.google.com/file/d/1ncu7OPrXB2i3n...
2,https://drive.google.com/file/d/199F9gjwuSCNRC...


# LLM

In [5]:
local_llm = "llama3.2:latest"
model_tested = "llama3.2:latest"
metadata = f"CRAG, {model_tested}"

# Create Index
Let's index 3 blog posts

In [2]:
urls = [
    "https://lpi.oregonstate.edu/sites/lpi.oregonstate.edu/files/pdf/mic/micronutrients_for_health.pdf",
    # "https://www.accessdata.fda.gov/scripts/InteractiveNutritionFactsLabel/assets/InteractiveNFL_Vitamins%26MineralsChart_October2021.pdf",
    
    "https://www.hilarispublisher.com/open-access/essential-nutrients-in-human-body.pdf",
]

# Load documents from the URLs
docs = [PyPDFLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

# Initialize a text splitter with specified chunk size and overlap
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600, chunk_overlap=100
)

# Create a retrival Engine
retrival_engine = RetrivalEngine()

# Split the documents into chunks
doc_splits = text_splitter.split_documents(docs_list)

contents = [doc.page_content for doc in doc_splits]
metadatas = []

for doc in doc_splits:
    metadata = dict(doc.metadata)
    metadata["content"] = "nutrition_article"
    metadatas.append(metadata)

# Add items to retrival engine in smaller batches
batch_size = 10
for i in range(0, len(contents), batch_size):
    batch_contents = contents[i:i+batch_size]
    batch_metadatas = metadatas[i:i+batch_size]
    
    # This will use the cache for any existing embeddings
    ids = retrival_engine.bulk_add_items(
        contents=batch_contents,
        metadatas=batch_metadatas,
        item_types=["nutrition_document"] * len(batch_contents)
    )

print("All documents stored in Pinecone!")


Loaded 24 cached embeddings
Using existing Pinecone index: recommendation-index
Loaded 4 cached retrieval results
Using cached embedding for: Micronutrients for Health
   i...
Using cached embedding for: almonds, clams, spinach, chick...
Using cached embedding for: Biotin (Vitamin B7)
•
 A
ssist...
Using cached embedding for: latitude, skin pigmentation, a...
Using cached embedding for: Vitamin K
•
 A
ssists in blood...
Using cached embedding for: raisins, peanut butter
Fluorid...
Using cached embedding for: This information and more can ...
Using cached embedding for: shrimp
, whole-wheat bread, br...
Using cached embedding for: Essential Nutrients in Human B...
Using cached embedding for: Fiber, starch, and sugar. Amon...
Saved 0 retrieval results to cache
Using cached embedding for: may even help you lose weight,...
Saved 0 retrieval results to cache
All documents stored in Pinecone!


# Building the graph

## Document retriver

In [3]:
class UserProfile(BaseModel):
    name: Optional[str] = Field(None, description="User's full name")
    age: int = Field(..., ge=0, le=120, description="User's age")
    gender: str = Field(..., description="Male, Female, or Other")
    height_cm: int = Field(..., gt=50, lt=250, description="Height in cm")
    weight_kg: float = Field(..., gt=20, lt=300, description="Weight in kg")
    activity_level: str = Field(..., description="Sedentary, Lightly active, Moderately active, Very active, Super active")
    dietary_preferences: List[Literal[
        "Vegetarian", "Vegan", "Pescatarian", "Keto", "Paleo", 
        "Gluten-Free", "Dairy-Free", "Nut-Free", "Halal", "Kosher",
        "Low-Carb", "Low-Fat", "High-Protein", "Mediterranean", "FODMAP", "Sugar-Free"
    ]] = Field(default=[], description="User's dietary preferences, can be one or more.")
    allergies: List[str] = Field(default=[], description="User's allergies")
    health_conditions: List[str] = Field(default=[], description="Any medical conditions")
    weight_goal: str = Field(..., description="Lose weight, Maintain weight, Gain muscle")


user_profile = {
    "name": "Space Cadet",
    "age": 23,
    "gender": "Male",
    "height_cm": 183,
    "weight_kg": 65,
    "activity_level": "Lightly active",
    "dietary_preferences": ["Dairy-Free", "Low-Carb"],
    "allergies": ["Peanuts"],
    "health_conditions": ["None"],
    "weight_goal": "Maintain weight"
}

user = UserProfile(**user_profile)
past_meals=[]
print(user.model_dump_json(indent=4))

{
    "name": "Space Cadet",
    "age": 23,
    "gender": "Male",
    "height_cm": 183,
    "weight_kg": 65.0,
    "activity_level": "Lightly active",
    "dietary_preferences": [
        "Dairy-Free",
        "Low-Carb"
    ],
    "allergies": [
        "Peanuts"
    ],
    "health_conditions": [
        "None"
    ],
    "weight_goal": "Maintain weight"
}


In [13]:
llm = ChatOllama(model=local_llm, format="json", temperature=0)
llm_json_mode = ChatOllama(model=local_llm, format="json", temperature=0)

# Retrieval Prompt Template for Multiple Meal Plans
retriever_prompt = PromptTemplate(
    template="""
    You are a nutritionist AI assistant that helps users generate **personalized meal recommendations** based on their profile.
    
    The user profile is as follows:

    - Age: {age}
    - Gender: {gender}
    - Height: {height_cm} cm
    - Weight: {weight_kg} kg
    - Activity Level: {activity_level}
    - Dietary Preferences: {dietary_preferences}
    - Allergies: {allergies}
    - Health Conditions: {health_conditions}
    - Weight Goal: {weight_goal}
    - Past Meal History (if available): {past_meals}

    ### Task:
    Generate a structured JSON response with **multiple queries** for retrieving meal plans.  
    Each query should focus on **one meal category**:  
    - Breakfast  
    - Lunch  
    - Dinner  
    - Snacks  

    Ensure that meals align with **dietary preferences, allergies, and weight goals** while maintaining **nutritional balance**.

    Your output must be a valid JSON object structured as follows:
    ```json
    {{
        "queries": [
            {{
                "meal_type": "BREAKFAST",
                "query": "Retrieve high-protein breakfast meals suitable for {gender}, {age} years old, {activity_level} activity, avoiding {allergies}."
            }},
            {{
                "meal_type": "LUNCH",
                "query": "Retrieve balanced lunch options with {dietary_preferences} for a {weight_goal} goal, avoiding {allergies}."
            }},
            {{
                "meal_type": "DINNER",
                "query": "Find nutritious dinners for {age}-year-old {gender} aiming to {weight_goal}."
            }},
            {{
                "meal_type": "SNACKS",
                "query": "Suggest healthy snack options that fit within a {dietary_preferences} diet while avoiding {allergies}."
            }}
        ]
    }}
    ```
    """,
    input_variables=[
        "age",
        "gender",
        "height_cm",
        "weight_kg",
        "activity_level",
        "dietary_preferences",
        "allergies",
        "health_conditions",
        "weight_goal",
        "past_meals"
    ],
)

# Output Parser
output_parser = JsonOutputParser()

# Function to Generate Multiple Queries
def generate_retrieval_queries(user_profile, llm):
    formatted_prompt = retriever_prompt.format(**user_profile)
    response = llm.invoke(formatted_prompt)
    return output_parser.parse(response.content).get("queries", [])

# Example User Profile
user_profile = {
    "age": 28,
    "gender": "Male",
    "height_cm": 183,
    "weight_kg": 65,
    "activity_level": "Lightly active",
    "dietary_preferences": ["Dairy-Free", "Low-Carb"],
    "allergies": ["Peanuts"],
    "health_conditions": ["None"],
    "weight_goal": "Maintain weight",
    "past_meals": ["Oatmeal with fruits", "Grilled chicken with rice", "Salmon with vegetables"],
}

# Generate queries
queries = generate_retrieval_queries(user_profile, llm_json_mode)
print(queries)

[{'meal_type': 'BREAKFAST', 'query': "Retrieve high-protein breakfast meals suitable for Male, 28 years old, Lightly active activity, avoiding ['Peanuts']."}, {'meal_type': 'LUNCH', 'query': "Retrieve balanced lunch options with ['Dairy-Free', 'Low-Carb'] for a Maintain weight goal, avoiding ['Peanuts']."}, {'meal_type': 'DINNER', 'query': 'Find nutritious dinners for 28-year-old Male aiming to Maintain weight.'}, {'meal_type': 'SNACKS', 'query': "Suggest healthy snack options that fit within a ['Dairy-Free', 'Low-Carb'] diet while avoiding ['Peanuts']."}]


In [15]:
# fucntion to get retrival docs
def retrieve_docs(queries, retriver):
    all_results = {}
    for query in queries:
        meal_type = query["meal_type"]
        results = retriver.get_retrivals(query['query'], top_k=3)
        all_results[meal_type] = results
    
    return all_results

queries = generate_retrieval_queries(user_profile, llm_json_mode)
results = retrieve_docs(queries, retrival_engine)
results

Using cached retrieval results for: Retrieve high-protein breakfas...
Using cached retrieval results for: Retrieve balanced lunch option...
Using cached retrieval results for: Find nutritious dinners for 28...
Using cached retrieval results for: Suggest healthy snack options ...


{'BREAKFAST': [{'id': 'item_5',
   'score': 0.791204035,
   'metadata': {'content': 'raisins, peanut butter\nFluoridated water, crab meat, beans, \nblack tea, raisins, cereal, fish, fruit juice\nCod, iodized salt, potatoes (with skin), \nmilk, shrimp, turkey, navy beans, tuna, \neggs, seaweed\nBeef, fortified cereal, beans, oysters, \nmolasses, lentils, firm tofu, kidney beans, \ncashews, spinach, potatoes (with skin), \nshrimp, light tuna, eggs, tomatoes, dark-\nmeat chicken and turkey, raisins, prunes\n \nP\numpkin seeds, almonds, cashews, \nbeans, spinach, milk, figs, brown rice, \ncocoa powder, molasses, peanuts, \npineapple, okra, milk, bananas\nMen: 120 mg\nWomen: 90 mg \nPregnancy: 90 mg\nBreast-feeding: 90 mg\nAdults: 1,000 mg \nPregnancy: 1,000 mg\nBreast-feeding: 1,000 mg \nMen over 70: 1,200 mg\nWomen over 50: 1,200 mg\nMen: 35 mg\nMen over 50: 30 mg \nWomen: 25 mg \nPregnancy: 30 mg\nBreast-feeding: 45 mg\nWomen over 50: 20 mg\nAdults: 900 mg\nPregnancy: 1,000 mg \nBreast-f

In [16]:
results.keys()

dict_keys(['BREAKFAST', 'LUNCH', 'DINNER', 'SNACKS'])

## Router

In [17]:
router_instructions = """
You are an expert in routing user health profiles to the most relevant data source. Your task is to determine whether a query should be answered using a vectorstore or a web search.
	•	The vectorstore contains documents related to nutrition and food for health. Use this for queries specifically about diet, nutrition, or health-related food topics.
	•	For all other topics, especially current events, use web search as the data source.
	•	Your response must be a JSON object with a single key, datasource, whose value is either 'vectorstore' or 'websearch'. example: {"datasource": "vectorstore"} or {"datasource": "websearch"}

Ensure your decision-making is clear, accurate, and follows these rules strictly.
"""

# Testing
test_web_search = llm_json_mode.invoke(
	[SystemMessage(router_instructions), HumanMessage("What are the benefits of turmeric?")]
)
test_web_search2 = llm_json_mode.invoke(
	[SystemMessage(router_instructions), HumanMessage("Who is the president of the United States?")]
)
print(
	json.loads(test_web_search.content),
	json.loads(test_web_search2.content)
)

{'datasource': 'vectorstore'} {'datasource': 'websearch'}


## Retrival Grader

In [None]:
load_dotenv()

from langchain_openai import ChatOpenAI

llm_chat = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

llm_chat_json = llm_chat.with_structured_output(method="json_mode")

doc_grader_instructions = """
You are a grader assessing the quality and relevance of a retrieved nutrition document to a user's profile and a specified meal type.

Evaluate whether the document provides useful, accurate, and practical nutritional information that aligns with health and dietary topics, as well as the specified meal type.

If the document contains scientifically valid and actionable insights on nutrition and is relevant to the given meal type, grade it as relevant.
"""

doc_grader_prompt = """
Here is the retrieved nutrition document: \n\n {document} \n\n
Here is the user's profile: \n\n {user_profile} \n\n
Meal type: {meal_type} \n\n
Here is the grading criteria:  
- The document should be scientifically accurate and credible.  
- It should be relevant to nutrition and health-related topics.  
- It should provide clear, actionable, and practical guidance.  
- It should align with the specified meal type ({meal_type}).  

Carefully and objectively assess whether the document meets these criteria.  

Return JSON with a single key, binary_score, that is either 'yes' or 'no' to indicate whether the document is relevant to nutrition, health topics, and the specified meal type.
example {'binary_score': 'yes'} or {'binary_score': 'no'}
"""

class MealTypes(Enum):
    BREAKFAST = "Breakfast"
    LUNCH = "Lunch"
    DINNER = "Dinner"
    SNACKS = "Snacks"

class DocumentMetadata(BaseModel):
    content: str
    creationdate: Optional[str]
    creator: Optional[str]
    item_type: str
    moddate: Optional[str]
    page: Optional[float]
    page_label: Optional[str]
    producer: Optional[str]
    source: Optional[str]
    total_pages: Optional[float]
    trapped: Optional[str]

class RetrivedDocument(BaseModel):
    id: str
    score: float
    metadata: DocumentMetadata


class GradedDocuments(BaseModel):
    meal_type: MealTypes
    # documents: List[RetrivedDocument]
    valid: bool


# function to grade the documents 
def grade_documents(user_profile, retrieve_docs:List[RetrivedDocument], meal_type:MealTypes) -> GradedDocuments:
    grade_documents:List[GradedDocuments] = []
    
    for doc in retrieve_docs:
        doc_content = doc.metadata.content
        formatted_prompt = doc_grader_prompt.format(document=doc_content, user_profile=user_profile, meal_type=meal_type.value)    
        response = llm_chat_json.invoke(formatted_prompt)
        grade_documents.append({
            "meal_type": meal_type,
            "valid": response.binary_score == 'yes'
        })
    
    return grade_documents

grade_documents = grade_documents(user_profile, results[MealTypes.BREAKFAST], MealTypes.BREAKFAST)
