In [21]:
import os
import dotenv
from pydantic import BaseModel, Field
from typing import List, Optional, Literal
from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_nomic.embeddings import NomicEmbeddings  # local
from langchain_openai import OpenAIEmbeddings  # api
from src.services.retrival_engine import RetrivalEngine
from src.utils.document_loader import load_documents_from_urls

os.environ.clear()
dotenv.load_dotenv()

True

In [25]:
import pandas as pd
df_2 = pd.read_excel('urls.xlsx')
df_2

Unnamed: 0,urls
0,https://lpi.oregonstate.edu/sites/lpi.oregonst...
1,https://www.accessdata.fda.gov/scripts/Interac...
2,https://www.hilarispublisher.com/open-access/e...


# LLM

In [2]:
local_llm = "llama3.2:latest"
model_tested = "llama3.2:latest"
metadata = f"CRAG, {model_tested}"

# Create Index
Let's index 3 blog posts

In [3]:
urls = [
    "https://lpi.oregonstate.edu/sites/lpi.oregonstate.edu/files/pdf/mic/micronutrients_for_health.pdf",
    "https://www.accessdata.fda.gov/scripts/InteractiveNutritionFactsLabel/assets/InteractiveNFL_Vitamins%26MineralsChart_October2021.pdf",
    "https://www.hilarispublisher.com/open-access/essential-nutrients-in-human-body.pdf",
]

# Load documents from the URLs
docs = [PyPDFLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

# Initialize a text splitter with specified chunk size and overlap
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600, chunk_overlap=100
)

# Create a retrival Engine
retrival_engine = RetrivalEngine()

# Split the documents into chunks
doc_splits = text_splitter.split_documents(docs_list)

contents = [doc.page_content for doc in doc_splits]
metadatas = []

for doc in doc_splits:
    metadata = dict(doc.metadata)
    metadata["content"] = "nutrition_article"
    metadatas.append(metadata)

# Add items to retrival engine in smaller batches
batch_size = 10
for i in range(0, len(contents), batch_size):
    batch_contents = contents[i:i+batch_size]
    batch_metadatas = metadatas[i:i+batch_size]
    
    # This will use the cache for any existing embeddings
    ids = retrival_engine.bulk_add_items(
        contents=batch_contents,
        metadatas=batch_metadatas,
        item_types=["nutrition_document"] * len(batch_contents)
    )

print("All documents stored in Pinecone!")


Loaded 82 cached embeddings
Using existing Pinecone index: recommendation-index
Using cached embedding for: Micronutrients for Health
   i...
Using cached embedding for: almonds, clams, spinach, chick...
Using cached embedding for: Biotin (Vitamin B7)
•
 A
ssist...
Using cached embedding for: latitude, skin pigmentation, a...
Using cached embedding for: Vitamin K
•
 A
ssists in blood...
Using cached embedding for: raisins, peanut butter
Fluorid...
Using cached embedding for: This information and more can ...
Using cached embedding for: shrimp
, whole-wheat bread, br...
Using cached embedding for: www.fda.gov/nutritioneducation...
Using cached embedding for: www.fda.gov/nutritioneducation...
Using cached embedding for: www.fda.gov/nutritioneducation...
Using cached embedding for: www.fda.gov/nutritioneducation...
Using cached embedding for: www.fda.gov/nutritioneducation...
Using cached embedding for: www.fda.gov/nutritioneducation...
Using cached embedding for: www.fda.gov/nutritionedu

In [4]:
# Get recommendations for a query
query = "Retrieve high-protein breakfast meals suitable for Male, 28 years old, Lightly active activity, avoiding ['Peanuts']"
results = retrival_engine.get_retrivals(query, top_k=3)

# Display results
print(f"Top recommendations for '{query}':\n")
for i, result in enumerate(results, 1):
    print(f"{i}. Score: {result['score']:.2f}")
    print(f"Content: {result['metadata']['content']}\n")

Using cached embedding for: Retrieve high-protein breakfas...
Top recommendations for 'Retrieve high-protein breakfast meals suitable for Male, 28 years old, Lightly active activity, avoiding ['Peanuts']':

1. Score: 0.75
Content: www.fda.gov/nutritioneducation
Interactive Nutrition Facts Label • October 2021 
Vitamins and Minerals Chart 8 
* The Daily Values are reference amounts of nutrients to consume or not to exceed each day.
MINERAL WHAT IT DOES WHERE IT IS FOUND DAILY 
VALUE*
Selenium • A ntioxidant
• Immune function
• Reproduction
• Thyroid function
• Eggs
• Enriched pasta and rice
• Meat
• Nuts (e.g., Brazil nuts) and seeds
• Poultry
• Seafood
• Whole grains
55 mcg
Sodium
Nutrient to get 
less of
• Acid-base balance
• Blood pressure regulation
• Fluid balance
• Muscle contraction
• Nervous system function
• Deli meat sandwiches
• Pizza
• Burritos and tacos
• Soups
• Savory snacks (e.g., chips, crackers,
popcorn)
• Poultry
• Pasta mixed dishes
• Burgers
• Egg dishes and omelets

# Define Tools

In [10]:
class UserProfile(BaseModel):
    name: Optional[str] = Field(None, description="User's full name")
    age: int = Field(..., ge=0, le=120, description="User's age")
    gender: str = Field(..., description="Male, Female, or Other")
    height_cm: int = Field(..., gt=50, lt=250, description="Height in cm")
    weight_kg: float = Field(..., gt=20, lt=300, description="Weight in kg")
    activity_level: str = Field(..., description="Sedentary, Lightly active, Moderately active, Very active, Super active")
    dietary_preferences: List[Literal[
        "Vegetarian", "Vegan", "Pescatarian", "Keto", "Paleo", 
        "Gluten-Free", "Dairy-Free", "Nut-Free", "Halal", "Kosher",
        "Low-Carb", "Low-Fat", "High-Protein", "Mediterranean", "FODMAP", "Sugar-Free"
    ]] = Field(default=[], description="User's dietary preferences, can be one or more.")
    allergies: List[str] = Field(default=[], description="User's allergies")
    health_conditions: List[str] = Field(default=[], description="Any medical conditions")
    weight_goal: str = Field(..., description="Lose weight, Maintain weight, Gain muscle")


user_profile = {
    "name": "Space Cadet",
    "age": 23,
    "gender": "Male",
    "height_cm": 183,
    "weight_kg": 65,
    "activity_level": "Lightly active",
    "dietary_preferences": ["Dairy-Free", "Low-Carb"],
    "allergies": ["Peanuts"],
    "health_conditions": ["None"],
    "weight_goal": "Maintain weight"
}

user = UserProfile(**user_profile)
past_meals=[]
print(user.model_dump_json(indent=4))

{
    "name": "Space Cadet",
    "age": 23,
    "gender": "Male",
    "height_cm": 183,
    "weight_kg": 65.0,
    "activity_level": "Lightly active",
    "dietary_preferences": [
        "Dairy-Free",
        "Low-Carb"
    ],
    "allergies": [
        "Peanuts"
    ],
    "health_conditions": [
        "None"
    ],
    "weight_goal": "Maintain weight"
}


In [17]:
llm = ChatOllama(model=local_llm, format="json", temperature=0)

# Retrieval Prompt Template for Multiple Meal Plans
retriever_prompt = PromptTemplate(
    template="""
    You are a nutritionist AI assistant that helps users generate **personalized meal recommendations** based on their profile.
    
    The user profile is as follows:

    - Age: {age}
    - Gender: {gender}
    - Height: {height_cm} cm
    - Weight: {weight_kg} kg
    - Activity Level: {activity_level}
    - Dietary Preferences: {dietary_preferences}
    - Allergies: {allergies}
    - Health Conditions: {health_conditions}
    - Weight Goal: {weight_goal}
    - Past Meal History (if available): {past_meals}

    ### Task:
    Generate a structured JSON response with **multiple queries** for retrieving meal plans.  
    Each query should focus on **one meal category**:  
    - Breakfast  
    - Lunch  
    - Dinner  
    - Snacks  

    Ensure that meals align with **dietary preferences, allergies, and weight goals** while maintaining **nutritional balance**.

    Your output must be a valid JSON object structured as follows:
    ```json
    {{
        "queries": [
            {{
                "meal_type": "Breakfast",
                "query": "Retrieve high-protein breakfast meals suitable for {gender}, {age} years old, {activity_level} activity, avoiding {allergies}."
            }},
            {{
                "meal_type": "Lunch",
                "query": "Retrieve balanced lunch options with {dietary_preferences} for a {weight_goal} goal, avoiding {allergies}."
            }},
            {{
                "meal_type": "Dinner",
                "query": "Find nutritious dinners for {age}-year-old {gender} aiming to {weight_goal}."
            }},
            {{
                "meal_type": "Snacks",
                "query": "Suggest healthy snack options that fit within a {dietary_preferences} diet while avoiding {allergies}."
            }}
        ]
    }}
    ```
    """,
    input_variables=[
        "age",
        "gender",
        "height_cm",
        "weight_kg",
        "activity_level",
        "dietary_preferences",
        "allergies",
        "health_conditions",
        "weight_goal",
        "past_meals"
    ],
)

# Output Parser
output_parser = JsonOutputParser()

# Function to Generate Multiple Queries
def generate_retrieval_queries(user_profile):
    formatted_prompt = retriever_prompt.format(**user_profile)
    response = llm.invoke(formatted_prompt)
    return output_parser.parse(response.content).get("queries", [])

# Example User Profile
user_profile = {
    "age": 28,
    "gender": "Male",
    "height_cm": 183,
    "weight_kg": 65,
    "activity_level": "Lightly active",
    "dietary_preferences": ["Dairy-Free", "Low-Carb"],
    "allergies": ["Peanuts"],
    "health_conditions": ["None"],
    "weight_goal": "Maintain weight",
    "past_meals": ["Oatmeal with fruits", "Grilled chicken with rice", "Salmon with vegetables"],
}

# Generate queries
queries = generate_retrieval_queries(user_profile)
print(queries)

[{'meal_type': 'Breakfast', 'query': "Retrieve high-protein breakfast meals suitable for Male, 28 years old, Lightly active activity, avoiding ['Peanuts']."}, {'meal_type': 'Lunch', 'query': "Retrieve balanced lunch options with ['Dairy-Free', 'Low-Carb'] for a Maintain weight goal, avoiding ['Peanuts']."}, {'meal_type': 'Dinner', 'query': 'Find nutritious dinners for 28-year-old Male aiming to Maintain weight.'}, {'meal_type': 'Snacks', 'query': "Suggest healthy snack options that fit within a ['Dairy-Free', 'Low-Carb'] diet while avoiding ['Peanuts']."}]


In [18]:
queries[1]

{'meal_type': 'Lunch',
 'query': "Retrieve balanced lunch options with ['Dairy-Free', 'Low-Carb'] for a Maintain weight goal, avoiding ['Peanuts']."}

In [20]:
for quey in queries:
    query = quey["query"]
    print(f"Query: {query}")
    results = retrival_engine.get_retrivals(query, top_k=3)
    print(f"Top recommendations for '{query}':\n")
    for i, result in enumerate(results, 1):
        print(f"{i}. Score: {result['score']:.2f}")
        print(f"Content: {result['metadata']['content']}\n")

Query: Retrieve high-protein breakfast meals suitable for Male, 28 years old, Lightly active activity, avoiding ['Peanuts'].
Using cached embedding for: Retrieve high-protein breakfas...
Top recommendations for 'Retrieve high-protein breakfast meals suitable for Male, 28 years old, Lightly active activity, avoiding ['Peanuts'].':

1. Score: 0.75
Content: www.fda.gov/nutritioneducation
Interactive Nutrition Facts Label • October 2021 
Vitamins and Minerals Chart 8 
* The Daily Values are reference amounts of nutrients to consume or not to exceed each day.
MINERAL WHAT IT DOES WHERE IT IS FOUND DAILY 
VALUE*
Selenium • A ntioxidant
• Immune function
• Reproduction
• Thyroid function
• Eggs
• Enriched pasta and rice
• Meat
• Nuts (e.g., Brazil nuts) and seeds
• Poultry
• Seafood
• Whole grains
55 mcg
Sodium
Nutrient to get 
less of
• Acid-base balance
• Blood pressure regulation
• Fluid balance
• Muscle contraction
• Nervous system function
• Deli meat sandwiches
• Pizza
• Burritos and ta