### Document Ingestion

In [3]:
# !pip install PyPDF2

In [4]:
# Install required libraries
# !pip install PyPDF2

from PyPDF2 import PdfReader

# Open the source document
pdf_path = "nutrition_paper.pdf"
reader = PdfReader(pdf_path)

# Collect the text of every page (each followed by a newline), then
# stitch the pieces together in one pass.
page_texts = []
for page in reader.pages:
    page_texts.append(page.extract_text() + "\n")
text = "".join(page_texts)

# Sanity check: preview the first 500 characters of the extracted text
print(text[:500])


DIETARY GUIDELINES 
FOR INDIANS 
-A Manual 
NATIONAL INSTITUTE OF NUTRITION 
Hyderabad – 500 007, INDIA 
First Published                 .....           1998 
Reprinted                         .....           1999, 2003, 2005, 2007 Second Edition                .....            2011 
ATTENTION  READERS 
Readers can download the soft copy of the “Dietary Guidelines 
for Indians” free of cost for their own use and also for 
dissemination of information for  the benefit of the society on non- 
pro


In [5]:
# Simple word-based sliding-window chunking
chunk_size = 500  # words per chunk
overlap = 50      # words shared between consecutive chunks


def split_into_chunks(words, chunk_size, overlap):
    """Split a list of words into overlapping, space-joined chunks.

    Args:
        words: Sequence of word strings, in document order.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words repeated at the start of the next chunk;
            must satisfy 0 <= overlap < chunk_size.

    Returns:
        A list of strings. Each window starts ``chunk_size - overlap`` words
        after the previous one. Empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of words")
    if not 0 <= overlap < chunk_size:
        # A step of zero makes range() raise; a negative step silently
        # produces no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    result = []
    for start in range(0, len(words), step):
        result.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would only
        # repeat words already contained in this chunk, so stop here.
        if start + chunk_size >= len(words):
            break
    return result


words = text.split()
chunks = split_into_chunks(words, chunk_size, overlap)

print(f"Total chunks: {len(chunks)}")
# Preview the first chunk; guard against a document with no extractable text
print(chunks[0][:300] if chunks else "")

Total chunks: 74
DIETARY GUIDELINES FOR INDIANS -A Manual NATIONAL INSTITUTE OF NUTRITION Hyderabad – 500 007, INDIA First Published ..... 1998 Reprinted ..... 1999, 2003, 2005, 2007 Second Edition ..... 2011 ATTENTION READERS Readers can download the soft copy of the “Dietary Guidelines for Indians” free of cost fo


In [6]:
# Install the OpenAI library if it is not already installed
# !pip install openai

In [7]:
# !pip install python-dotenv

In [8]:
from dotenv import load_dotenv
from openai import OpenAI
import os

load_dotenv()

# Fail fast with a clear message when the key is missing *or* empty
# (e.g. a bare `OPENAI_API_KEY=` line in .env yields an empty string).
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY not found. Check your .env file!")

client = OpenAI(api_key=api_key)

EMBEDDING_MODEL = "text-embedding-ada-002"
# Number of chunks sent per request. NOTE(review): chosen conservatively;
# confirm against the API's current per-request input/token limits.
EMBEDDING_BATCH_SIZE = 64

embeddings = []

# Send chunks in batches instead of one HTTP request per chunk.
for start in range(0, len(chunks), EMBEDDING_BATCH_SIZE):
    batch = chunks[start:start + EMBEDDING_BATCH_SIZE]
    response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=batch
    )
    # Each returned item carries the index of the input it belongs to;
    # sort on it so embeddings[i] always corresponds to chunks[i].
    ordered_items = sorted(response.data, key=lambda item: item.index)
    embeddings.extend(item.embedding for item in ordered_items)

# Downstream cells pair chunks and embeddings by position.
if len(embeddings) != len(chunks):
    raise RuntimeError(
        f"Expected {len(chunks)} embeddings but received {len(embeddings)}"
    )

print(f"Total embeddings created: {len(embeddings)}")
if embeddings:
    print(f"Length of one embedding vector: {len(embeddings[0])}")

Total embeddings created: 74
Length of one embedding vector: 1536


In [9]:
# !pip install chromadb

In [10]:
# import chromadb
# print(chromadb.__version__)

In [14]:
import chromadb
from chromadb.config import Settings

# Initialize Chroma client with default settings
# NOTE(review): default Settings presumably give a non-persistent store that
# lives only as long as this kernel — confirm for the installed version.
client_db = chromadb.Client(Settings())

# Re-running this cell must not fail: create_collection raises when the
# collection already exists, so fetch-or-create it and then empty it to keep
# the "fresh collection" behaviour.
collection = client_db.get_or_create_collection("nutrition_docs")
existing_ids = collection.get()["ids"]
if existing_ids:
    collection.delete(ids=existing_ids)

# Chunks and embeddings are paired by position, so their lengths must agree.
if len(chunks) != len(embeddings):
    raise ValueError(
        f"Got {len(chunks)} chunks but {len(embeddings)} embeddings"
    )

# Add every chunk and its embedding in a single batched call
if chunks:
    collection.add(
        ids=[str(i) for i in range(len(chunks))],   # unique string ID per chunk
        documents=chunks,
        embeddings=embeddings,
        metadatas=[{"chunk_id": i} for i in range(len(chunks))]
    )

print("All chunks stored in Chroma database!")

All chunks stored in Chroma database!


In [16]:
# Question to answer from the ingested document
query = "What are the benefits of omega-3?"

# Turn the question into an embedding vector
query_resp = client.embeddings.create(input=query, model="text-embedding-ada-002")
query_emb = query_resp.data[0].embedding

# Look up the three stored chunks closest to the question
results = collection.query(n_results=3, query_embeddings=[query_emb])

# Only one query embedding was sent, so take the first list of documents
# and merge it into a single context string.
retrieved_docs = results['documents'][0]
context = " ".join(retrieved_docs)

# Assemble the prompt: context first, then the question
prompt = (
    "Answer the question using the following context:\n"
    f"{context}\n\n"
    f"Question: {query}"
)

# Ask GPT-3.5-turbo for an answer based on the retrieved context
messages = [{"role": "user", "content": prompt}]
response = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)

answer = response.choices[0].message.content
print("Answer:\n", answer)

Answer:
 Omega-3 fatty acids, particularly long chain n-3 PUFA found in fish oils and microalgae, have numerous health benefits. They are anti-inflammatory, anti-atherogenic, and antithrombotic. Omega-3s increase insulin sensitivity, improve peripheral glucose utilization, and decrease adiposity. They are essential for vision and brain growth, making them especially important for pregnant women. Consuming foods rich in omega-3 can help prevent inflammation, accumulation of fatty material in blood vessels, and clotting of blood, ultimately reducing the risk of cardiovascular diseases. Omega-3 fatty acids also help maintain a healthy balance of fats in the body and support the functioning of various systems including vascular, immune, nervous, and renal systems.
