In [1]:
import os 
from dotenv import load_dotenv 
# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail fast with a readable message. The previous
# `os.environ[...] = os.getenv(...)` round-trip added nothing once
# load_dotenv() had run, and raised "TypeError: str expected, not NoneType"
# when the key was missing.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

## **`Data Ingestion`**

#### **PDF Text Extraction**

In [2]:
from langchain_community.document_loaders import PyMuPDFLoader
import fitz

# Location of the source PDF. Defaults to the author's local copy; set the
# NUTRITION_PDF_PATH environment variable to run the notebook elsewhere
# without editing this cell.
file_path = os.getenv(
    "NUTRITION_PDF_PATH",
    r"C:\Users\hites\OneDrive\Desktop\LLMops\data\human-nutrition-text.pdf",
)

# Directory that receives the images extracted from the PDF.
output_dir = "./extracted_images"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Fixed image extraction code
# Writes every embedded image of the PDF to `output_dir` and records, per image,
# its 1-based page number, the saved file path and its bounding box on the page.
image_metadata = []

# The context manager closes the PDF even if extraction fails part-way through.
with fitz.open(file_path) as pdf:
    for page_index in range(len(pdf)):
        page = pdf[page_index]
        images = page.get_images(full=True)

        for img_index, img in enumerate(images, start=1):
            xref = img[0]
            base_image = pdf.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            # Save image
            image_filename = f"page{page_index + 1}_img{img_index}.{image_ext}"
            image_path = os.path.join(output_dir, image_filename)
            with open(image_path, 'wb') as f:
                f.write(image_bytes)

            # Find image location (bounding box) - with error handling.
            # get_image_bbox() expects the image-list item (or its name, img[7]);
            # an integer xref does not identify an image for this call, so the
            # previous lookup by xref could never produce a bounding box.
            try:
                rect = page.get_image_bbox(img)
                bbox = [round(coord, 2) for coord in rect]
            except ValueError as e:
                print(f"Warning: Could not get bounding box for image {xref} on page {page_index + 1}: {e}")
                bbox = None

            image_metadata.append({
                "page": page_index + 1,
                "image_path": image_path,
                "bbox": bbox
            })

print(f"\nExtracted and saved {len(image_metadata)} images to '{output_dir}'")


In [None]:
# Alternative approach using get_image_rects method
def _locate_image(page, img):
    """Return ``[x0, y0, x1, y1]`` (rounded to 2 decimals) for an image-list item, or None."""
    # Method 1: get_image_bbox takes the image-list item itself (not the xref).
    try:
        return [round(coord, 2) for coord in page.get_image_bbox(img)]
    except ValueError:
        pass

    # Method 2: get_image_rects accepts the xref and returns one rectangle per
    # placement of the image on the page; the first placement is used.
    xref = img[0]
    try:
        rects = page.get_image_rects(xref)
    except Exception as exc:  # best-effort lookup: report the failure, keep going
        print(f"Warning: Could not get bounding box for image {xref}: {exc}")
        return None
    if rects:
        return [round(coord, 2) for coord in rects[0]]
    return None


def extract_images_with_bounds(pdf_path, output_dir):
    """
    Extract images from PDF with better bounding box detection.

    Every embedded image is written to ``output_dir`` as
    ``page<N>_img<M>.<ext>`` (N and M are 1-based).

    Args:
        pdf_path: Path of the PDF to read.
        output_dir: Directory that receives the image files; created if missing.

    Returns:
        A list with one dict per image, holding the keys ``"page"`` (1-based
        page number), ``"image_path"``, ``"bbox"`` (``[x0, y0, x1, y1]`` or
        ``None`` when the image could not be located) and ``"xref"``.
    """
    os.makedirs(output_dir, exist_ok=True)
    image_metadata = []

    # The context manager closes the PDF even if extraction fails part-way through.
    with fitz.open(pdf_path) as pdf:
        for page_index in range(len(pdf)):
            page = pdf[page_index]
            images = page.get_images(full=True)

            for img_index, img in enumerate(images, start=1):
                xref = img[0]
                base_image = pdf.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                # Save image
                image_filename = f"page{page_index + 1}_img{img_index}.{image_ext}"
                image_path = os.path.join(output_dir, image_filename)
                with open(image_path, 'wb') as f:
                    f.write(image_bytes)

                image_metadata.append({
                    "page": page_index + 1,
                    "image_path": image_path,
                    "bbox": _locate_image(page, img),
                    "xref": xref
                })

    return image_metadata

# Test the alternative approach: run extract_images_with_bounds on the same PDF
# and output directory as above, keeping the result in a separate variable so
# the existing `image_metadata` list is not overwritten.
print("Testing alternative image extraction method...")
image_metadata_alt = extract_images_with_bounds(file_path, output_dir)
print(f"Extracted {len(image_metadata_alt)} images using alternative method")


In [3]:
# Load the PDF text with LangChain's PyMuPDF loader. The captured outputs in
# this notebook show 1208 Documents for a 1208-page PDF, i.e. one per page.
loader = PyMuPDFLoader(file_path)
documents = loader.load()

In [43]:
# Inspect the first loaded Document (its metadata and page_content).
documents[0]

Document(metadata={'producer': 'Prince 12.5 (www.princexml.com)', 'creator': 'Pressbooks 5.9.2', 'creationdate': '', 'source': 'C:\\Users\\hites\\OneDrive\\Desktop\\LLMops\\data\\human-nutrition-text.pdf', 'file_path': 'C:\\Users\\hites\\OneDrive\\Desktop\\LLMops\\data\\human-nutrition-text.pdf', 'total_pages': 1208, 'format': 'PDF 1.7', 'title': 'Human Nutrition: 2020 Edition', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='Human Nutrition: 2020 Edition')

In [4]:
# Report how many Documents the loader produced.
print(f"Extracted text from {len(documents)} pages\n")

Extracted text from 1208 pages



In [20]:
# Print the text of the Document at index 2. The whole page_content is printed
# (no truncation); the trailing "..." is just a literal marker.
print("Page Text Preview:\n", documents[2].page_content, "...\n")

Page Text Preview:
 Human Nutrition: 2020 
Edition 
UNIVERSITY OF HAWAI‘I AT MĀNOA 
FOOD SCIENCE AND HUMAN 
NUTRITION PROGRAM 
ALAN TITCHENAL, SKYLAR HARA, 
NOEMI ARCEO CAACBAY, WILLIAM 
MEINKE-LAU, YA-YUN YANG, MARIE 
KAINOA FIALKOWSKI REVILLA, 
JENNIFER DRAPER, GEMADY 
LANGFELDER, CHERYL GIBBY, CHYNA 
NICOLE CHUN, AND ALLISON 
CALABRESE ...



#### **PDF Image Extraction**

In [39]:
import fitz
import os

# Writes every embedded image of the PDF to `output_dir` and records, per image,
# its 1-based page number, the saved file path and its bounding box on the page.
image_metadata = []

# The context manager closes the PDF even if extraction fails part-way through
# (previously the handle stayed open whenever an exception escaped the loop).
with fitz.open(file_path) as pdf:
    for page_index in range(len(pdf)):
        page = pdf[page_index]
        images = page.get_images(full=True)

        for img_index, img in enumerate(images, start=1):
            xref = img[0]
            name = img[7]  # reference name of the image, used for the bbox lookup
            base_image = pdf.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            # Save image
            image_filename = f"page{page_index + 1}_img{img_index}.{image_ext}"
            image_path = os.path.join(output_dir, image_filename)
            with open(image_path, "wb") as f:
                f.write(image_bytes)

            # Find bounding box
            try:
                rect = page.get_image_bbox(name)
                bbox = [round(coord, 2) for coord in rect]
            except ValueError:
                bbox = None  # if image name not found on page

            image_metadata.append({
                "page": page_index + 1,
                "image_path": image_path,
                "bbox": bbox
            })

print(f"\nExtracted and saved {len(image_metadata)} images to '{output_dir}'")


Extracted and saved 253 images to './extracted_images'


In [42]:
# Inspect the first image record (keys: "page", "image_path", "bbox").
image_metadata[0]

{'page': 26,
 'image_path': './extracted_images\\page26_img1.jpeg',
 'bbox': [56.69, 318.08, 206.69, 543.08]}

In [44]:
# Attach to each page's Document the metadata of the images found on that page.
# Group the image records by page once, instead of rescanning the whole
# image_metadata list for every document.
images_by_page = {}
for img in image_metadata:
    images_by_page.setdefault(img["page"], []).append(img)

for doc in documents:
    # Loader page numbers are 0-based; image_metadata pages are 1-based.
    page_num = doc.metadata.get("page", 0) + 1
    # Copy so every Document owns its own list (an empty one when the page has no images).
    page_images = list(images_by_page.get(page_num, ()))
    doc.metadata["images"] = page_images

In [50]:
# Print the Document at index 100 (plain-text repr of its content and metadata).
print(documents[100])

page_content='Digestive 
system 
without 
labels by 
Mariana 
Ruiz / Public 
Domain 
Knowing how to maintain the balance of friendly bacteria in your 
intestines through proper diet can promote overall health. Recent 
scientific studies have shown that probiotic supplements positively 
affect intestinal microbial flora, which in turn positively affect 
immune system function. As good nutrition is known to influence 
immunity, there is great interest in using probiotic foods and other 
immune-system-friendly foods as a way to prevent illness. In this 
chapter we will explore not only immune system function, but also 
Introduction  |  59' metadata={'producer': 'Prince 12.5 (www.princexml.com)', 'creator': 'Pressbooks 5.9.2', 'creationdate': '', 'source': 'C:\\Users\\hites\\OneDrive\\Desktop\\LLMops\\data\\human-nutrition-text.pdf', 'file_path': 'C:\\Users\\hites\\OneDrive\\Desktop\\LLMops\\data\\human-nutrition-text.pdf', 'total_pages': 1208, 'format': 'PDF 1.7', 'title': 'Human Nutritio

## **`Chunking`**

In [56]:
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

In [73]:
# Splitter configured with chunk_size=300 and chunk_overlap=50.
# NOTE(review): presumably measured in characters (default length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 300, chunk_overlap = 50)

In [74]:
# Split the loaded page Documents into chunks with the splitter configured above.
text_chunks = text_splitter.split_documents(documents)

### **VectorDB**

In [75]:
from langchain_openai import OpenAIEmbeddings 
from langchain_community.vectorstores import FAISS

In [76]:
# OpenAI embedding client using the "text-embedding-ada-002" model.
# NOTE(review): presumably reads OPENAI_API_KEY from the environment — confirm.
embeddings = OpenAIEmbeddings(model = "text-embedding-ada-002")

In [77]:
# Build a FAISS vector store from the text chunks using the embedding client.
# NOTE(review): presumably embeds every chunk via the OpenAI API on each run,
# which may be slow and billable — consider caching the index.
vectorstore = FAISS.from_documents(text_chunks, embeddings)

In [78]:
# Display the vector store object to confirm it was created.
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x1b684f5d1b0>

### **`Retrieval`**

In [92]:
# Ad-hoc similarity search (top 4 hits) to sanity-check the index.
query = "What is Alcohol and how much to consume ?"
docs = vectorstore.similarity_search(query, k=4)

# Retriever view of the same vector store; the RAG chain below consumes it.
retriever = vectorstore.as_retriever()

# Show each hit's text followed by a 50-dash divider.
divider = "-" * 50
for i, doc in enumerate(docs):
    print(f"Document {i + 1}: ", doc.page_content, divider, sep="\n")

Document 1: 
If alcohol is consumed, it should be consumed in moderation—up 
to one drink per day for women and up to two drinks per day for 
men—and only by adults of legal drinking age. 
Evidence-Based Physical Activity 
Recommendations 
The other part of the energy balance equation is physical activity.
--------------------------------------------------
Document 2: 
the Dietary Guidelines define moderate alcohol intake as no more 
than one drink per day for women and no more than two drinks per 
day for men.4 Although drunkenness has pervaded many cultures, 
drinking in moderation has long been a mantra of multiple cultures 
with access to alcohol.
--------------------------------------------------
Document 3: 
• Amount of alcohol consumed 
• Consumption rate 
• Consumption before or after a meal (food in the stomach slows 
absorption) 
434  |  Introduction
--------------------------------------------------
Document 4: 
If alcohol is consumed, it should be consumed in moderation—up 

### **`Prompt`**

In [106]:
from langchain_core.prompts import ChatPromptTemplate

# Chat prompt for the RAG chain, built from three messages. It has two template
# variables that must be supplied when the prompt is formatted:
# {context} and {question}.
cot_template = ChatPromptTemplate.from_messages([
    # System message 1: the assistant's role, response process and answer style.
    ("system", """You are an analytical RAG assistant that uses step-by-step reasoning.

## Response Process:
1. **Analyze**: Read the question and identify what information is needed
2. **Search**: Look through the context for relevant information
3. **Verify**: Confirm the information directly addresses the question
4. **Respond**: Provide the answer or state if information is unavailable

## Answer Structure:
When answering, briefly show your reasoning:
- "Based on the context, which states [relevant excerpt]..."
- "The context mentions that [answer]..."
- "According to the provided information, [answer]..."

When unable to answer:
- "I searched the context for information about [topic], but it doesn't contain this information."

## Guidelines:
- Think step-by-step but keep reasoning concise
- Always ground your answer in specific context
- Make your reasoning transparent to the user"""),

    # System message 2: carries the retrieved context via {context}.
    ("system", """## Available Context:
{context}

Use the above context to answer the question. Show brief reasoning to build trust in your answer."""),

    # Human message: the user's question via {question}.
    ("human", "{question}")
])

In [107]:
# Render the prompt once with placeholder values to check the template.
# Uses `cot_template`, the prompt defined in the cell above; the previous name
# `template` is not defined anywhere in this notebook and raised NameError on a
# fresh kernel.
prompt = cot_template.format(context = "Human Nutrition", question = "What is Alcohol, How much alcohol human's should consume ?")

#### **`Parser`**

In [108]:
from langchain_core.output_parsers.string import StrOutputParser

In [109]:
# Parser used as the last step of the RAG chain.
# NOTE(review): presumably converts the chat model's message into a plain string — confirm.
output_parser = StrOutputParser()

### **`RAG Chain`**

In [110]:
from langchain_openai import ChatOpenAI


# Chat model used to generate answers ("gpt-4o-mini").
llm = ChatOpenAI(model = "gpt-4o-mini")

In [113]:
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs):
    """Join the page_content of the retrieved Documents into a single context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline: the input question is sent to the retriever and, unchanged, to
# the prompt's {question} slot. The retrieved Documents are reduced to their
# text before filling {context}; previously the raw Document list was
# stringified into the prompt, including all metadata (file paths, image
# records), which wastes tokens and adds noise.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | cot_template
    | llm
    | output_parser
)

In [115]:
# Run the full chain on a sample question and print the model's answer.
print(rag_chain.invoke("What is Protein, How much a individual should consume protein in a day ?"))

Based on the context, protein is described as an essential component of the body that plays important roles, including various physiological functions. The information mentions that understanding how to calculate the recommended protein intake is crucial for maintaining health.

The context provides a specific formula for calculating an individual's recommended daily protein intake: 

\[ \text{(Weight in lbs. ÷ 2.2 lb/kg) × 0.8 g/kg} \]

This means that to determine your daily protein needs, you divide your weight in pounds by 2.2 to convert to kilograms, then multiply that by 0.8 grams of protein per kilogram.

Additionally, it is noted that if a person is overweight, the calculation may overestimate the amount of protein they actually need.

Thus, the answer is:
1. Protein is a vital nutrient that serves numerous roles in the body.
2. An individual's recommended protein intake can be calculated using the equation mentioned above.
