### Installation

In [None]:

# %pip install langchain langchain-community python_dotenv
# %pip install langchain-openai

# %pip install pandas numpy
# %pip install streamlit

# %pip install "unstructured[all-docs]<=0.16.10"
# %pip install langchain_postgres

# %pip install redis>=4.1.0

### Import Libraries

In [1]:
from unstructured.partition.pdf import partition_pdf
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough,RunnableLambda

from langchain_postgres.vectorstores import PGVector
from database import COLLECTION_NAME, CONNECTION_STRING
from langchain_community.storage import RedisStore
from langchain.schema.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from pathlib import Path
from IPython.display import display, HTML
from base64 import b64decode
import os, hashlib, shutil, uuid, json, time
import torch, redis, streamlit as st
import logging
# Initialize Redis client
client = redis.Redis(host="localhost", port=6379, db=0)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
FILE_PATH = Path("data/hbspapers_48__1.pdf") 

### Data Loading

Partition tables and text into chunks

In [4]:

def data_loading():

    raw_pdf_elements = partition_pdf(
        filename=FILE_PATH,
      
        infer_table_structure=True,
        strategy = "hi_res",
        
        extract_image_block_types = ["Image"],
        extract_image_block_to_payload  = True,

        chunking_strategy="by_title",     
        mode='elements',
        max_characters=10000,
        new_after_n_chars=5000,
        combine_text_under_n_chars=2000,
        image_output_dir_path="data/",
    )
    return raw_pdf_elements

In [5]:
pdf_elements = data_loading()

In [6]:
pdf_elements

[<unstructured.documents.elements.CompositeElement at 0x7fb297400320>,
 <unstructured.documents.elements.CompositeElement at 0x7fb298331c40>,
 <unstructured.documents.elements.CompositeElement at 0x7fb297941070>,
 <unstructured.documents.elements.CompositeElement at 0x7fb297940f20>,
 <unstructured.documents.elements.CompositeElement at 0x7fb2985eb5f0>,
 <unstructured.documents.elements.CompositeElement at 0x7fb29825fc80>,
 <unstructured.documents.elements.CompositeElement at 0x7fb2983d3710>,
 <unstructured.documents.elements.Table at 0x7fb2807ddf70>,
 <unstructured.documents.elements.CompositeElement at 0x7fb297e715b0>,
 <unstructured.documents.elements.Table at 0x7fb297e70470>,
 <unstructured.documents.elements.CompositeElement at 0x7fb297e71730>,
 <unstructured.documents.elements.Table at 0x7fb297e709e0>,
 <unstructured.documents.elements.CompositeElement at 0x7fb297e707d0>,
 <unstructured.documents.elements.Table at 0x7fb297e70200>,
 <unstructured.documents.elements.CompositeElement

In [44]:
# tables[0].metadata.to_dict()

In [7]:
tables = [element.metadata.text_as_html for element in pdf_elements if 'Table' in str(type(element))]
text = [element.text for element in pdf_elements if 'CompositeElement' in str(type(element))]

In [8]:
tables

['<table><tr><td/><td>Beef</td><td>Veal</td><td>Lamb</td><td>Mutton</td><td>Adult Australian RDI</td></tr><tr><td>Moisture (g)</td><td>73.1</td><td>74.8</td><td>72.9</td><td>73.2</td><td/></tr><tr><td>Protein (g)</td><td>23.2</td><td>24.8</td><td>21.9</td><td>21.5</td><td>46-64</td></tr><tr><td>Fat (g)</td><td>2.8</td><td>1.5</td><td>4.7</td><td>4.0</td><td>-</td></tr><tr><td>Energy (kJ)</td><td>498</td><td>477</td><td>546</td><td>514</td><td>6.5-15.8MJ</td></tr><tr><td>Cholesterol (mg)</td><td>50</td><td>51</td><td>66</td><td>66</td><td>-</td></tr><tr><td>Thiamin (mg)</td><td>0.04</td><td>0.06</td><td>0.12</td><td>0.16</td><td>1.1-1.2</td></tr><tr><td>Riboflavin (mg)</td><td>0.18</td><td>0.20</td><td>0.23</td><td>0.25</td><td>1.1-1.6</td></tr><tr><td>Niacin (mg)</td><td>5.0</td><td>16.0</td><td>5.2</td><td>8.0</td><td>14-16</td></tr><tr><td>Vitamin B6 (mg)</td><td>0.52</td><td>0.8</td><td>0.10</td><td>0.8</td><td>1.3-1.7</td></tr><tr><td>Vitamin B12 (ug)</td><td>2.5</td><td>1.6</td><t

In [9]:

display(HTML(tables[0]))

0,1,2,3,4,5
,Beef,Veal,Lamb,Mutton,Adult Australian RDI
Moisture (g),73.1,74.8,72.9,73.2,
Protein (g),23.2,24.8,21.9,21.5,46-64
Fat (g),2.8,1.5,4.7,4.0,-
Energy (kJ),498,477,546,514,6.5-15.8MJ
Cholesterol (mg),50,51,66,66,-
Thiamin (mg),0.04,0.06,0.12,0.16,1.1-1.2
Riboflavin (mg),0.18,0.20,0.23,0.25,1.1-1.6
Niacin (mg),5.0,16.0,5.2,8.0,14-16
Vitamin B6 (mg),0.52,0.8,0.10,0.8,1.3-1.7


In [10]:
text

['University of Wollongong\n\nResearch Online\n\nFaculty of Health and Behavioural Sciences - Papers (Archive)\n\nFaculty of Science, Medicine and Health\n\nSeptember 2007\n\nNutritional composition of red meat\n\nP. G. Williams University of Wollongong, peterw@uow.edu.au\n\nFollow this and additional works at: https://ro.uow.edu.au/hbspapers\n\nPart of the Arts and Humanities Commons, Life Sciences Commons, Medicine and Health Sciences Commons, and the Social and Behavioral Sciences Commons\n\nRecommended Citation\n\nWilliams, P. G.: Nutritional composition of red meat 2007. https://ro.uow.edu.au/hbspapers/48\n\nResearch Online is the open access institutional repository for the University of Wollongong. For further information contact the UOW Library: research-pubs@uow.edu.au\n\nNutritional composition of red meat\n\nAbstract\n\nLean red meats are: • An excellent source of high biological value protein, vitamin B12, niacin, vitamin B6, iron, zinc and phosphorus • A source of long-cha

### Summarize the Data

In [11]:
# Summarize extracted text and tables using LLM
def summarize_text_and_tables(text, tables):
    logging.info("Ready to summarize data with LLM")
    prompt_text = """You are an assistant tasked with summarizing text and tables. \
    
                    You are to give a concise summary of the table or text and do nothing else. 
                    Table or text chunk: {element} """
    prompt = ChatPromptTemplate.from_template(prompt_text)
    model = ChatOpenAI(temperature=0.6, model="gpt-4o-mini")
    summarize_chain = {"element": RunnablePassthrough()}| prompt | model | StrOutputParser()
    logging.info(f"{model} done with summarization")
    return {
        "text": summarize_chain.batch(text, {"max_concurrency": 5}),
        "table": summarize_chain.batch(tables, {"max_concurrency": 5})
    }

In [12]:
data_summary = summarize_text_and_tables(text, tables)


In [13]:
text_summary = data_summary['text']

In [14]:
tables_summary = data_summary['table']

In [15]:
text_summary

["The document discusses the nutritional composition of red meat, highlighting that lean red meats are rich in high-quality protein, vitamins (B12, niacin, B6), minerals (iron, zinc, phosphorus), and beneficial fats (long-chain omega-3s). They are generally low in fat and sodium and contain various antioxidants and bioactive substances. The article was authored by P. G. Williams and published in 2007 in the journal Nutrition & Dietetics. It is accessible through the University of Wollongong's Research Online repository.",
 'Lean red meats are rich in high-quality protein, essential vitamins (B12, niacin, B6), minerals (iron, zinc, phosphorus), and long-chain omega-3 fats. They are generally low in fat and sodium and contain various antioxidants and bioactive substances. The FSANZ defines meat broadly, including offal but excluding bone and bone marrow. In Australia, "red meat" specifically refers to meat from cattle, sheep, and goats, while processed meat contains at least 30% meat and

In [16]:
tables_summary

['The table presents nutritional information for four types of meat: Beef, Veal, Lamb, and Mutton, along with the recommended daily intake (RDI) for adults in Australia. Key nutrients measured include moisture, protein, fat, energy, cholesterol, vitamins, and minerals. \n\n- **Moisture content** ranges from 72.9g to 74.8g.\n- **Protein** is highest in Veal (24.8g) and lowest in Mutton (21.5g), with the RDI being 46-64g.\n- **Fat content** is lowest in Veal (1.5g) and highest in Lamb (4.7g).\n- **Energy** values range from 477 kJ (Veal) to 546 kJ (Lamb), with the RDI set at 6.5-15.8 MJ.\n- **Cholesterol** levels range from 50 mg (Beef) to 66 mg (Lamb and Mutton).\n- Various vitamins and minerals are also listed, with notable values for Vitamin B12 (highest in Mutton at 2.8 µg) and Iron (highest in Mutton at 3.3 mg).\n\nOverall, the table highlights the varying nutritional profiles of these meats, with specific details on their contributions to daily dietary needs.',
 'The table provides

### Initialize Retriever

In [17]:
def initialize_retriever():

    store = RedisStore(client=client)
    id_key = "doc_id"
    vectorstore = PGVector(
            embeddings=OpenAIEmbeddings(),
            collection_name=COLLECTION_NAME,
            connection=CONNECTION_STRING,
            use_jsonb=True,
            )
    retrieval_loader = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key="doc_id")
    return retrieval_loader

In [18]:
load_retriever = initialize_retriever()

### Add Summary to vectorstore & Raw data to RedisStore

In [19]:
# Store text, tables, and their summaries in the retriever

def store_docs_in_retriever(text, text_summary, table, table_summary, retriever):
    """Store text and table documents along with their summaries in the retriever."""

    def add_documents_to_retriever(documents, summaries, retriever, id_key = "doc_id"):
        """Helper function to add documents and their summaries to the retriever."""
        if not summaries:
            return None, []

        doc_ids = [str(uuid.uuid4()) for _ in documents]
        summary_docs = [
            Document(page_content=summary, metadata={id_key: doc_ids[i]})
            for i, summary in enumerate(summaries)
        ]

        retriever.vectorstore.add_documents(summary_docs, ids=doc_ids)
        retriever.docstore.mset(list(zip(doc_ids, documents)))     

# Add text, table, and image summaries to the retriever
    add_documents_to_retriever(text, text_summary, retriever)
    add_documents_to_retriever(table, table_summary, retriever)
    return retriever

In [22]:
retriever  = store_docs_in_retriever(text, text_summary, tables,  tables_summary, load_retriever)
    

In [23]:
for i in retriever:
    print(i)

('name', None)
('tags', None)
('metadata', None)
('vectorstore', <langchain_postgres.vectorstores.PGVector object at 0x7fb297409a60>)
('byte_store', None)
('docstore', <langchain_community.storage.redis.RedisStore object at 0x7fb297ddf1a0>)
('id_key', 'doc_id')
('search_kwargs', {})
('search_type', <SearchType.similarity: 'similarity'>)


In [24]:
query = "What is the comparison of the composition of red meat and vegetarian protein sources"
docs = retriever.invoke(query)

In [25]:
for dt in docs:
    print(dt)

b'1. Folate values from US data [39]; all other values from NUTTAB 2006 [38]\n\n10\n\nTable 6. Percentage of male adult recommended dietary intake (RDI) or adequate intake (AI)\n\nprovided by 100g of lean red meat and some vegetarian protein sources'
b'Creatine\n\nCreatine and its phosphorylated derivative creatine phosphate play an important role in muscle energy metabolism and under some circumstances creatine supplements can enhance muscle performance [35]. Red meat contains approximately 350mg/100g [31] and is the principal dietary source for humans. Creatine in meat is readily absorbed [36], but typical intakes are unlikely to provide the levels of creatine used for supplementation of sports performance (up to 15g/day).\n\n2.3 Nutrient composition of organ meats\n\nTable 5 provides a comparison of the nutrient content of liver, kidney, heart, brains and tripe from beef and lamb. From this table the following general statements can be made:\n\ne All organ meats (except tripe) are e

In [19]:
for dt in docs:
    print(dt)

1. Folate values from US data [39]; all other values from NUTTAB 2006 [38]

10

Table 6. Percentage of male adult recommended dietary intake (RDI) or adequate intake (AI)

provided by 100g of lean red meat and some vegetarian protein sources
2) Nutrient composition of red meat

Red meat contains high biological value protein and important micronutrients that are needed for good health throughout life. It also contains a range of fats, including essential omega-3 polyunsaturated fats. Recent analyses have shown that there has been a significant trend to leaner cuts of meat over the past two decades [3]. While the nutritional composition will vary somewhat according to breed, feeding regimen, season and meat cut, in general lean red meat has a low fat content, is moderate in cholesterol and rich in protein and many essential vitamins and minerals.

1

2.1 Nutrient composition of beef, veal, lamb and mutton

Table 1 presents the typical nutrient composition of samples of fat-trimmed Austr

### RAG Pipeline

#### Parse the retriever output into readable format

In [26]:
def parse_retriver_output(data):
    parsed_elements = []
    for element in data:
        if 'CompositeElement' in str(type(element)):
            parsed_elements.append(element.text)
        else:
            parsed_elements.append(element)
            
    return parsed_elements

#### Chat with the LLM using retrieved context

In [27]:
def chat_with_llm():


    prompt_text = """
                You are an AI Assistant tasked with understanding detailed
                information from text and tables. You are to answer the question based on the 
                context provided to you. You must not go beyond the context given to you.
                
                Context:
                {context}

                Question:
                {question}
                """

    prompt = ChatPromptTemplate.from_template(prompt_text)
    model = ChatOpenAI(temperature=0.6, model="gpt-4o-mini")

    rag_chain = {
       "context": retriever | RunnableLambda(parse_retriver_output), "question": RunnablePassthrough(),
        } | RunnablePassthrough().assign(
        response=(
        prompt 
        | model 
        | StrOutputParser()
        )
        )

    return rag_chain




In [28]:
rag_chain = chat_with_llm()

In [29]:
response = rag_chain.invoke("What is Meat based bioactive compounds")

In [30]:
response

{'context': [b'University of Wollongong\n\nResearch Online\n\nFaculty of Health and Behavioural Sciences - Papers (Archive)\n\nFaculty of Science, Medicine and Health\n\nSeptember 2007\n\nNutritional composition of red meat\n\nP. G. Williams University of Wollongong, peterw@uow.edu.au\n\nFollow this and additional works at: https://ro.uow.edu.au/hbspapers\n\nPart of the Arts and Humanities Commons, Life Sciences Commons, Medicine and Health Sciences Commons, and the Social and Behavioral Sciences Commons\n\nRecommended Citation\n\nWilliams, P. G.: Nutritional composition of red meat 2007. https://ro.uow.edu.au/hbspapers/48\n\nResearch Online is the open access institutional repository for the University of Wollongong. For further information contact the UOW Library: research-pubs@uow.edu.au\n\nNutritional composition of red meat\n\nAbstract\n\nLean red meats are: \xe2\x80\xa2 An excellent source of high biological value protein, vitamin B12, niacin, vitamin B6, iron, zinc and phosphoru

In [None]:
response = rag_chain.invoke("What is the nutrient composition of organ meats")

In [48]:
response

{'context': ['Table 1. Nutrient composition (per 100g) of lean red meat [4-6]',
  '1. Average values from 2002 analyses of Australian red meat [17]\n\n2. Values for raw lean chicken breast from NUTTAB 2006 [38]\n\n3. Values for raw lean pork fillet from NUTTAB 2006 [38]\n\n4. Values for raw flathead from NUTTAB 2006 [38]\n\n5. Values for canned red salmon from NUTTAB 2006 [38]\n\n9\n\nTable 5. Selected nutrients (per 100g) in raw liver, kidney, heart, brain and tripe 1',
  '<table><tr><td/><td>Liver</td><td>Kidney</td><td>Heart</td><td/><td>Brain</td><td>Tripe Beef</td></tr><tr><td/><td>Beef</td><td>Lamb</td><td>Beef</td><td>Lamb</td><td>Beef</td><td>Lamb</td><td>Lamb</td></tr><tr><td>Protein (g)</td><td>20.0</td><td>21.4</td><td>18.2</td><td>17.1</td><td>18.2</td><td>17.8</td><td>12.3</td><td>13.2</td></tr><tr><td>Fat (g)</td><td>8.6</td><td>7.5</td><td>1.6</td><td>2.5</td><td>3.0</td><td>5.6</td><td>8.0</td><td>2.1</td></tr><tr><td>Saturated fat (g)</td><td>2.8</td><td>2.2</td><td>0.

In [51]:
response

{'context': ['Table 1. Nutrient composition (per 100g) of lean red meat [4-6]',
  '1. Average values from 2002 analyses of Australian red meat [17]\n\n2. Values for raw lean chicken breast from NUTTAB 2006 [38]\n\n3. Values for raw lean pork fillet from NUTTAB 2006 [38]\n\n4. Values for raw flathead from NUTTAB 2006 [38]\n\n5. Values for canned red salmon from NUTTAB 2006 [38]\n\n9\n\nTable 5. Selected nutrients (per 100g) in raw liver, kidney, heart, brain and tripe 1',
  '<table><tr><td/><td>Liver</td><td>Kidney</td><td>Heart</td><td/><td>Brain</td><td>Tripe Beef</td></tr><tr><td/><td>Beef</td><td>Lamb</td><td>Beef</td><td>Lamb</td><td>Beef</td><td>Lamb</td><td>Lamb</td></tr><tr><td>Protein (g)</td><td>20.0</td><td>21.4</td><td>18.2</td><td>17.1</td><td>18.2</td><td>17.8</td><td>12.3</td><td>13.2</td></tr><tr><td>Fat (g)</td><td>8.6</td><td>7.5</td><td>1.6</td><td>2.5</td><td>3.0</td><td>5.6</td><td>8.0</td><td>2.1</td></tr><tr><td>Saturated fat (g)</td><td>2.8</td><td>2.2</td><td>0.

In [53]:
print(response['response'])

The nutrient composition of organ meats (per 100g) is as follows:

| Nutrient                     | Liver (Beef) | Kidney (Beef) | Heart (Beef) | Brain (Beef) | Tripe (Beef) | Liver (Lamb) | Kidney (Lamb) | Heart (Lamb) | Brain (Lamb) | Tripe (Lamb) |
|------------------------------|--------------|---------------|---------------|--------------|--------------|--------------|---------------|---------------|--------------|--------------|
| Protein (g)                  | 20.0         | 21.4          | 18.2          | 18.2         | 17.8         | 12.3         | 13.2          |               |              |              |
| Fat (g)                      | 8.6          | 7.5           | 1.6           | 3.0          | 5.6          | 8.0          | 2.1           |               |              |              |
| Saturated fat (g)           | 2.8          | 2.2           | 0.6           | 1.2          | 2.3          | 2.2          | 0.9           |               |              |              |
|

In [46]:
response

{'context': ['University of Wollongong\n\nResearch Online\n\nFaculty of Health and Behavioural Sciences - Papers (Archive)\n\nFaculty of Science, Medicine and Health\n\nSeptember 2007\n\nNutritional composition of red meat\n\nP. G. Williams University of Wollongong, peterw@uow.edu.au\n\nFollow this and additional works at: https://ro.uow.edu.au/hbspapers\n\nPart of the Arts and Humanities Commons, Life Sciences Commons, Medicine and Health Sciences Commons, and the Social and Behavioral Sciences Commons\n\nRecommended Citation\n\nWilliams, P. G.: Nutritional composition of red meat 2007. https://ro.uow.edu.au/hbspapers/48\n\nResearch Online is the open access institutional repository for the University of Wollongong. For further information contact the UOW Library: research-pubs@uow.edu.au\n\nNutritional composition of red meat\n\nAbstract\n\nLean red meats are: • An excellent source of high biological value protein, vitamin B12, niacin, vitamin B6, iron, zinc and phosphorus • A source