### Installation

In [None]:

# Uncomment and run once to install the packages this notebook imports.
# Quote any version specifier: an unquoted `>` or `<` is interpreted by the
# shell as a redirection instead of being passed to pip.
# %pip install langchain langchain-community python_dotenv
# %pip install langchain-openai

# %pip install pandas numpy
# %pip install streamlit

# %pip install "unstructured[all-docs]<=0.16.10"
# %pip install langchain_postgres

# %pip install "redis>=4.1.0"

### Import Libraries

In [None]:
from unstructured.partition.pdf import partition_pdf
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough,RunnableLambda

from langchain_postgres.vectorstores import PGVector
from database import COLLECTION_NAME, CONNECTION_STRING
from langchain_community.storage import RedisStore
from langchain.schema.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from pathlib import Path
from IPython.display import display, HTML, Markdown
from base64 import b64decode
import os, hashlib, shutil, uuid, json, time
import torch, redis, streamlit as st
import logging
# Initialize Redis client
# Module-level client configured for localhost:6379, database 0.
# `initialize_retriever` hands it to `RedisStore`, which serves as the
# retriever's docstore for the raw text/table payloads.
client = redis.Redis(host="localhost", port=6379, db=0)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load environment variables from a .env file, if one can be found.
# NOTE(review): the recorded output of this cell is False, which presumably
# means no .env file was loaded — confirm the OpenAI credentials are supplied
# through the process environment instead.
from dotenv import load_dotenv
load_dotenv()

False

In [3]:
# PDF that `data_loading()` partitions; the path is relative to the working directory.
FILE_PATH = Path("data/hbspapers_48__1.pdf") 

### Data Loading

Partition tables and text into chunks

In [4]:

def data_loading():
    """Partition the PDF at ``FILE_PATH`` and return the resulting elements.

    Takes no arguments: the input file comes from the module-level
    ``FILE_PATH`` constant. The return value is whatever ``partition_pdf``
    produces for the options below.
    """
    partition_options = dict(
        filename=FILE_PATH,
        # Layout-aware strategy, with table structure inference requested.
        infer_table_structure=True,
        strategy="hi_res",
        # Image blocks are requested as payload on the elements.
        # NOTE(review): presumably base64 in element metadata — confirm.
        extract_image_block_types=["Image"],
        extract_image_block_to_payload=True,
        # Title-based chunking with the character limits used by this notebook.
        chunking_strategy="by_title",
        mode='elements',
        max_characters=10000,
        new_after_n_chars=5000,
        combine_text_under_n_chars=2000,
        # NOTE(review): confirm `mode` and `image_output_dir_path` are honoured
        # by partition_pdf in the pinned unstructured version.
        image_output_dir_path="data/",
    )
    return partition_pdf(**partition_options)

In [5]:
pdf_elements = data_loading()

In [6]:
pdf_elements

[<unstructured.documents.elements.CompositeElement at 0x7f59f807f2f0>,
 <unstructured.documents.elements.CompositeElement at 0x7f59f8e52bd0>,
 <unstructured.documents.elements.CompositeElement at 0x7f59f7e03980>,
 <unstructured.documents.elements.CompositeElement at 0x7f59f8c79400>,
 <unstructured.documents.elements.CompositeElement at 0x7f59f8c79310>,
 <unstructured.documents.elements.CompositeElement at 0x7f59f8c7ae10>,
 <unstructured.documents.elements.CompositeElement at 0x7f59f7f35010>,
 <unstructured.documents.elements.Table at 0x7f59f7f371d0>,
 <unstructured.documents.elements.CompositeElement at 0x7f59f7f35340>,
 <unstructured.documents.elements.Table at 0x7f59f7f37b90>,
 <unstructured.documents.elements.CompositeElement at 0x7f59f803e240>,
 <unstructured.documents.elements.Table at 0x7f59f80d8980>,
 <unstructured.documents.elements.CompositeElement at 0x7f59f80db410>,
 <unstructured.documents.elements.Table at 0x7f59f80db830>,
 <unstructured.documents.elements.CompositeElement

In [44]:
# tables[0].metadata.to_dict()

In [35]:
# Split the partitioned elements into table payloads and plain-text chunks.
# `text_as_html` is optional metadata: when it is missing, fall back to the
# table's plain text so that no None is handed to the summarizer or written
# to the docstore later on.
tables = [
    element.metadata.text_as_html or element.text
    for element in pdf_elements
    if 'Table' in str(type(element))
]
text = [
    element.text
    for element in pdf_elements
    if 'CompositeElement' in str(type(element))
]

In [8]:
tables

['<table><tr><td/><td>Beef</td><td>Veal</td><td>Lamb</td><td>Mutton</td><td>Adult Australian RDI</td></tr><tr><td>Moisture (g)</td><td>73.1</td><td>74.8</td><td>72.9</td><td>73.2</td><td/></tr><tr><td>Protein (g)</td><td>23.2</td><td>24.8</td><td>21.9</td><td>21.5</td><td>46-64</td></tr><tr><td>Fat (g)</td><td>2.8</td><td>1.5</td><td>4.7</td><td>4.0</td><td>-</td></tr><tr><td>Energy (kJ)</td><td>498</td><td>477</td><td>546</td><td>514</td><td>6.5-15.8MJ</td></tr><tr><td>Cholesterol (mg)</td><td>50</td><td>51</td><td>66</td><td>66</td><td>-</td></tr><tr><td>Thiamin (mg)</td><td>0.04</td><td>0.06</td><td>0.12</td><td>0.16</td><td>1.1-1.2</td></tr><tr><td>Riboflavin (mg)</td><td>0.18</td><td>0.20</td><td>0.23</td><td>0.25</td><td>1.1-1.6</td></tr><tr><td>Niacin (mg)</td><td>5.0</td><td>16.0</td><td>5.2</td><td>8.0</td><td>14-16</td></tr><tr><td>Vitamin B6 (mg)</td><td>0.52</td><td>0.8</td><td>0.10</td><td>0.8</td><td>1.3-1.7</td></tr><tr><td>Vitamin B12 (ug)</td><td>2.5</td><td>1.6</td><t

In [25]:

display(HTML(tables[0]))

0,1,2,3,4,5
,Beef,Veal,Lamb,Mutton,Adult Australian RDI
Moisture (g),73.1,74.8,72.9,73.2,
Protein (g),23.2,24.8,21.9,21.5,46-64
Fat (g),2.8,1.5,4.7,4.0,-
Energy (kJ),498,477,546,514,6.5-15.8MJ
Cholesterol (mg),50,51,66,66,-
Thiamin (mg),0.04,0.06,0.12,0.16,1.1-1.2
Riboflavin (mg),0.18,0.20,0.23,0.25,1.1-1.6
Niacin (mg),5.0,16.0,5.2,8.0,14-16
Vitamin B6 (mg),0.52,0.8,0.10,0.8,1.3-1.7


In [10]:
text

['University of Wollongong\n\nResearch Online\n\nFaculty of Health and Behavioural Sciences - Papers (Archive)\n\nFaculty of Science, Medicine and Health\n\nSeptember 2007\n\nNutritional composition of red meat\n\nP. G. Williams University of Wollongong, peterw@uow.edu.au\n\nFollow this and additional works at: https://ro.uow.edu.au/hbspapers\n\nPart of the Arts and Humanities Commons, Life Sciences Commons, Medicine and Health Sciences Commons, and the Social and Behavioral Sciences Commons\n\nRecommended Citation\n\nWilliams, P. G.: Nutritional composition of red meat 2007. https://ro.uow.edu.au/hbspapers/48\n\nResearch Online is the open access institutional repository for the University of Wollongong. For further information contact the UOW Library: research-pubs@uow.edu.au\n\nNutritional composition of red meat\n\nAbstract\n\nLean red meats are: • An excellent source of high biological value protein, vitamin B12, niacin, vitamin B6, iron, zinc and phosphorus • A source of long-cha

### Summarize the Data

In [26]:
# Summarize extracted text and tables using LLM
def summarize_text_and_tables(text, tables):
    """Summarize each text chunk and each table with the chat model.

    Args:
        text: List of text chunks to summarize.
        tables: List of tables to summarize (the notebook passes HTML strings).

    Returns:
        dict with two keys, ``"text"`` and ``"table"``, each holding the
        result of running the summarization chain in batch over the
        corresponding input list.
    """
    logging.info("Ready to summarize data with LLM")
    prompt_text = """You are an assistant tasked with summarizing text and tables. \
    
                    You are to give a concise summary of the table or text and do nothing else. 
                    Table or text chunk: {element} """
    model_name = "gpt-4o-mini"
    prompt = ChatPromptTemplate.from_template(prompt_text)
    model = ChatOpenAI(temperature=0.6, model=model_name)
    summarize_chain = {"element": RunnablePassthrough()} | prompt | model | StrOutputParser()

    # Both batches share the same concurrency cap.
    batch_config = {"max_concurrency": 5}
    summaries = {
        "text": summarize_chain.batch(text, batch_config),
        "table": summarize_chain.batch(tables, batch_config),
    }

    # Log completion only once the batches have actually run, and log the
    # model name rather than the whole model object's repr.
    logging.info("%s done with summarization", model_name)
    return summaries

In [27]:
data_summary = summarize_text_and_tables(text, tables)


In [28]:
text_summary = data_summary['text']

In [29]:
tables_summary = data_summary['table']

In [30]:
text_summary

['The document discusses the nutritional composition of red meat, highlighting its benefits as an excellent source of high-quality protein, vitamins (B12, niacin, B6), minerals (iron, zinc, phosphorus), and beneficial fats (omega-3). It notes that lean red meats are low in fat and sodium while containing various antioxidants and bioactive substances. The article is authored by Assoc Prof Peter Williams and was published in 2007 in the journal Nutrition & Dietetics.',
 "Lean red meats are rich in high-quality protein, vitamins (B12, B6, niacin), minerals (iron, zinc, phosphorus), and omega-3 fats, while being low in fat and sodium. They also contain various antioxidants and bioactive compounds. The term 'meat' encompasses a variety of animal sources as defined by FSANZ, excluding eggs and foetuses, and includes both muscle and offal but not bone. In Australia, 'red meat' refers specifically to cattle, sheep, and goats, while processed meats contain at least 30% meat and are preserved th

In [16]:
tables_summary

['The table presents nutritional information for beef, veal, lamb, and mutton, comparing various components such as moisture, protein, fat, energy, cholesterol, and vitamins against the recommended daily intake (RDI) for adults in Australia. Key points include:\n\n- **Moisture Content**: Ranges from 72.9g (lamb) to 74.8g (veal).\n- **Protein**: Highest in veal (24.8g) and lowest in mutton (21.5g), with an RDI of 46-64g.\n- **Fat**: Lowest in veal (1.5g) and highest in lamb (4.7g).\n- **Energy**: Lamb has the highest energy content (546 kJ), while veal has the lowest (477 kJ).\n- **Cholesterol**: Similar levels across meats, with veal and lamb at 66mg.\n- **Vitamins and Minerals**: Varying levels of B vitamins, with niacin highest in veal (16mg) and vitamin B12 highest in mutton (2.8μg). Iron content is highest in mutton (3.3mg), while calcium is lowest across all meats (4.5mg in beef).\n\nOverall, veal tends to have lower fat and higher protein, while mutton offers more iron.',
 'The t

### Initialize Retriever

In [31]:
def initialize_retriever():
    """Create the multi-vector retriever used by the rest of the notebook.

    The docstore is a ``RedisStore`` wrapping the module-level ``client``;
    the vector store is a ``PGVector`` collection configured from
    ``COLLECTION_NAME`` and ``CONNECTION_STRING`` with OpenAI embeddings.
    Both are linked through the ``"doc_id"`` key.

    Returns:
        The configured ``MultiVectorRetriever``.
    """
    doc_id_key = "doc_id"

    raw_docstore = RedisStore(client=client)
    summary_index = PGVector(
        embeddings=OpenAIEmbeddings(),
        collection_name=COLLECTION_NAME,
        connection=CONNECTION_STRING,
        use_jsonb=True,
    )

    return MultiVectorRetriever(
        vectorstore=summary_index,
        docstore=raw_docstore,
        id_key=doc_id_key,
    )

In [32]:
load_retriever = initialize_retriever()

### Add Summary to vectorstore & Raw data to RedisStore

In [33]:
# Store text, tables, and their summaries in the retriever

def store_docs_in_retriever(text, text_summary, table, table_summary, retriever):
    """Index summaries in the vector store and keep the raw items in the docstore.

    Each raw item gets a fresh UUID. Its summary is added to
    ``retriever.vectorstore`` carrying that UUID in its metadata, and the raw
    item itself is written to ``retriever.docstore`` under the same UUID.

    Args:
        text: Raw text chunks.
        text_summary: One summary per entry of ``text``.
        table: Raw tables.
        table_summary: One summary per entry of ``table``.
        retriever: Object exposing ``vectorstore.add_documents`` and
            ``docstore.mset``.

    Returns:
        The same ``retriever`` that was passed in.

    Raises:
        ValueError: If a non-empty summary list differs in length from its
            list of raw items. Both pairs are checked before anything is
            written, so a mismatch never leaves a partial write behind.
    """
    # Use the key the retriever itself looks up, defaulting to the historical one.
    id_key = getattr(retriever, "id_key", "doc_id")

    def add_documents_to_retriever(documents, summaries, retriever, id_key="doc_id"):
        """Write one batch of raw documents and their summaries; no-op without summaries."""
        if not summaries:
            return

        doc_ids = [str(uuid.uuid4()) for _ in documents]
        summary_docs = [
            Document(page_content=summary, metadata={id_key: doc_id})
            for doc_id, summary in zip(doc_ids, summaries)
        ]

        retriever.vectorstore.add_documents(summary_docs, ids=doc_ids)
        retriever.docstore.mset(list(zip(doc_ids, documents)))

    batches = (
        ("text", text, text_summary),
        ("table", table, table_summary),
    )

    # Validate every pair up front: ids are generated per raw item, so unequal
    # lengths would pair summaries with the wrong ids or orphan raw items.
    for label, documents, summaries in batches:
        if summaries and len(documents) != len(summaries):
            raise ValueError(
                f"{label}: got {len(documents)} documents but {len(summaries)} summaries"
            )

    # Add text and table summaries to the retriever
    for _, documents, summaries in batches:
        add_documents_to_retriever(documents, summaries, retriever, id_key)
    return retriever

In [75]:
retriever  = store_docs_in_retriever(text, text_summary, tables,  tables_summary, load_retriever)
    

In [21]:
for i in retriever:
    print(i)

('name', None)
('tags', None)
('metadata', None)
('vectorstore', <langchain_postgres.vectorstores.PGVector object at 0x7f59f5cd4890>)
('byte_store', None)
('docstore', <langchain_community.storage.redis.RedisStore object at 0x7f59f857ce30>)
('id_key', 'doc_id')
('search_kwargs', {})
('search_type', <SearchType.similarity: 'similarity'>)


In [76]:
query = "What is the comparison of the composition of red meat and vegetarian protein sources"
docs = retriever.invoke(query)

In [77]:
docs

[b'2) Nutrient composition of red meat\n\nRed meat contains high biological value protein and important micronutrients that are needed for good health throughout life. It also contains a range of fats, including essential omega-3 polyunsaturated fats. Recent analyses have shown that there has been a significant trend to leaner cuts of meat over the past two decades [3]. While the nutritional composition will vary somewhat according to breed, feeding regimen, season and meat cut, in general lean red meat has a low fat content, is moderate in cholesterol and rich in protein and many essential vitamins and minerals.\n\n1\n\n2.1 Nutrient composition of beef, veal, lamb and mutton\n\nTable 1 presents the typical nutrient composition of samples of fat-trimmed Australian red meat, based on recent analyses of national retail samples [4-6] and compares this to the new Australian recommended dietary intakes [7]. While there are some differences between the four meats, in general lean red meat is

In [38]:
for dt in docs:
    print(dt)

b'2) Nutrient composition of red meat\n\nRed meat contains high biological value protein and important micronutrients that are needed for good health throughout life. It also contains a range of fats, including essential omega-3 polyunsaturated fats. Recent analyses have shown that there has been a significant trend to leaner cuts of meat over the past two decades [3]. While the nutritional composition will vary somewhat according to breed, feeding regimen, season and meat cut, in general lean red meat has a low fat content, is moderate in cholesterol and rich in protein and many essential vitamins and minerals.\n\n1\n\n2.1 Nutrient composition of beef, veal, lamb and mutton\n\nTable 1 presents the typical nutrient composition of samples of fat-trimmed Australian red meat, based on recent analyses of national retail samples [4-6] and compares this to the new Australian recommended dietary intakes [7]. While there are some differences between the four meats, in general lean red meat is 

In [79]:
# Decode the byte payloads returned by the docstore into strings.
# Done inline because `parse_retriver_output` is only defined in a later cell,
# so calling it here raises NameError on a fresh Restart & Run All.
result = [doc.decode("utf-8") if isinstance(doc, bytes) else doc for doc in docs]

In [80]:
result

['2) Nutrient composition of red meat\n\nRed meat contains high biological value protein and important micronutrients that are needed for good health throughout life. It also contains a range of fats, including essential omega-3 polyunsaturated fats. Recent analyses have shown that there has been a significant trend to leaner cuts of meat over the past two decades [3]. While the nutritional composition will vary somewhat according to breed, feeding regimen, season and meat cut, in general lean red meat has a low fat content, is moderate in cholesterol and rich in protein and many essential vitamins and minerals.\n\n1\n\n2.1 Nutrient composition of beef, veal, lamb and mutton\n\nTable 1 presents the typical nutrient composition of samples of fat-trimmed Australian red meat, based on recent analyses of national retail samples [4-6] and compares this to the new Australian recommended dietary intakes [7]. While there are some differences between the four meats, in general lean red meat is 

### RAG Pipeline

#### Parse the retriever output

In [81]:
def parse_retriver_output(data):
    """Return the items of ``data`` as a list, decoding ``bytes`` items as UTF-8.

    Items that are not ``bytes`` are passed through unchanged.
    """
    return [
        item.decode("utf-8") if isinstance(item, bytes) else item
        for item in data
    ]


#### Chat with the LLM using retrieved context

In [91]:
def chat_with_llm():
    """Assemble the retrieval-augmented question-answering chain.

    The chain takes a question, fetches context through the module-level
    ``retriever`` (post-processed by ``parse_retriver_output``), and asks the
    chat model to answer from that context only.

    Returns:
        A runnable whose result is a dict with the keys ``"context"``,
        ``"question"`` and ``"response"``.
    """
    prompt_text = """
                You are an AI Assistant tasked with understanding detailed
                information from text and tables. You are to answer the question based on the 
                context provided to you. You must not go beyond the context given to you.
                
                Context:
                {context}

                Question:
                {question}
                """

    # Prompt -> model -> plain string; this becomes the "response" entry.
    answer_step = (
        ChatPromptTemplate.from_template(prompt_text)
        | ChatOpenAI(temperature=0.6, model="gpt-4o-mini")
        | StrOutputParser()
    )

    # The incoming question is used both as the retrieval query and verbatim.
    chain_inputs = {
        "context": retriever | RunnableLambda(parse_retriver_output),
        "question": RunnablePassthrough(),
    }

    # assign() keeps context and question in the output next to the answer.
    return chain_inputs | RunnablePassthrough().assign(response=answer_step)




In [92]:
rag_chain = chat_with_llm()

In [93]:
response = rag_chain.invoke("What is the nutrient composition of beef, veal, lamb and mutton")

In [94]:
response

{'context': ['<table><tr><td/><td>Beef</td><td>Veal</td><td>Lamb</td><td>Mutton</td><td>Adult Australian RDI</td></tr><tr><td>Moisture (g)</td><td>73.1</td><td>74.8</td><td>72.9</td><td>73.2</td><td/></tr><tr><td>Protein (g)</td><td>23.2</td><td>24.8</td><td>21.9</td><td>21.5</td><td>46-64</td></tr><tr><td>Fat (g)</td><td>2.8</td><td>1.5</td><td>4.7</td><td>4.0</td><td>-</td></tr><tr><td>Energy (kJ)</td><td>498</td><td>477</td><td>546</td><td>514</td><td>6.5-15.8MJ</td></tr><tr><td>Cholesterol (mg)</td><td>50</td><td>51</td><td>66</td><td>66</td><td>-</td></tr><tr><td>Thiamin (mg)</td><td>0.04</td><td>0.06</td><td>0.12</td><td>0.16</td><td>1.1-1.2</td></tr><tr><td>Riboflavin (mg)</td><td>0.18</td><td>0.20</td><td>0.23</td><td>0.25</td><td>1.1-1.6</td></tr><tr><td>Niacin (mg)</td><td>5.0</td><td>16.0</td><td>5.2</td><td>8.0</td><td>14-16</td></tr><tr><td>Vitamin B6 (mg)</td><td>0.52</td><td>0.8</td><td>0.10</td><td>0.8</td><td>1.3-1.7</td></tr><tr><td>Vitamin B12 (ug)</td><td>2.5</td><t

In [44]:
response = rag_chain.invoke("What is the nutrient composition of organ meats")

In [51]:
response

{'context': [b'Table 1. Nutrient composition (per 100g) of lean red meat [4-6]',
  b'1. Average values from 2002 analyses of Australian red meat [17]\n\n2. Values for raw lean chicken breast from NUTTAB 2006 [38]\n\n3. Values for raw lean pork fillet from NUTTAB 2006 [38]\n\n4. Values for raw flathead from NUTTAB 2006 [38]\n\n5. Values for canned red salmon from NUTTAB 2006 [38]\n\n9\n\nTable 5. Selected nutrients (per 100g) in raw liver, kidney, heart, brain and tripe 1',
  b'<table><tr><td/><td>Liver</td><td>Kidney</td><td>Heart</td><td/><td>Brain</td><td>Tripe Beef</td></tr><tr><td/><td>Beef</td><td>Lamb</td><td>Beef</td><td>Lamb</td><td>Beef</td><td>Lamb</td><td>Lamb</td></tr><tr><td>Protein (g)</td><td>20.0</td><td>21.4</td><td>18.2</td><td>17.1</td><td>18.2</td><td>17.8</td><td>12.3</td><td>13.2</td></tr><tr><td>Fat (g)</td><td>8.6</td><td>7.5</td><td>1.6</td><td>2.5</td><td>3.0</td><td>5.6</td><td>8.0</td><td>2.1</td></tr><tr><td>Saturated fat (g)</td><td>2.8</td><td>2.2</td><td

In [95]:
response = rag_chain.invoke("What is Meat?")

In [96]:
print(response['response'])

Meat is defined by the Food Standards Australia New Zealand (FSANZ) Food Standard Code as ‘the whole or part of the carcass of any buffalo, camel, cattle, deer, goat, hare, pig, poultry, rabbit or sheep, slaughtered other than in a wild state, but does not include eggs, or foetuses.’ This definition excludes kangaroo meat, which is widely available in Australia and is generally considered meat by most Australians.

The term ‘meat’ commonly refers to meat flesh (skeletal muscle along with any attached connective tissue or fat), but the FSANZ definition also includes offal (meat other than meat flesh, such as brain, heart, kidney, liver, pancreas, spleen, thymus, tongue, and tripe), while excluding bone and bone marrow.

In the context of Australia, ‘red meat’ refers specifically to meat from cattle, sheep, and goats (including beef, veal, lamb, mutton, and goat meat). It does not include meat from pigs (such as pork, bacon, and ham) or kangaroo, nor does it encompass less common game me