In [10]:
from IPython import display
from google.colab import userdata
from unstructured.partition.pdf import partition_pdf
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema.document import Document
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain.vectorstores import FAISS
from langchain.retrievers.multi_vector import MultiVectorRetriever
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

import torch
import os
import uuid
import base64
import nltk


In [11]:
# Fetch the OpenAI API key through google.colab's `userdata.get` (looked up under
# the name 'OPENAI_API_KEY') so the key itself never appears in the notebook.
openai_api_key = userdata.get('OPENAI_API_KEY')

In [12]:
# Initial value for the directory that holds extracted images.
# NOTE(review): `output_path` is reassigned to a Google Drive path in the
# PDF-partitioning cell before anything reads it, so "./images" is never
# actually used — confirm whether this cell can be dropped.
output_path = "./images"

In [13]:
# Directory where NLTK resources are stored. Defined once so the search path,
# the NLTK_DATA environment variable and the download target cannot drift apart.
nltk_data_dir = '/content/nltk_data'

# Register the directory with NLTK only once, so re-running this cell does not
# keep appending duplicate entries to nltk.data.path.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Also expose the location through the NLTK_DATA environment variable.
os.environ['NLTK_DATA'] = nltk_data_dir

# Download the 'punkt' package into that directory.
nltk.download('punkt', download_dir=nltk_data_dir)

# Input PDF and the directory that receives the images extracted from it.
pdf_path = '/content/drive/MyDrive/Multimodal_RAG/AC-Aids.pdf'
output_path = '/content/drive/MyDrive/Multimodal_RAG/Output'  # Set your output directory for extracted images

# Fail early with a clear message if the PDF is missing. isfile() also rejects
# a directory that happens to carry this name, which exists() would accept.
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(f"The file {pdf_path} does not exist. Please check the path.")

# Create the output directory if needed; exist_ok removes the separate
# check-then-create step.
os.makedirs(output_path, exist_ok=True)

# Partition the PDF with image extraction and table-structure inference
# requested, chunked by title; extracted image blocks go to `output_path`.
raw_pdf_elements = partition_pdf(
    filename=pdf_path,
    extract_images_in_pdf=True,
    infer_table_structure=True,
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    extract_image_block_output_dir=output_path,
)


[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


yolox_l0.05.onnx:   0%|          | 0.00/217M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/115M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Accumulators filled by the summarisation loop: the original element texts
# and, in matching order, the summaries produced for them.
text_elements, table_elements = [], []
text_summaries, table_summaries = [], []

# Prompt template for the summarisation chain. `{element_type}` and `{element}`
# are filled in per element.
summary_prompt = (
    "\n"
    "Summarize the following {element_type}:\n"
    "{element}\n"
)

In [16]:
# Chat model used for summarising text and table elements.
summary_llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=openai_api_key,
    max_tokens=1024,
)

# Prompt object built from the `summary_prompt` template string.
summary_template = PromptTemplate.from_template(summary_prompt)

# Chain that formats the prompt and sends it to the chat model.
summary_chain = LLMChain(llm=summary_llm, prompt=summary_template)

  llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key = openai_api_key, max_tokens=1024),
  summary_chain = LLMChain(


In [17]:
# Start from empty lists so that re-running this cell does not append a second
# copy of every element and summary to the existing ones.
text_elements = []
text_summaries = []
table_elements = []
table_summaries = []

for element in raw_pdf_elements:
    # Classify by the element's class name rather than by searching repr(element).
    # Substring matching is kept so any class whose name contains 'Table' still
    # counts as a table.
    element_kind = type(element).__name__

    if 'CompositeElement' in element_kind:
        text_elements.append(element.text)
        # Pass the text explicitly instead of relying on the element's str() form.
        text_summaries.append(
            summary_chain.run({'element_type': 'text', 'element': element.text})
        )
    elif 'Table' in element_kind:
        table_elements.append(element.text)
        table_summaries.append(
            summary_chain.run({'element_type': 'table', 'element': element.text})
        )

  summary = summary_chain.run({'element_type' : 'text', 'element' : e})


In [19]:
# One checkpoint id shared by the processor and the captioning model, so the
# two are always loaded from the same source.
blip_checkpoint = "Salesforce/blip-image-captioning-large"

processor = BlipProcessor.from_pretrained(blip_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

In [20]:
def summarize_image(image_path):
    """Caption an image with the BLIP model and expand the caption with GPT-4.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The text content of the chat model's response.
    """
    # Open inside a context manager so the file handle is released as soon as
    # the RGB copy exists, instead of whenever the image object is collected.
    with Image.open(image_path) as source_image:
        raw_image = source_image.convert('RGB')

    # Generate a caption for the image
    inputs = processor(raw_image, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    # The chat model only receives the caption text, never the image itself.
    prompt = f"You are an expert in analyzing images related to Dog's health. Based on the following description, provide a detailed analysis:\n\nDescription: {caption}"

    response = ChatOpenAI(
        model="gpt-4",
        openai_api_key=openai_api_key,
        max_tokens=1024
    ).invoke([HumanMessage(content=prompt)])

    return response.content


In [21]:
# Paths of the extracted images and, in matching order, their summaries.
image_elements = []
image_summaries = []

# Sort the listing: os.listdir returns entries in arbitrary order, and a fixed
# order keeps the image/summary lists reproducible between runs.
for file_name in sorted(os.listdir(output_path)):
    # Compare the extension case-insensitively so files such as "FIG.JPG" are
    # not silently skipped.
    if file_name.lower().endswith(('.png', '.jpg', '.jpeg')):
        image_path = os.path.join(output_path, file_name)
        image_elements.append(image_path)
        image_summaries.append(summarize_image(image_path))




In [22]:
# Create Documents and Vectorstore
#
# Each Document embeds the *summary* (page_content) and carries the original
# content in its metadata so the original can be used at answer time.

documents = []
retrieve_contents = []

# Text chunks.
for original_text, summary in zip(text_elements, text_summaries):
    doc_id = str(uuid.uuid4())
    documents.append(
        Document(
            page_content=summary,
            metadata={'id': doc_id, 'type': 'text', 'original_content': original_text},
        )
    )
    retrieve_contents.append((doc_id, original_text))

# Tables. These were summarised earlier but never indexed, even though the
# answering code has a branch for documents of type 'table'.
for table_text, summary in zip(table_elements, table_summaries):
    doc_id = str(uuid.uuid4())
    documents.append(
        Document(
            page_content=summary,
            metadata={'id': doc_id, 'type': 'table', 'original_content': table_text},
        )
    )
    retrieve_contents.append((doc_id, table_text))

# Images. 'original_content' is the image file path.
for image_path, summary in zip(image_elements, image_summaries):
    # Generate a fresh id per image. Previously the id variable was never
    # reassigned in this loop, so every image reused the last text document's id.
    doc_id = str(uuid.uuid4())
    documents.append(
        Document(
            page_content=summary,
            metadata={'id': doc_id, 'type': 'image', 'original_content': image_path},
        )
    )
    retrieve_contents.append((doc_id, summary))

# Building an index from zero documents fails with an unhelpful error, so stop
# here with an explicit message instead.
if not documents:
    raise ValueError("No text, table or image summaries were produced; nothing to index.")

vectorstore = FAISS.from_documents(documents=documents, embedding=OpenAIEmbeddings(openai_api_key=openai_api_key))

  vectorstore = FAISS.from_documents(documents=documents, embedding=OpenAIEmbeddings(openai_api_key=openai_api_key))


In [23]:
# Persist the vector store under the local path "faiss_index" so it can be
# reloaded without re-embedding the documents.
vectorstore.save_local("faiss_index")

In [24]:
# Embedding client handed to FAISS.load_local when the saved index is reloaded.
# It is constructed the same way as the one used to build the index
# (OpenAIEmbeddings with the same API key).
embeddings = OpenAIEmbeddings(openai_api_key = openai_api_key)

In [25]:
# Reload the index that was written to "faiss_index" by this notebook.
# NOTE(security): allow_dangerous_deserialization=True opts in to loading
# serialized (pickle-based, per the LangChain docs) data from disk, which can
# execute arbitrary code. That is acceptable here only because the index was
# produced by this notebook — never point this at an index from an untrusted source.
db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

In [27]:
# Prompt for the question-answering chain. `{context}` receives the retrieved
# text/table originals and image summaries; `{question}` receives the user's
# question. The wording now refers to the retrieved material as "context"
# (it previously said "answer") and the typos in the instructions are fixed.
prompt_template = """
You are a Vet Doctor and an expert in analyzing dog's health.
Answer the question based on the following context, which can include text,
images, and tables:
{context}
Question: {question}
Don't answer if you are not sure; instead, decline and say "Sorry, I don't have much information about this"
Just return the helpful answer in as much detail as possible.
Answer:
"""

In [28]:
# Chat model that writes the final answer.
qa_llm = ChatOpenAI(
    model="gpt-4",
    openai_api_key=openai_api_key,
    max_tokens=1024,
)

# Prompt object built from the `prompt_template` string.
qa_prompt = PromptTemplate.from_template(prompt_template)

# Chain that fills the prompt with context and question and calls the model.
qa_chain = LLMChain(llm=qa_llm, prompt=qa_prompt)

In [29]:
def answer(question):
    """Answer `question` using the documents in `db` most similar to it.

    Text and table hits contribute their `original_content` to the context;
    image hits contribute their `page_content`, and their `original_content`
    is collected for the caller. Hits of any other type are ignored.

    Returns:
        A tuple ``(result, relevant_images)``: the output of `qa_chain.run`
        and the list of `original_content` values of the image hits.
    """
    hits = db.similarity_search(question)

    context_parts = []
    relevant_images = []
    for hit in hits:
        kind = hit.metadata['type']
        if kind == 'image':
            context_parts.append('[image]' + hit.page_content)
            relevant_images.append(hit.metadata['original_content'])
        elif kind == 'text' or kind == 'table':
            # Each piece is prefixed with a tag naming its type.
            context_parts.append('[' + kind + ']' + hit.metadata['original_content'])

    # Pieces are joined without any separator between them.
    context = ''.join(context_parts)
    result = qa_chain.run({'context': context, 'question': question})
    return result, relevant_images

In [30]:
# Run an example question through the retrieval + answering pipeline.
# NOTE(review): "Gingivisit" looks like a misspelling of "gingivitis" — confirm
# whether the misspelled query is intentional.
result, relevant_images = answer("what is Gingivisit?")

In [31]:
# Display the answer text returned for the example question.
result

"Gingivitis is a common oral health issue in dogs, and it refers to the inflammation of the gums. It's typically the first stage of periodontal disease. \n\nGingivitis is caused by the buildup of plaque, a sticky film of bacteria that forms on the teeth. When plaque isn't regularly removed, it can harden into tartar, which further irritates the gums, causing them to become inflamed and swollen. \n\nSymptoms of gingivitis in dogs include red, swollen gums, bad breath, and visible tartar on the teeth. In more advanced cases, the dog may show signs of discomfort while eating or may even lose its appetite. \n\nIf left untreated, gingivitis can progress to periodontitis, a more serious form of gum disease that can result in tooth loss and other health problems. Therefore, it's important to maintain good oral hygiene for your pets by regularly brushing their teeth and scheduling professional cleanings with a veterinarian.\n\nIt's also worth noting that certain breeds, older dogs, and those w

In [32]:
# Show the path of the first retrieved image. The list is empty whenever no
# image document was among the retrieved hits, so guard the index instead of
# letting the cell fail with IndexError (the cell then displays nothing).
relevant_images[0] if relevant_images else None

'/content/drive/MyDrive/Multimodal_RAG/Output/figure-1-3.jpg'