In [None]:
import fitz  # PyMuPDF
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

: 

In [2]:
# import groq
# from groq import Groq
# import base64
# import os
# import os
# from dotenv import load_dotenv

# # Load .env file
# load_dotenv()

# # Function to encode the image
# def encode_image(image_path):
#   with open(image_path, "rb") as image_file:
#     return base64.b64encode(image_file.read()).decode('utf-8')

# # Path to your image
# image_path = "download.jpg"

# # Getting the base64 string
# base64_image = encode_image(image_path)

# client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# chat_completion = client.chat.completions.create(
#     messages=[
#         {
#             "role": "user",
#             "content": [
#                 {"type": "text", "text": "What's in this image?"},
#                 {
#                     "type": "image_url",
#                     "image_url": {
#                         "url": f"data:image/jpeg;base64,{base64_image}",
#                     },
#                 },
#             ],
#         }
#     ],
#     model="meta-llama/llama-4-scout-17b-16e-instruct",
# )

# print(chat_completion.choices[0].message.content)

In [3]:
###Clip Model
import os
from dotenv import load_dotenv
load_dotenv()


### initialize the Clip Model for unified embeddings
clip_model=CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor=CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [4]:
### Embedding functions
def embed_image(image_data):
    """Embed image using CLIP"""
    if isinstance(image_data, str):  # If path
        image = Image.open(image_data).convert("RGB")
    else:  # If PIL Image
        image = image_data
    
    inputs=clip_processor(images=image,return_tensors="pt")
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        # Normalize embeddings to unit vector
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()
    
def embed_text(text):
    """Embed text using CLIP."""
    inputs = clip_processor(
        text=text, 
        return_tensors="pt", 
        padding=True,
        truncation=True,
        max_length=77  # CLIP's max token length
    )
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
        # Normalize embeddings
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()

In [5]:
## Process PDF
pdf_path="Failure_Modes_and_Effects_Analysis_FMEA_for_wind_t.pdf"
doc=fitz.open(pdf_path)
# Storage for all documents and embeddings
all_docs = []
all_embeddings = []
image_data_store = {}  # Store actual image data for LLM

# Text splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)




In [6]:
doc

Document('Failure_Modes_and_Effects_Analysis_FMEA_for_wind_t.pdf')

In [7]:

for i,page in enumerate(doc):
    ## process text
    text=page.get_text()
    if text.strip():
        ##create temporary document for splitting
        temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
        text_chunks = splitter.split_documents([temp_doc])

        #Embed each chunk using CLIP
        for chunk in text_chunks:
            embedding = embed_text(chunk.page_content)
            print('text embeddings: ', embedding)
            all_embeddings.append(embedding)
            all_docs.append(chunk)



    ## process images
    ##Three Important Actions:

    ##Convert PDF image to PIL format
    ##Store as base64 for GPT-4V (which needs base64 images)
    ##Create CLIP embedding for retrieval

    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            print('xref: ', xref)
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            # Convert to PIL Image (always RGB)
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

            # Skip invalid/tiny images (like masks or 1x1 objects)
            if pil_image.width < 10 or pil_image.height < 10:
                print(f"Skipping tiny image {img_index} on page {i}: {pil_image.size}")
                continue

            # Create unique identifier
            image_id = f"page_{i}_img_{img_index}"

            # Store image as base64 for multimodal model
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64

            # Embed image using CLIP (pass PIL directly)
            embedding = embed_image(pil_image)
            print('image embeddings: ', embedding)
            all_embeddings.append(embedding)

            # Create document for image
            image_doc = Document(
                page_content=f"[Image: {image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )
            all_docs.append(image_doc)

        except Exception as e:
            print(f"Error processing image {img_index} on page {i}: {e}")
            continue


doc.close()


text embeddings:  [ 2.02287734e-03  4.08941461e-03 -7.87010696e-03 -9.57226753e-03
 -6.45603240e-03  3.20307873e-02 -1.44604351e-02  1.33459613e-01
  8.02226737e-02  1.86329279e-02  3.35228704e-02 -2.16879491e-02
  3.56706977e-02  5.31563722e-02  1.08566210e-02 -3.68224992e-03
  2.59494781e-02 -1.65668037e-02 -1.48256570e-02  4.89919297e-02
  4.98729870e-02  7.48333931e-02 -2.28396375e-02  3.24540325e-02
  1.42665505e-02  7.48703163e-03  1.39096044e-02 -3.60840447e-02
 -3.62431183e-02 -3.12296879e-02  3.35371420e-02 -1.03849033e-02
 -3.92616987e-02 -1.49632720e-02 -1.05702709e-02  3.82866152e-02
 -6.67547360e-02 -1.23566268e-02 -4.85014059e-02 -4.80369255e-02
 -2.99444757e-02  1.13248993e-02  4.77224253e-02  5.93090849e-03
 -1.55146921e-03 -2.26026354e-03  5.20906933e-02 -2.83762999e-03
 -9.84028727e-03  8.94183200e-03 -2.21279934e-02  1.95728913e-02
  6.86699944e-03 -5.83674805e-03 -1.44181466e-02  1.83702800e-02
 -5.40656336e-02 -3.81354950e-02  6.87546432e-02  3.74498367e-02
  1.891

In [8]:
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/222553615\nFailure Modes and Effects Analysis (FMEA) for wind turbines\nArticle\xa0\xa0in\xa0\xa0International Journal of Electrical Power & Energy Systems · September 2010\nDOI: 10.1016/j.ijepes.2010.01.019\xa0·\xa0Source: OAI\nCITATIONS\n499\nREADS\n9,932\n3 authors, including:\nP.J. Tavner\nDurham University\n180 PUBLICATIONS\xa0\xa0\xa017,904 CITATIONS\xa0\xa0\xa0\nSEE PROFILE'),
 Document(metadata={'page': 0, 'type': 'text'}, page_content='P.J. Tavner\nDurham University\n180 PUBLICATIONS\xa0\xa0\xa017,904 CITATIONS\xa0\xa0\xa0\nSEE PROFILE\nAll content following this page was uploaded by P.J. Tavner on 01 January 2018.\nThe user has requested enhancement of the downloaded file.'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]'),
 Document(metadat

In [9]:
all_embeddings

[array([ 2.02287734e-03,  4.08941461e-03, -7.87010696e-03, -9.57226753e-03,
        -6.45603240e-03,  3.20307873e-02, -1.44604351e-02,  1.33459613e-01,
         8.02226737e-02,  1.86329279e-02,  3.35228704e-02, -2.16879491e-02,
         3.56706977e-02,  5.31563722e-02,  1.08566210e-02, -3.68224992e-03,
         2.59494781e-02, -1.65668037e-02, -1.48256570e-02,  4.89919297e-02,
         4.98729870e-02,  7.48333931e-02, -2.28396375e-02,  3.24540325e-02,
         1.42665505e-02,  7.48703163e-03,  1.39096044e-02, -3.60840447e-02,
        -3.62431183e-02, -3.12296879e-02,  3.35371420e-02, -1.03849033e-02,
        -3.92616987e-02, -1.49632720e-02, -1.05702709e-02,  3.82866152e-02,
        -6.67547360e-02, -1.23566268e-02, -4.85014059e-02, -4.80369255e-02,
        -2.99444757e-02,  1.13248993e-02,  4.77224253e-02,  5.93090849e-03,
        -1.55146921e-03, -2.26026354e-03,  5.20906933e-02, -2.83762999e-03,
        -9.84028727e-03,  8.94183200e-03, -2.21279934e-02,  1.95728913e-02,
         6.8

In [10]:
# Create unified FAISS vector store with CLIP embeddings
embeddings_array = np.array(all_embeddings)
embeddings_array

array([[ 2.0228773e-03,  4.0894146e-03, -7.8701070e-03, ...,
        -1.5621574e-02,  1.5000649e-03,  1.8694466e-02],
       [-2.4422185e-02,  7.9437916e-05,  2.4761422e-02, ...,
        -7.7859849e-02, -1.3052090e-02, -7.4465270e-04],
       [ 2.0496882e-02,  7.1867802e-03, -4.7610004e-02, ...,
         9.5357470e-02, -1.8829664e-02,  4.8908845e-02],
       ...,
       [-7.4206647e-03,  1.9646022e-02,  1.8758651e-02, ...,
         5.1781073e-02,  3.2903999e-02,  2.0148639e-02],
       [-4.7849854e-03, -3.6998503e-03,  1.6363552e-02, ...,
         4.8817322e-02,  2.8279308e-02, -2.8375957e-02],
       [ 1.6900143e-02, -5.7758968e-03, -3.8613178e-02, ...,
        -2.3534944e-02, -2.3536265e-03, -1.5202447e-02]],
      shape=(141, 512), dtype=float32)

In [11]:
(all_docs,embeddings_array)

([Document(metadata={'page': 0, 'type': 'text'}, page_content='See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/222553615\nFailure Modes and Effects Analysis (FMEA) for wind turbines\nArticle\xa0\xa0in\xa0\xa0International Journal of Electrical Power & Energy Systems · September 2010\nDOI: 10.1016/j.ijepes.2010.01.019\xa0·\xa0Source: OAI\nCITATIONS\n499\nREADS\n9,932\n3 authors, including:\nP.J. Tavner\nDurham University\n180 PUBLICATIONS\xa0\xa0\xa017,904 CITATIONS\xa0\xa0\xa0\nSEE PROFILE'),
  Document(metadata={'page': 0, 'type': 'text'}, page_content='P.J. Tavner\nDurham University\n180 PUBLICATIONS\xa0\xa0\xa017,904 CITATIONS\xa0\xa0\xa0\nSEE PROFILE\nAll content following this page was uploaded by P.J. Tavner on 01 January 2018.\nThe user has requested enhancement of the downloaded file.'),
  Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]'),
  Document(met

In [12]:
len(all_docs), len(embeddings_array)

(141, 141)

In [13]:
metadatas=[doc.metadata for doc in all_docs]
metadatas

[{'page': 0, 'type': 'text'},
 {'page': 0, 'type': 'text'},
 {'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'},
 {'page': 1, 'type': 'text'},
 {'page': 1, 'type': 'text'},
 {'page': 1, 'type': 'text'},
 {'page': 1, 'type': 'text'},
 {'page': 1, 'type': 'image', 'image_id': 'page_1_img_0'},
 {'page': 2, 'type': 'text'},
 {'page': 2, 'type': 'text'},
 {'page': 2, 'type': 'text'},
 {'page': 2, 'type': 'text'},
 {'page': 4, 'type': 'text'},
 {'page': 4, 'type': 'text'},
 {'page': 4, 'type': 'text'},
 {'page': 4, 'type': 'text'},
 {'page': 4, 'type': 'text'},
 {'page': 5, 'type': 'text'},
 {'page': 5, 'type': 'text'},
 {'page': 5, 'type': 'text'},
 {'page': 5, 'type': 'text'},
 {'page': 5, 'type': 'text'},
 {'page': 5, 'type': 'text'},
 {'page': 5, 'type': 'text'},
 {'page': 5, 'type': 'text'},
 {'page': 5, 'type': 'text'},
 {'page': 5, 'type': 'text'},
 {'page': 5, 'type': 'text'},
 {'page': 5, 'type': 'text'},
 {'page': 5, 'type': 'text'},
 {'page': 6, 'type': 'text'},
 {'page': 6,

In [14]:
text_embeddings=[(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)]
text_embeddings

[('See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/222553615\nFailure Modes and Effects Analysis (FMEA) for wind turbines\nArticle\xa0\xa0in\xa0\xa0International Journal of Electrical Power & Energy Systems · September 2010\nDOI: 10.1016/j.ijepes.2010.01.019\xa0·\xa0Source: OAI\nCITATIONS\n499\nREADS\n9,932\n3 authors, including:\nP.J. Tavner\nDurham University\n180 PUBLICATIONS\xa0\xa0\xa017,904 CITATIONS\xa0\xa0\xa0\nSEE PROFILE',
  array([ 2.02287734e-03,  4.08941461e-03, -7.87010696e-03, -9.57226753e-03,
         -6.45603240e-03,  3.20307873e-02, -1.44604351e-02,  1.33459613e-01,
          8.02226737e-02,  1.86329279e-02,  3.35228704e-02, -2.16879491e-02,
          3.56706977e-02,  5.31563722e-02,  1.08566210e-02, -3.68224992e-03,
          2.59494781e-02, -1.65668037e-02, -1.48256570e-02,  4.89919297e-02,
          4.98729870e-02,  7.48333931e-02, -2.28396375e-02,  3.24540325e-02,
          1.42665505e-02,  7.48703163e-

In [15]:


# Create custom FAISS index since we have precomputed embeddings
vector_store = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)],
    embedding=None,  # We're using precomputed embeddings
    metadatas=[doc.metadata for doc in all_docs]
)
# vector_store.save_local("faiss_index")
vector_store

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


<langchain_community.vectorstores.faiss.FAISS at 0x21638dfe090>

In [16]:
vector_store.save_local("faiss_index")


In [17]:
from langchain.vectorstores import FAISS

# Dummy embeddings class (for precomputed embeddings)
class DummyEmbeddings:
    def embed_documents(self, texts):
        return []

    def embed_query(self, text):
        return []

# Load your FAISS index safely
vector_store = FAISS.load_local(
    folder_path="faiss_index",
    embeddings=DummyEmbeddings(),
    allow_dangerous_deserialization=True
)
vector_store

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


<langchain_community.vectorstores.faiss.FAISS at 0x2161366bad0>

In [18]:

import os
import base64
from groq import Groq
from dotenv import load_dotenv
from langchain.schema import HumanMessage

# Load .env
load_dotenv()

# Initialize Groq client
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
model_name = "meta-llama/llama-4-scout-17b-16e-instruct"

# Function to encode image as base64
def encode_image(image_path):
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")
    
# Replace llm with Groq call
def groq_chat_completion(messages):
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model_name,
    )
    return chat_completion.choices[0].message.content

In [19]:
def retrieve_multimodal(query, k=5):
    """Unified retrieval using CLIP embeddings for both text and images."""
    # Embed query using CLIP
    query_embedding = embed_text(query)
    
    # Search in unified vector store
    results = vector_store.similarity_search_by_vector(
        embedding=query_embedding,
        k=k
    )
    
    return results

In [20]:
print(image_data_store)

def create_multimodal_message(query, retrieved_docs):
    """Create a message with both text and images for Groq (LLaMA-Vision)."""
    content = []

    # Add query
    content.append({
        "role": "user",
        "content": [
            {"type": "text", "text": f"Question: {query}\n\nContext:\n"}
        ]
    })

    # Separate text and images
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    print(text_docs)
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]
    print(image_docs)

    # Add text context
    if text_docs:
        text_context = "\n\n".join(
            f"[Page {doc.metadata['page']}]: {doc.page_content}" for doc in text_docs
        )
        content[0]["content"].append({"type": "text", "text": f"Text excerpts:\n{text_context}\n"})

    # Add images
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        print("image_id: ",image_id)
        if image_id and image_id in image_data_store:
            base64_image = image_data_store[image_id]
            content[0]["content"].append({"type": "text", "text": f"\n[Image from page {doc.metadata['page']}]:\n"})
            content[0]["content"].append({
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{base64_image}"}
            })

    # Add instruction
    content[0]["content"].append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images."
    })
    
    # print(content)

    return content


{'page_0_img_0': 'iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAjRklEQVR4nF16eZCs11Xf3e+39t6zvTdvX7TLWixjZAu8YocQF1kIVUAqyV8hkFQqBFcSCFFI4bL/DZg/CFSSwk4C2AqFCdhCsUwUI8uyJGt9T2/VzJs3M93Te3/L3W/qztgPwTddX/X09HTfc+8553fO73fgb3zj+lxXtJU5rxPrO5joYum1aSSxcWBZ1fvjg+7K+mgyzpqtg+G43e4AT4TxPM8cgePJnitnzNnIxCOl+LH1ZHOj1NKIOnLO1fVciM31DadMOVlgjAGAiPHe2rrA2kCnnPXGUogJjjRClTVQaahECkAbQ6TVUtYlBb4Rj9EKQNBh4AEw3gAACAKEEAKQxxhD552xUlnJsfVQe2BriQhWEOSdnvKeRGkhJGJMxaSVNhse7+zuGW84gBCncrlAGV4Mp91eOwVkUSychShO53Wdt9d396atJFlf31BVra2JktjJ0kVEKQEhZAgjaS2sQRQzSjWEFjlpTQUQgwh4zB2EmpoUOui99wBBAMMzA711hgCCKWYeYqehNchFDMXYYSG9hhhrD6M4HY8nSaM1L5bd9b6KUSVVhrBXJQSA8hQAhFO+c7C1ur7GICl295tRNC2L0cGs0WhEkAIaM4SL2aJYTKM8TVkmjAUeQW3zOGryRBpRaqWscpgAHmngDIbAgdhxTyAAAFmiCfAehP0HFoU78N4754iB1mNkHLAOIUQgSwkFoEYOKiOMsFpU9WAyX+GpcCDDzAMwn44Ao+08Vc4Tng6LKSUJaPVYq91rNFRVjm/diBHo5BkGy8X+fjdvqKUYz2Y0iiDKZ0p5wiNrGAA5Jm3CBDXI2YX3hagR5h5jjbwHUCGCEYEeQIwAdBhCZAFwwFsXLEDQAUiUUo7SynkDEcTEYkQYARJTlki19BALZQAmHiCA2VIKqiByEDsYZWkxm5OYFF4DXe

In [21]:
image_data_store.keys(), image_data_store.values()

(dict_keys(['page_0_img_0', 'page_1_img_0', 'page_8_img_0', 'page_11_img_2', 'page_11_img_3', 'page_11_img_4', 'page_11_img_5', 'page_11_img_6', 'page_11_img_7', 'page_11_img_8', 'page_11_img_9', 'page_11_img_10', 'page_11_img_11', 'page_11_img_12', 'page_11_img_13', 'page_11_img_14', 'page_11_img_15', 'page_11_img_16', 'page_11_img_17', 'page_11_img_18', 'page_11_img_19', 'page_11_img_20', 'page_11_img_21', 'page_12_img_1', 'page_12_img_2', 'page_12_img_3', 'page_12_img_4', 'page_12_img_5', 'page_12_img_6', 'page_12_img_7', 'page_12_img_8', 'page_12_img_9', 'page_12_img_10']),
 dict_values(['iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAjRklEQVR4nF16eZCs11Xf3e+39t6zvTdvX7TLWixjZAu8YocQF1kIVUAqyV8hkFQqBFcSCFFI4bL/DZg/CFSSwk4C2AqFCdhCsUwUI8uyJGt9T2/VzJs3M93Te3/L3W/qztgPwTddX/X09HTfc+8553fO73fgb3zj+lxXtJU5rxPrO5joYum1aSSxcWBZ1fvjg+7K+mgyzpqtg+G43e4AT4TxPM8cgePJnitnzNnIxCOl+LH1ZHOj1NKIOnLO1fVciM31DadMOVlgjAGAiPHe2rrA2kCnnPXGUogJjjRClTVQaahECkAbQ6TVUtYlBb4Rj9EKQNBh4AEw3gAACAKEEAKQxxhD552x

In [22]:
image_data_store.get('page_6_img_0')

In [24]:

def multimodal_pdf_rag_pipeline(query):
    """Main pipeline for multimodal RAG using Groq."""
    # Retrieve relevant documents
    context_docs = retrieve_multimodal(query, k=50)
    print('retrived documents', context_docs)

    # Create Groq messages
    messages = create_multimodal_message(query, context_docs)
    print('multimodal message',messages)

    # Get response from Groq
    response_text = groq_chat_completion(messages)

    # Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")

    return response_text

In [25]:
if __name__ == "__main__":
    # Example queries
    # queries = [
    #     "what are the failure causes?",
    #     "Summarize the main findings from the document",
    #     "What visual elements are present in the document?"
    # ]
    queries = [
    "what are the images in the document represent?",
    
    ]
    
    for query in queries:
        print(f"\nQuery: {query}")
        print("-" * 50)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")
        print("=" * 70)


Query: what are the images in the document represent?
--------------------------------------------------
retrived documents [Document(id='2ea7ec02-9c3f-4585-99c1-00d716ae7a74', metadata={'page': 11, 'type': 'text'}, page_content='because severity information cannot be concluded from failure rate data. However, detect ability is linked to failure rate because \nif a Root Cause is hard to detect, a failure is more likely to occur. \n0\n0.1\n0.2\n0.3\n0.4\n0.5\n0.6\n0.7\n0.8\n0.9\n1\n1.1\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\nWSD\nWSDN\nLWK\nRPN\nFig. 2. R80 per assembly RPN comparison with field failure rate data'), Document(id='af359244-0e4b-4e47-8b1a-adb634cda146', metadata={'page': 5, 'type': 'text'}, page_content='are the different ways in which a component may fail. It is vitally important to realize that a Failure Mode is not the cause of a \nfailure, but the way in which a failure has occurred. The effects of one failure can frequently be linked to the Root Causes of \nanother failu