Hi, I’ve commented out these libraries since they’re already installed. You can uncomment them and run the commands if needed.

In [138]:
# !pip install fitz

In [139]:
# !pip install tools

In [140]:
# !pip install langchain

In [141]:
# !pip install langchain_community

In [142]:
import fitz # for PyMuPDF
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS


In [143]:
# Load the CLIP Model - processor and model is required
import os
from dotenv import load_dotenv
load_dotenv()

##set up the environment
# os.environ["OPENAI_API_KEY"]=os.getenv("OPEN_API_KEY")

os.environ["OPENAI_API_KEY"] = ""


## initialize the clip model for unified embeddings
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# this processor will help to convert the data whatevr required for the clip model
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

clip_model.eval()

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [144]:
## Embedding function

def embed_image(image_data):
  """Embed image using CLIP"""
  if isinstance(image_data, str): #if path
    image=Image.open(image_data).convert("RGB")
  else: #if PIL Image
    image=image_data

  inputs = clip_processor(images=image, return_tensors="pt")

  with torch.no_grad():
    features = clip_model.get_image_features(**inputs)
    # perform normalisation - Normalize Embeddings to unit vector
    features = features / features.norm(dim=-1, keepdim=True)
    return features.squeeze().numpy()

def embed_text(text):
  """Embed text using CLIP"""
  inputs = clip_processor(
      text=text,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=77  # CLIP's max token length
  )
  with torch.no_grad():
    features = clip_model.get_text_features(**inputs)
    # Normalize Embeddings
    features = features/features.norm(dim=-1, keepdim=True)
    return features.squeeze().numpy()

In [145]:
## Process the PDF
import fitz

pdf_path = "/content/test.pdf"
doc=fitz.open(pdf_path)

# Storage for all documents and embeddings
all_docs = []
all_embeddings = []
image_data_store = {}  # Stores actual image for LLM

# Text Splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)



In [146]:
doc

Document('/content/test.pdf')

### Process the Images
**Three importaant Actions:**
* Convert the PDF image to PIL format.
* Store as base64 for GPT-4V (which needz base64 images)
* Create CLIP embedding for retrieval

In [147]:
for i,page in enumerate(doc):
    ## process text
    text=page.get_text()
    if text.strip():
        ##create temporary document for splitting
        temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
        text_chunks = splitter.split_documents([temp_doc])

        #Embed each chunk using CLIP
        for chunk in text_chunks:
            embedding = embed_text(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)



    ## process images

    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            # Convert to PIL Image
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

            # Create unique identifier
            image_id = f"page_{i}_img_{img_index}"

            # Store image as base64 for later use with GPT-4V
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64

            # Embed image using CLIP
            embedding = embed_image(pil_image)
            all_embeddings.append(embedding)

            # Create document for image
            image_doc = Document(
                page_content=f"[Image: {image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )
            all_docs.append(image_doc)

        except Exception as e:
            print(f"Error processing image {img_index} on page {i}: {e}")
            continue

doc.close()

In [148]:
all_embeddings

[array([ 7.21080881e-03, -2.80058738e-02,  2.05637943e-02,  1.67821310e-02,
        -9.22840182e-03,  2.72802413e-02, -2.55076885e-02,  5.34691885e-02,
         5.42056262e-02, -5.22095757e-03, -1.83971860e-02, -2.57421727e-03,
         3.85971777e-02, -3.05170249e-02,  1.37278959e-02, -4.67458591e-02,
        -4.11158316e-02,  1.26956394e-02, -1.77991912e-02, -1.40098501e-02,
         4.03115992e-03,  1.90887526e-02, -2.15898249e-02, -3.11031882e-02,
         1.87622271e-02,  7.69760879e-03, -3.33616015e-04,  5.18313702e-03,
        -1.17035182e-02, -1.15468092e-02,  2.69735884e-02,  1.78336771e-03,
        -2.47586351e-02,  5.21757314e-03, -2.73551773e-02, -5.26737943e-02,
        -3.03071123e-02, -3.58409248e-02, -2.63961852e-02,  2.08840296e-02,
         8.00493057e-04,  2.09715427e-03, -1.74546875e-02, -2.90700030e-02,
        -3.79806326e-04,  8.91415868e-04, -1.22416916e-03, -3.12555470e-02,
        -4.25974391e-02,  3.03470660e-02, -2.58429702e-02, -2.40796227e-02,
         1.2

In [149]:
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content="FUN FACTS:\nOLIVIA\nWILSON\nFAVORITE COLORS:\n1.\n3.\n2.\n4.\nALL ABOUT ME\nHello, my name is\nI'm from: London\nZodiac Sign: Gemini\nI'm 10 years old\nI don't like vegetables:\nespecially cucumber. \nI'm a dancing diva: \nGUARANTEED SURPRISE MOVES.\nI'm obsessed with hats: A\nQUIRKY COLLECTION GALORE.\nI have three best friends and\nthey are all boys.\nMY PET\nMy pet dog is a corgi. He is\nvery funny and has a small\ntail. We have named him\nJonny. He has been with us\nfor the last 2 years.\nI am a young and"),
 Document(metadata={'page': 0, 'type': 'text'}, page_content='tail. We have named him\nJonny. He has been with us\nfor the last 2 years.\nI am a young and\nambitious woman on a\njourney to discover my\ntrue potential and make\na positive impact in the\nworld. Let me take you\nthrough the various\naspects that define me\nand make me who I am.'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0

In [150]:
# !pip install faiss-cpu

In [151]:
# Create a unified FAISS vector store with CLIP embeddings
embeddings_array = np.array(all_embeddings)

# Create custom FAISS index since we have precomputed embeddings
vector_store = FAISS.from_embeddings(
    text_embeddings = [(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)],
    embedding = None, # we are using precomputed embeddings
    metadatas=[doc.metadata for doc in all_docs]
)



In [152]:
# !pip install -U langchain-openai

In [153]:
# Initialize GPT-4 Vision model
llm = init_chat_model("openai:gpt-4.1")
llm

ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x7d589b4dfe90>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x7d589b4de8d0>, root_client=<openai.OpenAI object at 0x7d589aa53190>, root_async_client=<openai.AsyncOpenAI object at 0x7d589b4df310>, model_name='gpt-4.1', model_kwargs={}, openai_api_key=SecretStr('**********'))

In [154]:
def retrieve_multimodal(query, k=5):
  """Unified retrieval using CLIP embeddings for both text and images"""

  # Embed query using CLIP
  query_embedding = embed_text(query)

  # Search in unified vector store
  results = vector_store.similarity_search_by_vector(
      embedding=query_embedding,
      k=k
  )

  return results

In [155]:
def create_multimodal_message(query, retrieved_docs):
  """Create a message with both text an images for GPT-4V"""
  content = []

  #Add the query
  content.append({
      "type": "text",
      "text": f"Question: {query}\n\nContext:\n"
  })

  # Seperate text and image documents
  text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type")== "text"]
  image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type")== "image"]

  # Add text context
  if text_docs:
    text_context = "\n\n".join([
        f"[Page {doc.metadata['page']}]: {doc.page_content}"
        for doc in text_docs
    ])

    # Add Images
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}]:\n"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })

    # Add instruction
    content.append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images."
    })

    return HumanMessage(content=content)


In [156]:
def multimodal_pdf_rag_pipeline(query):

    """Main pipeline for multimodal RAG."""

    # Retrieve relevant documents
    context_docs = retrieve_multimodal(query, k=5)

    # Create multimodal message
    message = create_multimodal_message(query, context_docs)

    # Get response from GPT-4V
    response = llm.invoke([message])

    # Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")

    return response.content

In [157]:
if __name__ == "__main__":
    # Example queries
    queries = [
        "What does the image on page 1 is about?",
        "Summarize the main findings from the document",
        "What visual elements are present in the document?"
    ]

    for query in queries:
        print(f"\nQuery: {query}")
        print("-" * 50)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")
        print("=" * 70)


Query: What does the image on page 1 is about?
--------------------------------------------------

Retrieved 5 documents:
  - Text from page 1: Project: New campaign
Task
Asset
Status
Priority
Deadline
Owner
Notes
Upload
teaser
✅ Com… ​
🟢 High​...
  - Text from page 0: tail. We have named him
Jonny. He has been with us
for the last 2 years.
I am a young and
ambitious ...
  - Text from page 0: FUN FACTS:
OLIVIA
WILSON
FAVORITE COLORS:
1.
3.
2.
4.
ALL ABOUT ME
Hello, my name is
I'm from: Londo...
  - Image from page 0
  - Image from page 0


Answer: Based on the provided images and the question "What does the image on page 1 is about?":

The first image shows a girl with long brown hair, wearing a white top with sheer sleeves that have star patterns. She is outdoors, with blurred greenery in the background, and is looking directly at the camera with a calm expression.

**So, the image on page 1 is about a portrait of a young girl outside in a natural setting.**

Query: Summarize the mai