# Multimodal Rag (PDF with Images)

In [1]:
import fitz # PyMuPDF
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os 
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS


In [2]:
### CLIP MODEL-Contrast Language Image PreTraining
import os
from dotenv import load_dotenv
load_dotenv()

## Setup the environment
os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")

#initialiazed the clip model for unified embeddings
clip_model=CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
clip_processor=CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [3]:
# Embedding functions
def embed_image(image_data):
    """Embed image using CLIP"""
    if isinstance(image_data,str): #if path
        image=Image.open(image_data).convert("RGB")
    else: #IF PIL IMAGE
        image=image_data
    
    clip_processor(images=image,return_tensors='pt')
    with torch.no_grad():
        features=clip_model.get_image_features(**inputs)

        #normalize Embeddings
        features=features/features.norm(dim=-1,keepdim=True)

        return features.squeeze().numpy()

def embed_text(text):
    """Embed text using CLIP."""
    inputs= clip_processor(
        text=text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77 #Clips max token length
    )

    with torch.no_grad():
        features= clip_model.get_text_features(**inputs)

        #Normalize embeddings
        features =features/features.norm(dim=-1,keepdim=True)
        return features.squeeze().numpy()

In [4]:
## Process PDF
pdf_path="M:\job hunt\Harrisburg University Documents\Sem2\Scientific Compuitng 1\Final Research Project\Project_data_analysis.pdf"

doc=fitz.open(pdf_path)

## Storage for all documents and embeddings

all_docs=[]
all_embeddings=[]
image_data_store={} #store actual data for LLM

splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [5]:
doc

Document('M:\job hunt\Harrisburg University Documents\Sem2\Scientific Compuitng 1\Final Research Project\Project_data_analysis.pdf')

In [6]:
for i, page in enumerate(doc):
    ## process text
    text = page.get_text()

    if text.strip():
        #create a temporary document for splitting
        temp_doc = Document(page_content=text, metadata={'page': i, "type": 'text'})

        # keep your original call name if that's what your splitter uses
        try:
            text_chunks = splitter.split_documents([temp_doc])  # LangChain standard
        except AttributeError:
            text_chunks = splitter.split_document([temp_doc])   # your original method name

        #Embed each chunk using CLIP
        for chunk in text_chunks:
            embedding = embed_text(chunk.page_content)
            all_embeddings.append(embedding)

            # prefer your original target; fall back if it's not defined
            try:
                all.docs.append(chunk)      # if you truly have `all.docs`
            except Exception:
                all_docs.append(chunk)      # common alternative

    ## process the images
    # Three Important Actions:

    ##convert PDF image to PIL format
    ##Store as base64 for gpt-4v (which needs base64 images)
    ##Create CLIP embedding for retrival 

    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image['image']

            #convert to PIL
            pil_image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
            
            # create unique identifier
            image_id = f'page_{i}_img_{img_index}'

            #store image as base64 for later use with GPT4-V
            buffered = io.BytesIO()
            pil_image.save(buffered, format='PNG')
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64

            #Embed image using CLIP 
            embedding = embed_image(pil_image)
            all_embeddings.append(embedding)

            # create document for image
            image_doc = Document(
                page_content=f"[Image:{image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )

            # prefer your original target; fall back if it's not defined
            try:
                all.docs.append(image_doc)
            except Exception:
                all_docs.append(image_doc)

        except Exception as e:
            print(f'Error processing image {img_index} on page {i}: {e}')
            continue

doc.close()


Error processing image 0 on page 0: name 'inputs' is not defined
Error processing image 1 on page 0: name 'inputs' is not defined
Error processing image 2 on page 0: name 'inputs' is not defined
Error processing image 3 on page 0: name 'inputs' is not defined
Error processing image 0 on page 1: name 'inputs' is not defined
Error processing image 1 on page 1: name 'inputs' is not defined
Error processing image 2 on page 1: name 'inputs' is not defined
Error processing image 3 on page 1: name 'inputs' is not defined
Error processing image 0 on page 2: name 'inputs' is not defined
Error processing image 1 on page 2: name 'inputs' is not defined
Error processing image 2 on page 2: name 'inputs' is not defined
Error processing image 3 on page 2: name 'inputs' is not defined
Error processing image 0 on page 3: name 'inputs' is not defined
Error processing image 1 on page 3: name 'inputs' is not defined
Error processing image 2 on page 3: name 'inputs' is not defined
Error processing image 3 