In [1]:
from langchain_core.documents import Document
import fitz 
from transformers import CLIPProcessor, CLIPModel 
from PIL import Image 
import numpy as np
from langchain.chat_models import init_chat_model 
from langchain.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
from sklearn.metrics.pairwise import cosine_similarity
import base64 
import io
from langchain_text_splitters import RecursiveCharacterTextSplitter 
from langchain_community.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


### **CLIP Model loading**

In [2]:
import os 
import torch    
from dotenv import load_dotenv 
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [3]:
# Initialize the CLIP model for Unified embeddings 
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [4]:
## Embedding function 
def image_embed(image_path):
    "Embed images using CLIP"
    if isinstance(image_path, str):
        image = Image.open(image_path).convert('RGB')
    else:
        image = image_path 

    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad(): 
        features = clip_model.get_image_features(**inputs)
        # Normalizing embeddings 
        features = features / features.norm(dim = -1, keepdim=True)
        return features.squeeze().numpy()
    
def text_embed(text):
    "Embed text using CLIP"
    inputs = clip_processor(text = text, return_tensors="pt", padding = True, truncation = True, max_length = 77)
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
        # Normalizing Embeddings
        features = features / features.norm(dim = -1, keepdim=True)
        return features.squeeze().numpy()

In [5]:
### Process pdf 
pdf_path = r"C:\Users\hites\OneDrive\Desktop\Multi-modal RAG\data\BERT_Slides.pdf"
doc = fitz.open(pdf_path)

# Storage for docs and embeddings 
all_docs = []
all_embeddings = []
image_data_store = {}

## Text splitter 
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 100)

In [6]:
doc

Document('C:\Users\hites\OneDrive\Desktop\Multi-modal RAG\data\BERT_Slides.pdf')

In [7]:
for i, page in enumerate(doc):
    ## process text 
    text = page.get_text()
    if text.strip():
        # Create a temporary for splitting 
        temp_doc = Document(page_content=text, metadata={'page':i, 'type': 'text'})
        text_chunks = text_splitter.split_documents([temp_doc])
        
        # Embed each chunks using CLIP 
        for chunk in text_chunks:
            embedding = text_embed(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)
        
    ## Process Images 
    # 1. Convert PDF images to PIL Format 
    # 2. Store as base64 for GPT-4o model (which takes base64 images)
    # Create CLIP Embeddings for retrieval 
    for img_index, img in enumerate(page.get_images(full = True)):
        try:
            xref = img[0] 
            base_image = doc.extract_image(xref=xref)
            image_bytes = base_image["image"]

            # Convert to PIL Image 
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

            # Create Unique identifier 
            image_id = f"page_{i}_img_{img_index}"

            # Store image as base64 for later use with gpt-4o 
            buffered = io.BytesIO() 
            pil_image.save(buffered, format = "PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64 

            # Embed image using CLIP 
            embedding = image_embed(pil_image)
            all_embeddings.append(embedding)

            # Create document for image 
            image_doc = Document(
                page_content=f"[Image: {image_id}]", 
                metadata = {"page": i, "type": "image", "image_id": image_id}
            )
            all_docs.append(image_doc)
        except Exception as e:
            print(f"Error processing image {img_index} on page {i}: {e}")
            continue 
doc.close()

In [8]:
embedding_array = np.array(all_embeddings) 

# Creating custom FAISS index since we have precomputed embeddings 
vector_store = FAISS.from_embeddings(
    text_embeddings = [(doc.page_content, emb) for doc, emb in zip(all_docs, all_embeddings)], 
    embedding=None, # using precomputed embeddings 
    metadatas=[doc.metadata for doc in all_docs]
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [9]:
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x1f425184160>

In [10]:
llm = init_chat_model(model="gpt-4.1")

In [11]:
def retrive_multimodal(query, k = 5):
    """Retrival using CLIP embeddings for both text and images."""
    # Embed query using CLIP 
    query_embedding = text_embed(query) 

    # Search in Vector store 
    results = vector_store.similarity_search_by_vector(
        embedding = query_embedding, 
        k = k 
    )

    return results

In [12]:
def create_multimodal_message(query, retrived_docs):
    """Create a message using both text and images for GPT""" 
    content = [] 

    # Add the query 
    content.append({
        "type": "text", 
        "text": f"Question: {query}\n\nContext:\n"  
    })

    # Seperate text and image document 
    text_docs = [doc for doc in retrived_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrived_docs if doc.metadata.get("type") == "image"]
    
    # Add text context 
    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"Text excerpts:\n{text_context}\n"
        })
    
    # Add Images 
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}]:\n"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })

    # Add Instruction 
    content.append({
        'type': 'text', 
        'text': "\n\nPlease answer the question based on the provided text and images."
    })

    return HumanMessage(content = content)

In [20]:
def multimodal_rag_pipeline(query):
    """Main pipeline for multimodal RAG."""
    # Retrieve relevant documents
    context_docs = retrive_multimodal(query=query, k=1) 

    # Create multimodal message
    message = create_multimodal_message(query, context_docs) 

    # Get response from GPT-4
    response = llm.invoke([message]) 

    # Print retrieved context info
    # print(f"\nRetrieved {len(context_docs)} documents:")
    # for doc in context_docs:
    #     doc_type = doc.metadata.get("type", "unknown")
    #     page = doc.metadata.get("page", "?")
    #     if doc_type == "text":
    #         preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
    #         print(f"  - Text from page {page}: {preview}")
    #     else:
    #         print(f"  - Image from page {page}")
    # print("\n")

    return response.content

In [21]:
if __name__ == "__main__":
    queries = [
        "How to inference a language model"
    ]

    for query in queries:
        print(f"\nQuery: {query}")
        print("-" * 50)
        answer = multimodal_rag_pipeline(query)
        print(f"Answer: {answer}")
        print("=" * 70)


Query: How to inference a language model
--------------------------------------------------
Answer: Certainly! Based on the provided text and context ("How to inference a language model?", mention of Transformer Encoder, input and output sequences, and appending the last token), here is how you inference a language model:

**How to inference a language model:**

1. **Start with an Input Sequence:**  
   Provide an initial input sequence to the model (for example: `[SOS] Before my`), where `[SOS]` is the start-of-sequence token.

2. **Pass Through the Language Model:**  
   Feed this input into the language model (such as the Transformer Encoder).

3. **Generate the Next Token:**  
   The model predicts the next token in the sequence (e.g., it outputs `bed` so the sequence is now `[SOS] Before my bed`).

4. **Append the Last Token:**  
   Take the newly predicted token (`bed`) and append it to your input sequence.

5. **Repeat as Needed:**  
   Continue passing the extended input back 