In [1]:
import os
import io
import fitz #PyMuPDF ke andar ye use hota hai.
import base64
import torch 
import numpy as np
from PIL import Image
from dotenv import load_dotenv
from sklearn.metrics.pairwise import cosine_similarity
from transformers import CLIPProcessor, CLIPModel
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_classic.schema.messages import HumanMessage
from langchain.chat_models import init_chat_model
from langchain_core.documents import Document


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Loading Clip Model so we reuired processor and model
load_dotenv()

## set up the environment
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")

## Initialize Clip Model

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# This model is reponsible for conversion of text and images into embeddings.

clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# CLIP processor isliye use hota hai kyunki CLIP model ko image aur text ek specific standardized format me chahiye hota hai â€” processor unhe convert karke ready-to-use banata hai.

clip_model.eval()



Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

## Embeddings of Image and Text Using CLIP

In [3]:
def embed_image(image_data):
    ''' Embbed image using clip'''
    if isinstance(image_data, str): # if path
        image = Image.open(image_data).convert("RGB")
    else: # If PIL Image
        image = image_data

    input = clip_processor(images=image, return_tensors="pt") # we need to return tensors in pytorch tensors.
    with torch.no_grad():
        features = clip_model.get_image_features(**input)
        # Normalize embeddings to unit vectors
        features = features/features.norm(dim=1, keepdim=True)
        return features.squeeze().numpy()

    
def embed_text(text):
    ''' Embed text using CLIP'''
    inputs = clip_processor(
        text=text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77 # Clip's Max token length
    )
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
        # Normalize embeddings
        features = features/features.norm(dim=1, keepdim=True)
        return features.squeeze().numpy()




In [None]:
## Process PDF
pdf_path = "multimodal_sample.pdf"
doc = fitz.open(pdf_path)

# we now create variables for Storage for all documents and embeddings
all_docs = []
all_embeddings = []
image_data_store = []

# Text Splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=500) 
# RecursiveCharacterTextSplitter large text ko intelligent, meaning-preserving chunks me todta hai taaki embeddings and RAG best perform karein.




In [7]:
doc

Document('Practical.pdf')

In [8]:
for i,page in enumerate(doc): #go inside my doc
    # Proess the text
    text = page.get_text()
    if text.strip():
        # create temporary document for splitting
        temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
        # For all the text data, keep meta data type as text only.
        text_chunks = splitter.split_documents([temp_doc])


        for chunk in text_chunks:
            embedding = embed_text(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)

## process images
    ##Three Important Actions:

    ##Convert PDF image to PIL format
    ##Store as base64 for GPT-4V (which needs base64 images)
    ##Create CLIP embedding for retrieval

    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            
            # Convert to PIL Image
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
            
            # Create unique identifier
            image_id = f"page_{i}_img_{img_index}"
            
            # Store image as base64 for later use with GPT-4V
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64
            
            # Embed image using CLIP
            embedding = embed_image(pil_image)
            all_embeddings.append(embedding)
            
            # Create document for image
            image_doc = Document(
                page_content=f"[Image: {image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )
            all_docs.append(image_doc)
            
        except Exception as e:
            print(f"Error processing image {img_index} on page {i}: {e}")
            continue

doc.close()


Error processing image 0 on page 1: list indices must be integers or slices, not str
Error processing image 0 on page 2: list indices must be integers or slices, not str
Error processing image 1 on page 2: list indices must be integers or slices, not str


In [9]:

all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='PRACTICAL NO : 4  \nNAME : KRISH PAROTHI  \nSECTION: A4 \nBATCH: B3  \nROLL NO. : 49  \nSUBJECT : COMPUTER NETWORKS \n  \nAIM : To Implement Data link Layer flow control mechanism Develop a simple data link \nlayer that performs the flow control using the sliding window protocol, and loss recovery \nusing the Go-Back-N mechanism:-   \nA small file (set of data packets) needs to be transmitted over a channel that may lose or'),
 Document(metadata={'page': 0, 'type': 'text'}, page_content='using the Go-Back-N mechanism:-   \nA small file (set of data packets) needs to be transmitted over a channel that may lose or \nreorder packets. To manage the flow and recover from lost packets, the sender and receiver \nimplement the sliding window protocol with Go-Back-N (window size = 4). If a packet \nacknowledgment is not received within a fixed timeout, the sender should retransmit all \npackets from the unacknowledged one.  \n  \nCOD

In [10]:
# Create unified FAISS vector store with CLIP embeddings
embeddings_array = np.array(all_embeddings)
embeddings_array

array([[ 0.0177418 ,  0.01462063, -0.00422945, ...,  0.0441241 ,
         0.00833613, -0.00232817],
       [ 0.02780538,  0.01399401,  0.00153549, ...,  0.00678344,
         0.01409649,  0.04111386],
       [ 0.01202888,  0.01678002, -0.03601726, ..., -0.06275425,
        -0.03378645,  0.05293867],
       ...,
       [ 0.02796599,  0.0054474 , -0.01877345, ..., -0.08481495,
        -0.01538367,  0.0349141 ],
       [-0.00024179, -0.01857516, -0.03511811, ...,  0.02345825,
        -0.03515678,  0.03541576],
       [ 0.00902395,  0.01864137, -0.0012215 , ..., -0.00430229,
         0.00424004,  0.01835299]], shape=(7, 512), dtype=float32)

In [11]:

(all_docs,embeddings_array)

([Document(metadata={'page': 0, 'type': 'text'}, page_content='PRACTICAL NO : 4  \nNAME : KRISH PAROTHI  \nSECTION: A4 \nBATCH: B3  \nROLL NO. : 49  \nSUBJECT : COMPUTER NETWORKS \n  \nAIM : To Implement Data link Layer flow control mechanism Develop a simple data link \nlayer that performs the flow control using the sliding window protocol, and loss recovery \nusing the Go-Back-N mechanism:-   \nA small file (set of data packets) needs to be transmitted over a channel that may lose or'),
  Document(metadata={'page': 0, 'type': 'text'}, page_content='using the Go-Back-N mechanism:-   \nA small file (set of data packets) needs to be transmitted over a channel that may lose or \nreorder packets. To manage the flow and recover from lost packets, the sender and receiver \nimplement the sliding window protocol with Go-Back-N (window size = 4). If a packet \nacknowledgment is not received within a fixed timeout, the sender should retransmit all \npackets from the unacknowledged one.  \n  \nC

In [12]:
# Create custom FAISS index since we have precomputed embeddings
vector_store = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)],
    embedding=None,  # We're using precomputed embeddings
    metadatas=[doc.metadata for doc in all_docs]
)
vector_store

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


<langchain_community.vectorstores.faiss.FAISS at 0x1e06ae5ae40>

In [13]:
# Initialize Groq: Llama Vision model
llm = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")
llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x000001E0FA6F01A0>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x000001E0FA6F0EC0>, model_name='meta-llama/llama-4-scout-17b-16e-instruct', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [14]:
def retrieve_multimodal(query, k=5):
    """Unified retrieval using CLIP embeddings for both text and images."""
    # Embed query using CLIP
    query_embedding = embed_text(query)
    
    # Search in unified vector store
    results = vector_store.similarity_search_by_vector(
        embedding=query_embedding,
        k=k
    )
    
    return results

In [15]:

def create_multimodal_message(query, retrieved_docs):
    """Create a message with both text and images for GPT-4V."""
    content = []
    
    # Add the query
    content.append({
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    })
    
    # Separate text and image documents
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]
    
    # Add text context
    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"Text excerpts:\n{text_context}\n"
        })
    
    # Add images
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}]:\n"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })
    
    # Add instruction
    content.append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images."
    })
    
    return HumanMessage(content=content)

In [16]:
def multimodal_pdf_rag_pipeline(query):
    """Main pipeline for multimodal RAG."""
    # Retrieve relevant documents
    context_docs = retrieve_multimodal(query, k=5)
    
    # Create multimodal message
    message = create_multimodal_message(query, context_docs)
    
    # Get response from GPT-4V
    response = llm.invoke([message])
    
    # Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")
    
    return response.content

In [17]:
if __name__ == "__main__":
    # Example queries
    queries = [
        
        "Explain first 10 pages of pdf"
    ]
    
    for query in queries:
        print(f"\nQuery: {query}")
        print("-" * 50)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")
        print("=" * 70)


Query: Explain first 10 pages of pdf
--------------------------------------------------

Retrieved 5 documents:
  - Text from page 2: OUTPUT :
  - Text from page 1: send_packets() 
 
 
CODE SCREENSHOT:
  - Text from page 0: if random.random() < LOSS_PROBABILITY:  
                print(f"Packet {i} LOST")  
               ...
  - Text from page 0: acknowledgment is not received within a fixed timeout, the sender should retransmit all 
packets fro...
  - Text from page 0: base = 0  
    while base < TOTAL_PACKETS:  
        print(f"\nWindow: Sending packets {base} to {mi...


Answer: Based on the provided text excerpts, I'll explain the first 10 pages of the PDF, which seems to be a code snippet and explanation of the Go-Back-N protocol.

**Pages 0-1:**
The code snippet is implementing the Go-Back-N protocol, a transport-layer protocol used for reliable data transfer over an unreliable network. The protocol is used to ensure that data packets are delivered in the correct order.

The co