In [None]:
import fitz
from langchain_core.documents import Document
from transformers import CLIPModel, CLIPProcessor
from PIL import Image
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
import os
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS 

: 

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ["Groq_API_key"]=os.getenv("Groq_API_key")

clip_model=CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor=CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

clip_model.eval()

TypeError: str expected, not NoneType

In [4]:
def embed_image(image_data):
    if isinstance(image_data, str):
        image= image.open(image_data).convert("RGB")
    else:
        image= image_data
    
    inputs=clip_processor(images = image, return_tensors = "pt")
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        features= features/features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()
    
def embed_text(text):
    inputs=clip_processor(
        images = image,
        return_tensors = "pt",
        padding=True,
        transaction = True,
        max_length = 77
        )
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
        features= features/features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()

In [None]:
pdf_path=("sample.pdf")
doc = fitz.open(pdf_path)

all_docs=[]
all_embeddings=[]
image_data_stores={}

splitter= RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)


FileNotFoundError: no such file: 'sample.pdf'

In [None]:
for i,page in enumerate(doc):
    text=page.get_text()
    if text.strip():
        temp_data=Document(page_content=text, metadata={"page":i, "type":'text'})
        text_chunks=splitter.split_documents({temp_data})

        for chunk in text_chunks:
            embedding= embed_text(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)
        

        for img_index, img  in enumerate(page.get_images(full=True)):
            try:
                xref=img[0]
                base_image=doc.extract_image(xref)
                image_bytes=base_image["image"]

                pil_image= Image.open(io.BytesIO(image_bytes)).convert("RGB")
                image_id=f"page_{i}_img_{img_index}"

                buffered = io.BytesIO()
                pil_image.save(buffered, format="PNG") 
                img_base64=base64.b64encode(buffered.get_value()).decode()
                image_data_stores[image_id]= img_base64

                embedding = embed_image(pil_image)
                all_embeddings.append(embedding)


                image_doc = Document(
                    page_content = f"[Image:{image_id}]"
                    metadata = {"page": i, "type":"image", "image_id": image_id}
                )
                all_docs.append(image_doc)

            except Exception as e:
                print(f"Error processing image {img_index} on page {i}: {e}")
                continue
doc.close()

In [None]:
##all_embeddins
##all_docs

In [None]:
embeddings_array = np.array(all_embeddings)

In [None]:
vector_store =  FAISS.from_embeddings (
    text_embeddings = [(doc.page content, emb) for doc, emb in zip (all_docs, embeddings_array)],
    embedding= None,
    metadatas=[doc.metadata for doc in all_docs]
    )



In [None]:
llm = init_chat_model("Groq_API_key")
llm

In [None]:
def retrieve_multimodal(query, k=5):
    """Unified retrieval using embeddings for both text and images"""
    query_embedding = embed_text(query)
    results = vector_store.similarity_search_by_vector(
        embedding= query_embedding,
        k=k
    )

    return results

In [None]:
def create_multimodal_message(query, retrieved_docs):
    content=[]
    content.append(
        "text": "text",
        "text": f"Question:{query}\n\nContext:\n"
    )

    text_docs= [doc for doc in retrieved_docs if doc.metadata.get("type")=="text"]
    image_docs= [doc for doc in retrieved_docs if doc.metadata.get("type")=="image"]

    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text"
            text: f"Text excerpts:\n{text_context}\n"
        })
    for doc in image_docs:
        image_id=doc.matadata.get("image_id")
        if image_id and image_id in image_data_stores:
            content.append([
                "type": "text",
                "text": f"\n[Image from page:{doc.metadata['page']}]:\n"
            ])
            content.append({
                "type": "image_url",
                "image_url": f"data:image/png;base64,{image_data_stores[image_id]}"
                })

    content.append([
        "type":"text",
        "text":"\n\nPlease answer the question based on the provided text and images."
        ])

    return HumanMessage(content=content)

       

In [None]:
def multimodal_pdf_rag_pipeline(query):
    context_docs=retrieve_multimodal(query, k=5)

    message=create_multimodal_message(query,context_docs)

    response = llm.invoke([message])

    print(f"\nRetrieved {len(context_docs)} documents:")

    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("type", "?")

        if doc_type=="text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content)>100 else doc.page_content
            print(f" - Text from page {page}: {preview}")
        else:
            print(f" - Text from page {page}")

    print("\n")

    result response.content

In [None]:
if __name__ == "__main__":
    queries=[
        
    ]

for query in queries:
    print(f"\nQuery: {query}")
    print(f"-"*50)
    answer= multimodal_pdf_rag_pipeline(query)
    print(f"Answer: {answer}")
    print(f"=" * 70)