# ColPali + Openparse: Multimodal RAG for Q&A

This code implements a multimodal Retrieval-Augmented Generation (RAG) system for answering questions about PDF documents. It combines the following key technologies:

*ColPali*: A pretrained multimodal document-retrieval model built on vision-language models (VLMs)  
*OpenParse*: For text and table parsing from PDFs  
*Qwen2-VL*: A multimodal language model for image analysis  
*Groq*: For inference acceleration  

*Processing Pipeline:*

1. PDF document loading and indexing  
2. Relevant passage retrieval  
3. Best page extraction and image conversion  
4. Parsing with OpenParse:  
   - Table structure recognition  
   - Cell content extraction  
   - Text layout analysis    
   - Data formatting and cleaning    
5. System prompt generation  
6. Response generation  

In [None]:
!sudo apt-get update
!apt-get install poppler-utils

from byaldi import RAGMultiModalModel
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
from pdf2image import convert_from_path
from groq import Groq
import base64
import os
import PyPDF2
import openparse

# NOTE: placeholder credential - replace it with a real Groq API key.
os.environ["GROQ_API_KEY"] = "yourGroqApiToken"

# Retriever used to index the PDF and to search it.
RAG = RAGMultiModalModel.from_pretrained("vidore/colqwen2-v1.0")

# Vision-language model that generates the answer, loaded in bfloat16 on the
# GPU with FlashAttention 2.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda",
)

# Matching processor (chat template + image/text preprocessing).
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Document to question; point this at your own PDF.
pdf_path = "/your/pdf.pdf"

# Build (or rebuild, because overwrite=True) the "multimodal_rag" index.
RAG.index(
    input_path=pdf_path,
    index_name="multimodal_rag",
    store_collection_with_index=False,
    overwrite=True,
)

# Retrieve the two best-matching pages for the question.
text_query = "your query"
results = RAG.search(text_query, k=2)

In [None]:
# Echo the search results so the notebook renders them as the cell output.
results

In [None]:
# Rasterize the PDF; `images` is indexed by zero-based page position.
images = convert_from_path(pdf_path)

# The top hit's page_num is shifted down by one to index into `images`.
image_index = results[0]["page_num"] - 1

In [None]:
from IPython.display import Image, display

# Show the retrieved page inline in the notebook.
display(images[image_index])

In [None]:
# Zero-based index of the best-matching page (its page_num minus one).
retrieved_page_index = results[0]["page_num"] - 1

# Copy that single page into its own one-page PDF.
with open(pdf_path, "rb") as file:
    reader = PyPDF2.PdfReader(file)
    page = reader.pages[retrieved_page_index]

    writer = PyPDF2.PdfWriter()
    writer.add_page(page)

    output_pdf_path = "/content/extracted_page.pdf"
    # The output is written while the source file is still open.
    with open(output_pdf_path, "wb") as output_file:
        writer.write(output_file)

print(f"La page {retrieved_page_index + 1} a été extraite dans {output_pdf_path}")

# The one-page PDF written above is the document handed to OpenParse.
doc_with_tables_path = output_pdf_path

# Parse the page, recognising tables with table-transformers and emitting
# them as markdown.
parser = openparse.DocumentParser(
    table_args={
        "parsing_algorithm": "table-transformers",
        "table_output_format": "markdown",
    },
)
parsed_doc2 = parser.parse(doc_with_tables_path)

# Handle on the same file, used to draw the parsed nodes' bounding boxes.
pdf = openparse.Pdf(doc_with_tables_path)


In [None]:
# Overlay the bounding boxes of the parsed nodes on the extracted page.
pdf.display_with_bboxes(parsed_doc2.nodes)

In [None]:
def create_system_prompt(text_query, parsed_content):
    """Build the system prompt that frames the question for the vision model.

    Args:
        text_query: The user's question; interpolated after ``Question:``.
        parsed_content: Text extracted from the page; interpolated under
            ``Extracted text content:``.

    Returns:
        The full prompt string, with both values substituted into a fixed
        template that tells the model when to favour the image and when to
        favour the parsed text.
    """
    prompt = f"""You are a document analysis assistant specialized in multimodal understanding. Your task is to answer questions accurately using the provided information.

    Question: {text_query}

    You have access to two information sources:
    1. Document image
    2. Extracted text content:
    {parsed_content}

    ANALYSIS PRIORITY:

    For CHARTS & VISUALIZATIONS:
    - Primary focus on the image input
    - Analyze curves, diagrams, and visual elements in detail
    - Use image-based reasoning for quantitative information

    For TABLES & TEXT:
    - Primary focus on the parsed text for structured data
    - Reference the image only for:
    * Layout verification
    * Table structure comprehension
    * Visual verification of unclear parsed content

    Guidelines:
    - First identify the main content type (chart/table/text)
    - Use Qwen2VL's visual understanding capabilities for complex charts
    - Maintain numerical precision from the correct source
    - Cross-reference between sources when necessary
    - If switching primary sources, briefly explain why

    Now answer the question using the most appropriate source based on content type."""
    return prompt


def get_parsed_page_content(pdf_path):
    """Parse a PDF with OpenParse and return the text of its nodes.

    Tables are parsed with the ``table-transformers`` algorithm and rendered
    as markdown. Each node that has a ``text`` attribute contributes its text
    followed by a newline; the pieces are then joined with another newline.

    Args:
        pdf_path: Path of the PDF to parse.

    Returns:
        The concatenated node text, or ``""`` if anything goes wrong. This is
        best-effort: the error is printed and never raised.
    """
    table_options = {
        "parsing_algorithm": "table-transformers",
        "table_output_format": "markdown",
    }
    try:
        document = openparse.DocumentParser(table_args=table_options).parse(pdf_path)
        chunks = [
            f"{node.text}\n" for node in document.nodes if hasattr(node, "text")
        ]
        return "\n".join(chunks)
    except Exception as e:
        print(f"Error parsing PDF: {e}")
        return ""



In [None]:
def _extract_single_page_pdf(pdf_path, page_index, output_pdf_path):
    """Write page ``page_index`` (zero-based) of ``pdf_path`` to ``output_pdf_path``."""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        writer = PyPDF2.PdfWriter()
        writer.add_page(reader.pages[page_index])

        # Written while the source is still open, as in the original code;
        # presumably PyPDF2 may still read from the source stream here -
        # TODO confirm before moving this out of the `with`.
        with open(output_pdf_path, "wb") as output_file:
            writer.write(output_file)
    return output_pdf_path


def process_query(text_query, RAG, model, processor, pdf_path):
    """Answer ``text_query`` about ``pdf_path`` using retrieval, parsing and a VLM.

    Steps: search the index for the best-matching page, render that page to an
    image, copy it to a one-page PDF and parse it with OpenParse, build the
    system prompt, then generate the answer from the image and parsed text.

    Args:
        text_query: The question to answer.
        RAG: Retriever exposing ``search(query, k=...)``; each result is
            indexed with ``["page_num"]`` (treated as a 1-based page number).
        model: Generation model exposing ``generate`` and ``device``.
        processor: Processor exposing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        pdf_path: Path of the PDF that was indexed.

    Returns:
        The decoded answer text for the single query.

    Raises:
        IndexError: If the search returns no results.

    Side effects:
        Overwrites ``/content/extracted_page.pdf`` with the retrieved page.
    """
    results = RAG.search(text_query, k=2)
    if not results:
        # Same exception type as indexing an empty list, but self-explanatory.
        raise IndexError(f"No pages were retrieved for query: {text_query!r}")

    page_num = results[0]["page_num"]
    image_index = page_num - 1

    # Rasterize only the retrieved page rather than the whole document;
    # pdf2image's first_page/last_page are 1-based, like page_num.
    page_image = convert_from_path(
        pdf_path, first_page=page_num, last_page=page_num
    )[0]

    output_pdf_path = _extract_single_page_pdf(
        pdf_path, image_index, "/content/extracted_page.pdf"
    )

    parsed_content = get_parsed_page_content(output_pdf_path)
    system_prompt = create_system_prompt(text_query, parsed_content)

    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": page_image},
                {"type": "text", "text": text_query}
            ]
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(text=[text],
                      images=image_inputs,
                      videos=video_inputs,
                      padding=True,
                      return_tensors="pt")
    # Follow the model's device instead of assuming "cuda".
    inputs = inputs.to(model.device)

    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1024,
        temperature=0.7,
        top_p=0.9,
    )

    # Drop the prompt tokens so only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generate_ids)
    ]

    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

    return output_text[0]

In [None]:
# Run the whole pipeline for the current question and print the answer.
response = process_query(
    text_query=text_query,
    RAG=RAG,
    model=model,
    processor=processor,
    pdf_path=pdf_path,
)
print(response)