📚 Setup and Installation

First, let's install all necessary packages.

In [None]:
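# Install required packages
# NOTE: the install commands were not preserved in this export; the package
# lists below are inferred from the imports used in the following cells.
# !pip install -q gradio gradio_pdf PyMuPDF PyPDF2 numpy sentence-transformers faiss-cpu google-generativeai

# Install LlamaIndex packages for enhanced document processing
# !pip install -q llama-index llama-index-embeddings-huggingface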
🔧 Core Imports and Configuration

We import all necessary libraries and configure the LLM and embedding models.

In [None]:
import hashlib
import json
import os
from dataclasses import dataclass
from datetime import datetime
from typing import List, Dict, Tuple, Optional

import faiss
import fitz  # PyMuPDF for efficient PDF handling
import google.generativeai as genai
import gradio as gr
import numpy as np
from gradio_pdf import PDF
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

# LlamaIndex imports for enhanced document processing
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.core.schema import TextNode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.vector_stores import MetadataFilters, MetadataFilter, FilterOperator

# Configure Gemini.
# The API key is read from the GEMINI_API_KEY environment variable so the
# secret does not have to be committed with the notebook. The literal below is
# only a fallback placeholder and must be replaced if the variable is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "***************************************")
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel("models/gemini-2.0-flash")

# Initialize embedding models
# We use 'all-MiniLM-L6-v2' for its balance of performance and speed.
# The same model is loaded twice: once through sentence-transformers (used for
# the FAISS pipeline) and once through the LlamaIndex HuggingFace wrapper.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
llama_embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

📄 Data Structures for Enhanced Document Management

These data classes define the structured format for handling complex document metadata, which is crucial for the metadata-aware RAG pipeline.

In [None]:
@dataclass
class PageInfo:
    """
    Record for one physical page of the uploaded PDF.

    Instances are created while the file is first parsed, before pages are
    grouped into logical documents.
    """
    # Page number within the uploaded file (the parser assigns 1-based values).
    page_num: int
    # Text extracted from the page.
    text: str
    # Document type assigned by the classifier; None when the page has not
    # been classified.
    doc_type: Optional[str] = None
    # Position of the page inside its logical document. Defaults to 0.
    page_in_doc: int = 0

@dataclass
class LogicalDocument:
    """
    Represents a logical document (e.g., a contract, a fee sheet)
    that may span multiple pages within a single uploaded PDF file.
    """
    # Identifier built from the file hash and the page range.
    doc_id: str
    # Document category produced by the classifier.
    doc_type: str
    # First and last page of the document in the uploaded file
    # (1-based, inclusive).
    page_start: int
    page_end: int
    # Concatenated text of all pages in the range.
    text: str
    # Optional list of chunk dictionaries. The default stays None (not an
    # empty list) so existing `is None` checks keep working; the annotation
    # now says so explicitly.
    chunks: Optional[List[Dict]] = None

@dataclass
class ChunkMetadata:
    """
    Metadata record describing a single text chunk.

    One record is produced per chunk during ingestion and carries both the
    chunk text and, optionally, its embedding vector, so that retrieved
    vectors can be mapped back to their source document and page range.
    """
    # Unique chunk identifier (document id plus chunk position).
    chunk_id: str
    # Identifier of the logical document the chunk was cut from.
    doc_id: str
    # Category of that logical document.
    doc_type: str
    # Position of the chunk within its logical document.
    chunk_index: int
    # Page range of the owning logical document, not of the chunk itself.
    page_start: int
    page_end: int
    # The chunk's text content.
    text: str
    # Embedding vector for the chunk; None until one has been computed.
    embedding: Optional[np.ndarray] = None

🧠 Document Intelligence Functions

These functions implement the core intelligence layer, responsible for classifying document types and detecting logical boundaries within a single uploaded file.

1. Document Classification (classify_document_type)

This function uses the Gemini LLM to analyze a text sample and categorize the document, which is essential for the query routing mechanism.

In [None]:
def classify_document_type(text: str, max_length: int = 1500) -> str:
    """
    Classify the document type based on its content using the Gemini LLM.
    This is a key component of the Document Intelligence layer.

    Args:
        text: Text of the document (or page) to classify.
        max_length: Maximum number of leading characters sent to the model,
            to stay clear of token limits.

    Returns:
        One of the predefined category names, or 'Other' when the text is
        blank, the model call fails, or the reply matches no known category.
    """
    valid_types = [
        'Resume', 'Contract', 'Mortgage Contract', 'Invoice', 'Pay Slip',
        'Lender Fee Sheet', 'Land Deed', 'Bank Statement', 'Tax Document',
        'Insurance', 'Report', 'Letter', 'Form', 'ID Document', 'Medical'
    ]

    # Nothing to classify: skip the API call for empty / whitespace-only text.
    if not text or not text.strip():
        return 'Other'

    # Slicing already handles text shorter than max_length.
    text_sample = text[:max_length]

    prompt = f"""
    Analyze this document and classify it into ONE of these categories:
    - Resume: CV, professional profile, work history
    - Contract: Legal agreement, terms and conditions, service agreement
    - Mortgage Contract: Home loan agreement, mortgage terms, property financing
    - Invoice: Bill, payment request, financial statement
    - Pay Slip: Salary statement, wage slip, earnings statement
    - Lender Fee Sheet: Loan fees, lender charges, closing costs
    - Land Deed: Property deed, title document, ownership certificate
    - Bank Statement: Account statement, transaction history
    - Tax Document: W2, 1099, tax return, tax form
    - Insurance: Insurance policy, coverage document
    - Report: Analysis, research document, findings
    - Letter: Correspondence, memo, communication
    - Form: Application, questionnaire, data entry form
    - ID Document: Driver's license, passport, identification
    - Medical: Medical report, prescription, health record
    - Other: Doesn't fit other categories

    Document sample:
    {text_sample}

    Respond with ONLY the category name, nothing else.
    """

    try:
        response = gemini_model.generate_content(prompt)
        raw_type = response.text.strip()
    except Exception as e:
        print(f"Error during document classification: {e}")
        return 'Other'

    # Normalize the reply before matching: models frequently vary the case or
    # wrap the answer in punctuation/markdown (e.g. "resume", "Resume.",
    # "**Resume**"), which an exact comparison would wrongly map to 'Other'.
    canonical_by_key = {name.lower(): name for name in valid_types}
    cleaned = raw_type.strip(" \t\r\n.:*\"'`-")
    lookup_key = " ".join(cleaned.lower().split())
    return canonical_by_key.get(lookup_key, 'Other')


2. Logical Document Detection (detect_logical_documents)

This function processes the raw PDF, extracts text page-by-page, and uses the classifier to identify where one logical document ends and another begins, even if they are in the same physical file.

In [None]:
def detect_logical_documents(pdf_path: str) -> List[LogicalDocument]:
    """
    Parses a multi-page PDF, extracts text, and uses the classifier to
    segment the file into a list of distinct LogicalDocument objects.
    This handles the case where a single PDF contains multiple, different documents.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        The logical documents in page order. An empty list is returned when
        the PDF cannot be parsed or contains no pages.

    Raises:
        OSError: If the file cannot be opened to compute its content hash.
    """
    # The content hash prefixes every doc_id so ids stay stable per file.
    with open(pdf_path, 'rb') as pdf_file:
        doc_hash = hashlib.sha256(pdf_file.read()).hexdigest()

    # 1. Extract text for every page and classify the first page
    pages: List[PageInfo] = []
    try:
        with open(pdf_path, 'rb') as file:
            reader = PdfReader(file)
            for i in range(len(reader.pages)):
                # Guard against extractors that yield None for pages without
                # a text layer; downstream code relies on len(page.text).
                page_text = reader.pages[i].extract_text() or ""
                # Only the first page is classified up front; later pages are
                # classified on demand by the boundary heuristic below.
                doc_type = classify_document_type(page_text) if i == 0 else None
                pages.append(PageInfo(page_num=i + 1, text=page_text, doc_type=doc_type))
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return []

    if not pages:
        return []
    num_pages = len(pages)

    # 2. Segment the pages into LogicalDocuments
    logical_documents: List[LogicalDocument] = []
    current_doc_start_page = 1
    current_doc_type = pages[0].doc_type
    current_doc_parts: List[str] = []

    for i, page in enumerate(pages):
        page_len = len(page.text)
        # Heuristic boundary check: re-classify a sufficiently long page every
        # fifth page, or when its length differs from the previous page by
        # more than a factor of two.
        if i > 0 and page_len > 100:
            prev_len = len(pages[i - 1].text)
            # Written without division so an empty previous page cannot raise
            # ZeroDivisionError; equivalent to either ratio exceeding 2.
            length_jump = max(page_len, prev_len) > 2 * min(page_len, prev_len)
            if i % 5 == 0 or length_jump:
                new_type = classify_document_type(page.text)
                if new_type != current_doc_type and new_type != 'Other':
                    # Finalize the previous document; it ends on the page
                    # before this one, whose 1-based number equals i.
                    logical_documents.append(LogicalDocument(
                        doc_id=f"{doc_hash}_{current_doc_start_page}-{i}",
                        doc_type=current_doc_type,
                        page_start=current_doc_start_page,
                        page_end=i,
                        text="\n".join(current_doc_parts).strip()
                    ))
                    # Start a new document at the current page
                    current_doc_start_page = i + 1
                    current_doc_type = new_type
                    current_doc_parts = [page.text]
                    continue

        current_doc_parts.append(page.text)

    # Finalize the last document
    logical_documents.append(LogicalDocument(
        doc_id=f"{doc_hash}_{current_doc_start_page}-{num_pages}",
        doc_type=current_doc_type,
        page_start=current_doc_start_page,
        page_end=num_pages,
        text="\n".join(current_doc_parts).strip()
    ))

    return logical_documents

🛠️ Core RAG Pipeline Functions

These functions handle the ingestion, indexing, and retrieval processes, forming the backbone of the RAG system.

1. Ingestion and Indexing (process_pdf_to_chunks and build_faiss_index)

This is the ingestion pipeline. It takes the PDF, segments it into logical documents, chunks the text, and builds a FAISS vector index for fast retrieval.

In [None]:
def process_pdf_to_chunks(pdf_path: str) -> Tuple[List[ChunkMetadata], List[Document]]:
    """
    Ingestion step: turn a PDF into embedded, metadata-tagged text chunks.

    The file is segmented into logical documents, each document's text is
    split into chunks, and every chunk is embedded.

    Args:
        pdf_path: Path to the PDF file to ingest.

    Returns:
        A pair (chunk_records, index_documents): the ChunkMetadata records
        (including embeddings) and the matching LlamaIndex Document objects,
        both in the same order.
    """
    segmented_docs = detect_logical_documents(pdf_path)

    # A single splitter instance keeps chunking consistent across documents.
    splitter = SentenceSplitter(chunk_size=512, chunk_overlap=50)

    chunk_records: List[ChunkMetadata] = []
    index_documents: List[Document] = []

    for logical_doc in segmented_docs:
        pieces = splitter.split_text(logical_doc.text)

        for position, piece in enumerate(pieces):
            # Embed the chunk with the sentence-transformers model.
            vector = embed_model.encode(piece)

            # Every chunk inherits the page range of its logical document.
            record = ChunkMetadata(
                chunk_id=f"{logical_doc.doc_id}_chunk_{position}",
                doc_id=logical_doc.doc_id,
                doc_type=logical_doc.doc_type,
                chunk_index=position,
                page_start=logical_doc.page_start,
                page_end=logical_doc.page_end,
                text=piece,
                embedding=vector,
            )
            chunk_records.append(record)

            # Mirror the chunk as a LlamaIndex Document carrying the same
            # identifying metadata (without the embedding).
            llama_metadata = {
                "doc_id": logical_doc.doc_id,
                "doc_type": logical_doc.doc_type,
                "chunk_id": record.chunk_id,
                "page_start": logical_doc.page_start,
                "page_end": logical_doc.page_end,
            }
            index_documents.append(Document(text=piece, metadata=llama_metadata))

    return chunk_records, index_documents

def build_faiss_index(chunks: List[ChunkMetadata]) -> Tuple[faiss.IndexFlatL2, List[Dict]]:
    """
    Create a flat L2 FAISS index over the chunk embeddings.

    Args:
        chunks: Chunk records whose `embedding` attribute is populated.

    Returns:
        A pair (index, metadata_list). Entry i of metadata_list describes the
        vector stored at position i of the index. For an empty input the
        result is (None, []).
    """
    if not chunks:
        return None, []

    # The vector dimensionality is taken from the first embedding.
    dimension = chunks[0].embedding.shape[0]
    flat_index = faiss.IndexFlatL2(dimension)

    # Collect vectors and metadata in one pass so their positions line up.
    vectors = []
    metadata_list = []
    for record in chunks:
        vectors.append(record.embedding)
        metadata_list.append({
            "chunk_id": record.chunk_id,
            "doc_id": record.doc_id,
            "doc_type": record.doc_type,
            "page_start": record.page_start,
            "page_end": record.page_end,
            "text": record.text
        })

    # Convert the collected vectors to a float32 matrix before adding them.
    flat_index.add(np.array(vectors).astype('float32'))

    return flat_index, metadata_list


2. Retrieval and Querying (retrieve_chunks and rag_query)

These functions handle the core RAG logic: retrieving relevant chunks and generating a final answer using the LLM.

In [None]:
def retrieve_chunks(query: str, index: faiss.IndexFlatL2, metadata_list: List[Dict], k: int = 5) -> List[Dict]:
    """
    Look up the k chunks whose embeddings are nearest to the query.

    Args:
        query: The user's question.
        index: FAISS index to search.
        metadata_list: Metadata entries, indexed by position in `index`.
        k: Number of neighbours to request.

    Returns:
        Metadata dictionaries of the matched chunks, in the order returned by
        the index search.
    """
    # Embed the query and shape it as a single-row float32 matrix.
    query_vector = embed_model.encode(query)
    query_vector = query_vector.astype('float32').reshape(1, -1)

    # The search returns (distances, indices); only the indices are used.
    _distances, neighbour_ids = index.search(query_vector, k)

    hits = []
    for position in neighbour_ids[0]:
        # -1 entries mark result slots the index could not fill.
        if position == -1:
            continue
        hits.append(metadata_list[position])

    return hits

def rag_query(
    query: str,
    index: faiss.IndexFlatL2,
    doc_types: List[str],
    metadata_list: Optional[List[Dict]] = None,
) -> str:
    """
    The main RAG function: retrieves context and generates the final answer.

    Args:
        query: The user's question.
        index: FAISS index to search; None means no document was processed.
        doc_types: Document types present in the index, mentioned in the
            prompt to orient the model.
        metadata_list: Metadata entries aligned with `index`. When omitted,
            the module-level `global_metadata` is used, which matches the
            previous behaviour.

    Returns:
        The model's answer, or a human-readable message when the index is
        missing, nothing was retrieved, or generation failed.
    """
    if index is None:
        return "Error: Document index is not built. Please process a PDF first."

    if metadata_list is None:
        metadata_list = global_metadata

    # 1. Retrieve relevant chunks
    retrieved_chunks = retrieve_chunks(query, index, metadata_list)

    if not retrieved_chunks:
        return "No relevant information found in the document(s)."

    # 2. Format the context for the LLM: one labelled section per chunk,
    #    separated by a '---' line.
    context = "\n---\n".join(
        f"Document Type: {chunk['doc_type']}\n"
        f"Page Range: {chunk['page_start']}-{chunk['page_end']}\n"
        f"Content: {chunk['text']}"
        for chunk in retrieved_chunks
    )

    # 3. Construct the prompt
    prompt = f"""
    You are an expert document analysis assistant. Your task is to answer the user's question
    based ONLY on the provided context. Do not use any external knowledge.
    
    The document(s) analyzed are of type(s): {', '.join(doc_types)}.
    
    CONTEXT:
    {context}
    
    QUESTION:
    {query}
    
    ANSWER:
    """

    # 4. Generate the answer
    try:
        response = gemini_model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # Surface the failure as text because the result is shown in the UI.
        return f"Error generating response: {e}"

⚙️ Gradio Interface Logic

These functions manage the state and logic for the Gradio web interface.

In [None]:
# In-memory application state (no persistence): these module-level variables
# hold the document types, the chunk metadata and the vector index for the
# most recently processed PDF.
global_doc_types: List[str] = []
global_metadata: List[Dict] = []
global_index: Optional[faiss.IndexFlatL2] = None

def process_and_index_pdf(pdf_file) -> Tuple[str, str, gr.Button]:
    """
    Handles the file upload, processing, and index building.

    Args:
        pdf_file: Value supplied by the upload component; None when nothing
            was uploaded.

    Returns:
        A 3-tuple (status, status, button): the status message twice (the
        click handler maps both to the status textbox) and a query button
        that is interactive only when indexing succeeded.
    """
    global global_index, global_metadata, global_doc_types

    if pdf_file is None:
        missing_msg = "Error: Please upload a PDF file."
        return missing_msg, missing_msg, gr.Button(interactive=False)

    # NOTE(review): depending on the Gradio version/configuration the upload
    # value is either a plain path string or an object exposing `.name`;
    # both are accepted here.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    try:
        # 1. Process PDF into chunks
        status_msg = f"Processing PDF: {pdf_path}..."
        all_chunks, _ = process_pdf_to_chunks(pdf_path)

        if not all_chunks:
            empty_msg = "Error: Could not extract content from PDF."
            return empty_msg, empty_msg, gr.Button(interactive=False)

        # 2. Build FAISS index
        status_msg += f"\nBuilding FAISS index with {len(all_chunks)} chunks..."
        new_index, new_metadata = build_faiss_index(all_chunks)

        # 3. Update global state (sorted so the type order is deterministic)
        global_index = new_index
        global_metadata = new_metadata
        global_doc_types = sorted({chunk.doc_type for chunk in all_chunks})

        # 4. Report success. This message was previously never assigned, so
        #    every successful run ended in a NameError and was reported as a
        #    failure with the query button left disabled.
        final_status = (
            f"{status_msg}\n"
            f"Indexing complete. Detected document types: {', '.join(global_doc_types)}"
        )
        return final_status, final_status, gr.Button(interactive=True)

    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing.
        error_msg = f"An unexpected error occurred during indexing: {e}"
        return error_msg, error_msg, gr.Button(interactive=False)

def answer_question(query: str) -> str:
    """
    Gradio callback for the question box.

    Args:
        query: Text entered by the user.

    Returns:
        The RAG answer, or a short message when no index has been built yet
        or the question is empty.
    """
    index_ready = global_index is not None

    # Happy path: an index exists and the user typed something.
    if index_ready and query:
        return rag_query(query, global_index, global_doc_types)

    # Otherwise report the first unmet precondition, index before query.
    if index_ready:
        return "Please enter a question."
    return "Error: Index not built. Please process a PDF first."

# Define the Gradio Interface
# Layout: a PDF preview on the left and, in the column on the right, the
# upload/indexing controls followed by the question/answer controls.
with gr.Blocks(title="Intelligent RAG Document Q&A System") as demo:
    gr.Markdown("# Intelligent RAG Document Q&A System")
    gr.Markdown("Upload a PDF (potentially containing multiple document types) and ask questions about its content.")
    
    with gr.Row():
        # PDF Viewer
        # NOTE(review): no event handler in this block writes to pdf_viewer,
        # so the preview is never populated from the upload — confirm intent.
        pdf_viewer = PDF(label="Uploaded Document Preview", height=600)
        
        with gr.Column():
            # Uploader and Indexing Button
            pdf_upload = gr.File(label="Upload PDF Document", file_types=[".pdf"])
            index_button = gr.Button("Process and Build Index")
            indexing_status = gr.Textbox(label="Indexing Status", lines=5)
            
            # Q&A Section
            gr.Markdown("## Ask a Question")
            question_input = gr.Textbox(label="Your Question", placeholder="e.g., What is the interest rate on the mortgage contract?")
            answer_output = gr.Textbox(label="RAG Answer", lines=10)
            
            # The Q&A button is initially disabled until the index is built
            query_button = gr.Button("Get Answer", interactive=False)

    # Event Handlers
    # process_and_index_pdf returns three values; the status textbox is listed
    # twice because the first two returned values are the same status message.
    index_button.click(
        fn=process_and_index_pdf,
        inputs=[pdf_upload],
        outputs=[indexing_status, indexing_status, query_button] # Update status and enable/disable the query button
    )
    
    # Send the typed question through the RAG pipeline and show the answer.
    query_button.click(
        fn=answer_question,
        inputs=[question_input],
        outputs=[answer_output]
    )

🚀 Launch the App

In [None]:
# The interface is launched with share=True for easy access in Colab
# NOTE(review): share=True asks Gradio for a shareable link; confirm that this
# exposure is intended when running outside Colab.
demo.launch(share=True)