In [2]:
!pip install gradio
!pip install pymupdf
!pip install sentence-transformers
!pip install faiss-cpu

Collecting gradio
  Downloading gradio-5.32.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.2 (from gradio)
  Downloading gradio_client-1.10.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.12-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

In [None]:
import json
import os
import re
import uuid
from dataclasses import dataclass, asdict
from typing import Dict, List, Tuple, Optional

import faiss
import fitz
import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer

@dataclass
class PageContent:
    """Text of a single PDF page and where it sits in the concatenated document text."""
    page_num: int  # 1-indexed page number
    text: str  # text extracted from the page via page.get_text()
    bbox: Tuple[float, float, float, float]  # x0, y0, x1, y1 (process_pdf stores page.rect here)
    char_start: int  # offset of this page's first character in the concatenated full text
    char_end: int  # char_start + len(text), i.e. the exclusive end offset

@dataclass
class DocumentChunk:
    """One fixed-size window of document text, with its embedding and source location."""
    chunk_id: str  # built by _create_chunks as "<doc_id>_<chunk_index>"
    text: str  # the chunk's raw text slice
    page_num: int  # 1-indexed page assigned by _create_chunks from the chunk's start offset
    chunk_index: int  # position of the chunk in the document's chunk list
    embedding: Optional[List[float]] = None  # embedder output converted with .tolist()
    char_positions: Optional[Tuple[int, int]] = None  # (start, end) offsets in the full text; end is exclusive

class PDFCitationTool:
    """In-memory PDF store offering embedding search with page-level citations.

    Each processed PDF is kept in ``self.documents`` under its document id as a
    dict with the keys ``pages``, ``chunks``, ``index`` (a FAISS inner-product
    index over the chunk embeddings), ``full_text`` and ``metadata``.
    """

    # Sliding-window chunking parameters, measured in characters.
    CHUNK_SIZE = 500
    CHUNK_OVERLAP = 100

    def __init__(self):
        self.documents: Dict[str, Dict] = {}
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')

    def process_pdf(self, pdf_path: str, doc_id: str = None) -> str:
        """Extract, chunk, embed and index a PDF.

        Args:
            pdf_path: Path of the PDF file to open.
            doc_id: Identifier to store the document under; a random UUID4
                string is generated when omitted.

        Returns:
            The document id under which the result was stored.

        Raises:
            ValueError: If no text chunk could be produced (for example a
                PDF without an extractable text layer).
        """
        if doc_id is None:
            doc_id = str(uuid.uuid4())

        pages_content = []
        text_parts = []
        char_offset = 0

        doc = fitz.open(pdf_path)
        try:
            total_pages = len(doc)
            for page_num in range(total_pages):
                page = doc.load_page(page_num)
                page_text = page.get_text()

                # Record the page text together with its offsets in the full text.
                pages_content.append(PageContent(
                    page_num=page_num + 1,  # 1-indexed
                    text=page_text,
                    bbox=page.rect,
                    char_start=char_offset,
                    char_end=char_offset + len(page_text)
                ))
                text_parts.append(page_text)
                # +1 accounts for the "\n" separator appended after every page.
                char_offset += len(page_text) + 1
        finally:
            # Release the file handle even when extraction fails part-way.
            doc.close()

        full_text = "".join(part + "\n" for part in text_parts)

        chunks = self._create_chunks(full_text, pages_content, doc_id)
        if not chunks:
            # Without this guard the index construction below would fail
            # with an uninformative IndexError.
            raise ValueError(f"No extractable text found in PDF: {pdf_path}")

        # Build the FAISS index over all chunk embeddings.
        embeddings = np.array([chunk.embedding for chunk in chunks]).astype('float32')
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)

        self.documents[doc_id] = {
            'pages': [asdict(page) for page in pages_content],
            'chunks': [asdict(chunk) for chunk in chunks],
            'index': index,
            'full_text': full_text,
            'metadata': {
                # basename understands the platform's separators (both "/" and "\\" on Windows).
                'filename': os.path.basename(pdf_path),
                'total_pages': total_pages,
                'total_chars': len(full_text)
            }
        }

        return doc_id

    @staticmethod
    def _page_for_offset(offset: int, pages_content) -> int:
        """Return the page number that the character at ``offset`` is attributed to.

        Pages are expected in document order. An offset that lands on the
        "\\n" separator written after a page is attributed to the following
        page, which is also how ``_get_precise_page_info`` treats it.
        """
        for page in pages_content:
            if offset < page.char_end:
                return page.page_num
        # Past the last page's text (or no pages at all).
        return pages_content[-1].page_num if pages_content else 1

    def _create_chunks(self, full_text: str, pages_content: List["PageContent"], doc_id: str) -> List["DocumentChunk"]:
        """Split the text into overlapping windows and embed each one.

        Windows are ``CHUNK_SIZE`` characters long and start every
        ``CHUNK_SIZE - CHUNK_OVERLAP`` characters. Whitespace-only windows
        are skipped. Each chunk's ``page_num`` is the page containing the
        chunk's first character.
        """
        chunks = []
        step = self.CHUNK_SIZE - self.CHUNK_OVERLAP

        for chunk_start in range(0, len(full_text), step):
            chunk_text = full_text[chunk_start:chunk_start + self.CHUNK_SIZE]
            if not chunk_text.strip():
                continue

            chunk_end = chunk_start + len(chunk_text)
            primary_page = self._page_for_offset(chunk_start, pages_content)
            embedding = self.embedder.encode(chunk_text).tolist()

            chunks.append(DocumentChunk(
                chunk_id=f"{doc_id}_{len(chunks)}",
                text=chunk_text,
                page_num=primary_page,
                chunk_index=len(chunks),
                embedding=embedding,
                char_positions=(chunk_start, chunk_end)
            ))

        return chunks

    def query_document(self, doc_id: str, query: str, top_k: int = 3) -> Dict:
        """Search one processed document and return the best-matching chunks.

        Args:
            doc_id: Identifier returned by ``process_pdf``.
            query: Free-text query to embed and search for.
            top_k: Maximum number of results; ``None`` falls back to 3.

        Returns:
            ``{"error": "Document not found"}`` for an unknown id, otherwise a
            dict with ``query``, ``results`` and ``document_info``. Each result
            carries ``text``, ``page_number``, ``confidence`` (the FAISS
            inner-product score), ``precise_location`` and ``chunk_id``.
        """
        if doc_id not in self.documents:
            return {"error": "Document not found"}

        doc = self.documents[doc_id]
        chunks = doc['chunks']

        if top_k is None:
            top_k = 3
        # Never request more neighbours than there are chunks.
        k = min(int(top_k), len(chunks))

        results = []
        if k > 0:
            query_embedding = np.asarray(self.embedder.encode(query), dtype='float32').reshape(1, -1)
            scores, indices = doc['index'].search(query_embedding, k)

            for score, idx in zip(scores[0], indices[0]):
                # FAISS pads missing neighbours with -1; used as a Python index
                # that would silently select the last chunk.
                if idx < 0:
                    continue
                chunk = chunks[int(idx)]

                page_info = self._get_precise_page_info(
                    chunk['text'],
                    chunk['char_positions'],
                    doc['pages']
                )

                results.append({
                    'text': chunk['text'],
                    'page_number': chunk['page_num'],
                    'confidence': float(score),
                    'precise_location': page_info,
                    'chunk_id': chunk['chunk_id']
                })

        return {
            'query': query,
            'results': results,
            'document_info': doc['metadata']
        }

    def _get_precise_page_info(self, chunk_text: str, char_positions: Tuple[int, int], pages: List[Dict]) -> Dict:
        """Locate a chunk within the document's pages.

        Args:
            chunk_text: The chunk's text (not used by the computation).
            char_positions: ``(start, end)`` offsets of the chunk in the full text.
            pages: Page dicts as stored in ``self.documents[...]['pages']``.

        Returns:
            Dict with ``primary_page`` (first overlapping page, or 1 if none),
            ``spanning_pages`` (all overlapping page numbers),
            ``estimated_paragraph`` (1 + number of blank-line breaks that
            precede the chunk on the primary page) and ``char_range``.
        """
        start_char, end_char = char_positions

        # Pages whose character range overlaps the chunk's range.
        spanning_pages = [
            page['page_num']
            for page in pages
            if start_char < page['char_end'] and end_char > page['char_start']
        ]
        primary_page = spanning_pages[0] if spanning_pages else 1

        paragraph_num = 1
        primary = next((page for page in pages if page['page_num'] == primary_page), None)
        if primary is not None and primary['text']:
            # Rough estimate: count "\n\n" breaks before the chunk's start on the page.
            offset_in_page = max(0, start_char - primary['char_start'])
            paragraph_num = primary['text'][:offset_in_page].count('\n\n') + 1

        return {
            'primary_page': primary_page,
            'spanning_pages': spanning_pages,
            'estimated_paragraph': paragraph_num,
            'char_range': char_positions
        }

# Module-level tool instance used by both mcp_tool_handler and gradio_interface.
# Constructing it instantiates the 'all-MiniLM-L6-v2' SentenceTransformer.
pdf_tool = PDFCitationTool()

def mcp_tool_handler(tool_name: str, arguments: Dict) -> Dict:
    """Dispatch an MCP tool call to the shared ``pdf_tool`` instance.

    Args:
        tool_name: One of ``"process_pdf"``, ``"query_pdf"`` or
            ``"list_documents"``.
        arguments: Tool arguments; ``None`` is treated as an empty dict.

    Returns:
        The tool's result dict, or ``{"error": <message>}`` when the tool is
        unknown, a required argument is missing, or the tool raises.
    """
    args = arguments or {}

    # Required argument names per tool; doubles as the set of known tools.
    required_by_tool = {
        "process_pdf": ("pdf_path",),
        "query_pdf": ("document_id", "query"),
        "list_documents": (),
    }

    if tool_name not in required_by_tool:
        return {"error": f"Unknown tool: {tool_name}"}

    # Report missing arguments by name instead of leaking a bare KeyError repr.
    missing = [key for key in required_by_tool[tool_name] if key not in args]
    if missing:
        return {"error": f"Missing required argument(s): {', '.join(missing)}"}

    try:
        if tool_name == "process_pdf":
            doc_id = pdf_tool.process_pdf(args['pdf_path'])
            return {
                "success": True,
                "document_id": doc_id,
                "message": f"PDF processed successfully. Document ID: {doc_id}"
            }

        if tool_name == "query_pdf":
            return pdf_tool.query_document(
                args['document_id'],
                args['query'],
                args.get('top_k', 3)
            )

        # Only "list_documents" remains.
        return {
            "documents": [
                {
                    "id": doc_id,
                    "metadata": doc_data['metadata']
                }
                for doc_id, doc_data in pdf_tool.documents.items()
            ]
        }

    except Exception as e:
        # Tool failures are returned to the caller as data rather than raised.
        return {"error": str(e)}

def gradio_interface(pdf_file, query_text):
    """Gradio callback: index the uploaded PDF and, if a query is given, search it.

    Args:
        pdf_file: The upload from ``gr.File``; either a filesystem path string
            or an object exposing the path as ``.name``. ``None`` means no
            file was uploaded.
        query_text: The user's query; ``None`` or blank means "process only".

    Returns:
        A human-readable string: a prompt, a status message, formatted search
        results, or an error message.
    """
    if pdf_file is None:
        return "Please upload a PDF file first."

    try:
        # gr.File may deliver a plain path string instead of a file-like
        # wrapper, depending on its ``type`` setting; accept both.
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

        doc_id = pdf_tool.process_pdf(pdf_path)

        # Guard against a None query before stripping.
        if not (query_text or "").strip():
            return f"PDF processed successfully! Document ID: {doc_id}\nNow enter a query to search the document."

        results = pdf_tool.query_document(doc_id, query_text)

        if "error" in results:
            return f"Error: {results['error']}"

        # Assemble the report piece by piece and join once at the end.
        parts = [
            f"Query: {results['query']}\n\n",
            f"Document: {results['document_info']['filename']}\n",
            f"Total Pages: {results['document_info']['total_pages']}\n\n",
            "Results with Precise Citations:\n",
            "=" * 50 + "\n\n",
        ]

        for i, result in enumerate(results['results'], 1):
            location = result['precise_location']
            text = result['text']

            parts.append(f"Result {i} (Confidence: {result['confidence']:.3f}):\n")
            parts.append(f"📍 **Citation: Page {location['primary_page']}")

            # The paragraph is only mentioned when it is not the first one.
            if location['estimated_paragraph'] > 1:
                parts.append(f", Paragraph {location['estimated_paragraph']}")

            if len(location['spanning_pages']) > 1:
                parts.append(f" (spans pages {'-'.join(map(str, location['spanning_pages']))})")

            parts.append("**\n\n")
            parts.append(f"Text: {text[:300]}{'...' if len(text) > 300 else ''}\n\n")
            parts.append("-" * 30 + "\n\n")

        return "".join(parts)

    except Exception as e:
        return f"Error processing PDF: {str(e)}"

# MCP Server Configuration
# Descriptors for the tools handled by mcp_tool_handler: each entry gives the
# tool name, a description, and a JSON-Schema-style "input_schema" for its
# arguments. The list is printed as JSON when the script runs with --mcp.
MCP_TOOLS = [
    {
        # Handled by the "process_pdf" branch of mcp_tool_handler.
        "name": "process_pdf",
        "description": "Process a PDF file and extract text with page mapping for accurate citations",
        "input_schema": {
            "type": "object",
            "properties": {
                "pdf_path": {
                    "type": "string",
                    "description": "Path to the PDF file to process"
                }
            },
            "required": ["pdf_path"]
        }
    },
    {
        # Handled by the "query_pdf" branch; top_k is optional.
        "name": "query_pdf",
        "description": "Query a processed PDF document and get results with precise page citations",
        "input_schema": {
            "type": "object",
            "properties": {
                "document_id": {
                    "type": "string",
                    "description": "The document ID returned from process_pdf"
                },
                "query": {
                    "type": "string",
                    "description": "The search query"
                },
                "top_k": {
                    "type": "integer",
                    "description": "Number of results to return (default: 3)",
                    "default": 3
                }
            },
            "required": ["document_id", "query"]
        }
    },
    {
        # Handled by the "list_documents" branch; takes no arguments.
        "name": "list_documents",
        "description": "List all processed documents",
        "input_schema": {
            "type": "object",
            "properties": {}
        }
    }
]

# Gradio App
# The example row points at a local PDF. The path can be overridden with the
# PDF_CITATION_EXAMPLE_PDF environment variable, and the example is only
# offered when that file actually exists, so the app can still be built on
# machines that do not have the author's file.
_EXAMPLE_PDF = os.environ.get("PDF_CITATION_EXAMPLE_PDF", "D:/Download/LoRA.pdf")
_EXAMPLES = (
    [[_EXAMPLE_PDF, "What are the main findings?"]]
    if os.path.isfile(_EXAMPLE_PDF)
    else None
)

demo = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Textbox(label="Query", placeholder="What would you like to find in the document?")
    ],
    outputs=gr.Textbox(label="Results with Citations", lines=20),
    title="🎯 PDF Citation Tool - MCP Server",
    description="""
    **Track 1: MCP Tool/Server Submission**

    This Gradio app serves as both:
    1. **Interactive Interface**: Upload PDFs and test queries
    2. **MCP Server**: Other LLMs can use this as a tool for precise PDF citations

    **Features:**
    - Accurate page number citations
    - Paragraph-level precision
    - Confidence scores
    - Multi-page span detection
    - MCP-compatible tool interface

    **For LLM Integration**: This app exposes MCP tools that any LLM can call for precise PDF citations.
    """,
    examples=_EXAMPLES
)

if __name__ == "__main__":
    import sys

    # "--mcp" as the first command-line argument selects manifest mode;
    # anything else starts the interactive UI.
    wants_mcp = sys.argv[1:2] == ["--mcp"]

    if not wants_mcp:
        # Interactive Gradio interface, launched with a share link.
        demo.launch(share=True)
    else:
        # Emit the tool descriptors and the handler's name as a JSON document.
        manifest = {
            "tools": MCP_TOOLS,
            "handler": "mcp_tool_handler"
        }
        print(json.dumps(manifest))

OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: 'What are the main findings?'