In [1]:
import pytesseract

import os

# Location of the Tesseract executable used by pytesseract.
# The default is the Windows install path this notebook was written against;
# set the TESSERACT_CMD environment variable to point at a different binary
# (other OS, custom install) without editing the notebook.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r"C:\Program Files\Tesseract-OCR\tesseract.exe",
)

In [2]:
from typing import List, TypedDict, Optional
from langgraph.graph import StateGraph, END

from pypdf import PdfReader
from PIL import Image
import pytesseract
import os

In [3]:
class GraphState(TypedDict):
    """State dictionary shared by every node of the extraction graph."""

    # Inputs
    query: str                  # the user's question / instruction text
    files: Optional[List[str]]  # paths to PDFs or images; None for text-only queries

    # Outputs (for Agent-2)
    documents: List[dict]       # one entry per processed file: {"doc_id": str, "text": str}

In [4]:
def is_pdf(path: str) -> bool:
    """Return True when ``path`` ends in ``.pdf``, ignoring letter case."""
    lowered = path.lower()
    # Comparing the last four characters is the same check as endswith(".pdf").
    return lowered[-4:] == ".pdf"


# Lower-case suffixes treated as OCR-able images. ".tif" is the short spelling
# of ".tiff"; ".bmp" and ".gif" are further raster formats Pillow can open.
_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".gif")


def is_image(path: str) -> bool:
    """
    Return True when ``path`` has a recognised image suffix.

    The comparison is case-insensitive and looks only at the end of the
    string; the file itself is never opened.
    """
    return path.lower().endswith(_IMAGE_EXTENSIONS)

In [5]:
def _read_pdf_text(file_path: str) -> str:
    """Return the text of every page of the PDF at ``file_path``, newline-separated."""
    reader = PdfReader(file_path)
    page_texts = (page.extract_text() for page in reader.pages)
    # Pages for which extract_text() returns nothing are skipped.
    return "\n".join(page_text for page_text in page_texts if page_text)


def _ocr_image_text(file_path: str) -> str:
    """Return the text Tesseract recognises in the image at ``file_path``."""
    # The context manager closes the underlying file handle once OCR is done.
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)


def extract_from_files(state: GraphState) -> GraphState:
    """
    Sub-agent responsible for:
    - PDF text extraction (PyPDF)
    - OCR on scanned images (Tesseract)

    Args:
        state: Graph state. ``state["files"]`` may be a list of paths, an
            empty list, ``None``, or absent; the last three all mean
            "nothing to extract".

    Returns:
        A copy of ``state`` with ``"documents"`` set to a list of
        ``{"doc_id": <file basename>, "text": <stripped text>}`` dicts, in
        the same order as the input files.

    Raises:
        ValueError: If a path is neither a PDF nor a supported image type.
    """
    documents = []

    # `or []` also covers an explicit `"files": None`, which a plain
    # `.get("files", [])` would hand to the loop as None.
    for file_path in state.get("files") or []:
        if is_pdf(file_path):
            text = _read_pdf_text(file_path)
        elif is_image(file_path):
            text = _ocr_image_text(file_path)
        else:
            raise ValueError(f"Unsupported file type: {file_path}")

        documents.append({
            "doc_id": os.path.basename(file_path),
            "text": text.strip(),
        })

    return {
        **state,
        "documents": documents
    }

In [6]:
def route_input(state: GraphState) -> str:
    """
    Pick the branch that follows the router node.

    Returns "extract_documents" when the state carries a non-empty "files"
    value, and "pass_through" otherwise (key missing, None, or empty).
    """
    has_files = bool(state.get("files"))
    return "extract_documents" if has_files else "pass_through"

In [7]:
def pass_through(state: GraphState) -> GraphState:
    """
    Branch for text-only queries.

    Returns a shallow copy of the incoming state whose "documents" entry is
    an empty list; the input mapping itself is left untouched.
    """
    next_state = dict(state)
    next_state["documents"] = []
    return next_state

In [12]:
# Build the workflow over the shared GraphState schema.
graph = StateGraph(GraphState)

# "router" is an identity node: it returns the state unchanged and exists only
# as the source node for the conditional edges declared below.
graph.add_node("router", lambda state: state)
graph.add_node("extract_documents", extract_from_files)
graph.add_node("pass_through", pass_through)

graph.set_entry_point("router")

# route_input returns one of the keys below; the mapping resolves that key to
# the name of the node to run next.
graph.add_conditional_edges(
    "router",
    route_input,
    {
        "extract_documents": "extract_documents",
        "pass_through": "pass_through",
    }
)

# Both branches lead straight to END.
graph.add_edge("extract_documents", END)
graph.add_edge("pass_through", END)

# Compile the graph into the object the following cells call .invoke() on.
app = graph.compile()


In [13]:
# Text-only request: "files" is None, so no documents are expected back.
result = app.invoke(dict(query="Explain CRISPR gene editing", files=None))

print(result)

{'query': 'Explain CRISPR gene editing', 'files': None, 'documents': []}


In [18]:
# Request with attachments: one PDF and one image.
result = app.invoke(
    dict(
        query="Summarize the document",
        files=["sample.pdf", "scan.jpg"],  # replace with real paths
    )
)

# Print each document's id and at most the first 1200 characters of its text.
for doc in result["documents"]:
    print("DOC ID:", doc["doc_id"])
    print("TEXT PREVIEW:", doc["text"][:1200])
    print("-" * 40)

DOC ID: sample.pdf
TEXT PREVIEW: FORM – IF1 - (Integrated Form) 
 
FIRST INFORMATION REPORT 
(Under Section 154 Cr.P.C) 
 
1. Dist. ……………..   P.S……………….Year ……………..  F.I.R. No. …………….  Date …………   
 
2. (i) *Act …………………………………. *Sections  ……………………………………………...  
 (ii) *Act …………………………………. *Sections  ……………………………………………... 
 (iii)  *Act …………………………………. *Sections  ……………………………………………... 
 (iv) * Other Acts & Sections  ……………………………………………………………………….. 
3.  
(a) 
 
* Occurrence of Offence: * Day  ……………….*Date …………………. *Time …………….  
  
(b) 
 
Information received at P.S.  Date ………………………… Time …………………………….. 
  
(c)  
 
General Diary Reference: Entry No(s) …………………… Time ……………………………… 
 
4. Type of information :     *Written / Oral  
 
5. Place of occurrence:  (a) Direction and Distance from P.S. ………………… Beat No. …………….. 
 
 (b) * Address ………………………………………………………………………………………. 
………………………………………………………………………………………………….. 
 (c) In case outside limit of this Police Station, then the name of P.S. …………………………….. 
Dis

In [None]:
# ### Output Passed to Agent-2

# ```python
# {
#   "query": str,
#   "documents": [
#       {
#           "doc_id": "filename.pdf",
#           "text": "extracted raw text..."
#       }
#   ]
# }