In [74]:
from pydantic import BaseModel, Field


class OABSchema(BaseModel):
    nome: str | None = Field(
        None,
        description="Nome do profissional, normalmente no canto superior esquerdo da imagem",
    )
    inscricao: str | None = Field(
        None, description="Número de inscrição do profissional"
    )
    seccional: str | None = Field(None, description="Seccional do profissional")
    subsecao: str | None = Field(
        None, description="Subseção à qual o profissional faz parte"
    )
    categoria: str | None = Field(
        None,
        description="Categoria, pode ser ADVOGADO, ADVOGADA, SUPLEMENTAR, ESTAGIARIO, ESTAGIARIA",
    )
    endereco_profissional: str | None = Field(
        None, description="Endereço do profissional"
    )
    telefone_profissional: str | None = Field(
        None, description="Telefone do profissional"
    )
    situacao: str | None = Field(
        None,
        description="Situação do profissional, normalmente no canto inferior direito.",
    )


In [2]:
import os

from dotenv import load_dotenv
from langchain.chat_models import init_chat_model

load_dotenv()

model = init_chat_model(
    "gemini-2.5-flash",
    model_provider="google_genai",
    api_key=os.getenv("GEMINI_API_KEY"),
)

In [3]:
# from pprint import pprint
# from typing import Literal, TypedDict
# from uuid import uuid4

# from langchain.agents import create_agent
# from langchain.agents.middleware import (
#     HumanInTheLoopMiddleware,
#     PIIMiddleware,
#     SummarizationMiddleware,
# )
# from langchain_core.messages import HumanMessage
# from langgraph.checkpoint.memory import InMemorySaver, MemorySaver
# from langgraph.graph import END, START, StateGraph
# from langgraph.types import Command, RetryPolicy, interrupt
# from pydantic import BaseModel, Field

In [107]:
import os
from PyPDF2 import PdfReader
import json

DATA_DIR = "../data"

with open(os.path.join(DATA_DIR, "dataset.json"), "r", encoding="utf-8") as f:
    data = json.load(f)  # data will be a list of dicts

num = 0

reader = PdfReader(os.path.join(DATA_DIR, f"oab_{num + 1}.pdf"))
# reader = PdfReader(os.path.join(DATA_DIR, f"oab_{num}.pdf"))
# reader = PdfReader(os.path.join(DATA_DIR, f"oab_{num}.pdf"))

image_text = reader.pages[0].extract_text()
obj_schema = data[num]["extraction_schema"]
# obj_schema = OABSchema.model_json_schema()["properties"]
print(obj_schema)
print(image_text)

{'nome': 'Nome do profissional, normalmente no canto superior esquerdo da imagem', 'inscricao': 'Número de inscrição do profissional', 'seccional': 'Seccional do profissional', 'subsecao': 'Subseção à qual o profissional faz parte', 'categoria': 'Categoria, pode ser ADVOGADO, ADVOGADA, SUPLEMENTAR, ESTAGIARIO, ESTAGIARIA', 'endereco_profissional': 'Endereço do profissional', 'telefone_profissional': 'Telefone do profissional', 'situacao': 'Situação do profissional, normalmente no canto inferior direito.'}
JOANA D'ARC
Inscrição Seccional Subseção
101943 PR CONSELHO SECCIONAL - PARANÁ
SUPLEMENTAR
Endereço Profissional
AVENIDA PAULISTA, Nº 2300 andar Pilotis, Bela Vista
SÃO PAULO - SP
01310300
Telefone Profissional
SITUAÇÃO REGULAR


In [87]:
prompt_template_pt = """
Você receberá um trecho de texto extraído de um documento.

Com base nesse texto, extraia **exclusivamente** as informações solicitadas e retorne o resultado em **formato JSON**, seguindo rigorosamente o esquema fornecido.

Texto de entrada:
{text}

Esquema JSON esperado:
{schema}

Instruções importantes:
- Se uma informação não estiver presente no texto, atribua o valor **null** ao campo correspondente.
- Não adicione informações que não estejam explícitas no texto.
- Retorne **apenas** o JSON, sem comentários, explicações ou texto adicional.
- Garanta que o JSON seja **válido**, **bem formatado** e **compatível com o esquema**.
"""

prompt_template_en = """
You will receive a text excerpt extracted from a document.

Based on this text, extract **only** the requested information and return the result in **JSON format**, strictly following the provided schema.

Input text:
{text}

Expected JSON schema:
{schema}

Important instructions:
- If any information is missing from the text, assign the value **null** to the corresponding field.
- Do not include any information that is not explicitly mentioned in the text.
- Return **only** the JSON, without comments, explanations, or additional text.
- Ensure the JSON is **valid**, **well-formatted**, and **fully compliant** with the schema.
"""

In [73]:
# from pydantic import BaseModel, create_model

# extraction_schema = {
#     "nome": "Nome do profissional, normalmente no canto superior esquerdo da imagem",
#     "inscricao": "Número de inscrição do profissional",
#     "seccional": "Seccional do profissional",
#     "subsecao": "Subseção à qual o profissional faz parte",
#     "categoria": "Categoria, pode ser ADVOGADO, ADVOGADA, SUPLEMENTAR, ESTAGIARIO, ESTAGIARIA",
#     "endereco_profissional": "Endereço do profissional",
#     "telefone_profissional": "Telefone do profissional",
#     "situacao": "Situação do profissional, normalmente no canto inferior direito.",
# }

# # Dynamically create a Pydantic model with all fields as optional strings
# fields = {key: (str | None, None) for key in data[0]["extraction_schema"].keys()}
# DynamicModel = create_model("DynamicModel", **fields)

In [None]:
from langchain.agents import create_agent
from langgraph.checkpoint.memory import MemorySaver
from uuid import uuid4

config = {"configurable": {"thread_id": str(uuid4())}}
memory = MemorySaver()
agent = create_agent(
    model=model, tools=[], response_format=OABSchema, checkpointer=memory
)

In [101]:
from langchain_core.messages import HumanMessage

response = agent.invoke(
    {
        "messages": [
            HumanMessage(
                content=prompt_template_en.format(text=image_text, schema=obj_schema)
            )
        ]
    },
    config=config,
)

In [102]:
from pprint import pprint

pprint(response["structured_response"])

OABSchema(nome='SON GOKU', inscricao='101943', seccional='PR', subsecao='CONSELHO SECCIONAL - PARANÁ', categoria='SUPLEMENTAR', endereco_profissional=None, telefone_profissional=None, situacao='REGULAR')


In [12]:
"""
OABSchema(
    nome="JOANA D'ARC",
    inscricao="101943",
    seccional="PR",
    subsecao="CONSELHO SECCIONAL - PARANÁ",
    categoria="SUPLEMENTAR",
    endereco_profissional="AVENIDA PAULISTA, Nº 2300 andar Pilotis, Bela Vista SÃO PAULO - SP 01310300",
    telefone_profissional=None,
    situacao="REGULAR",
)
OABSchema(
    nome="LUIS FILIPE ARAUJO AMARAL",
    inscricao="101943",
    seccional="PR",
    subsecao="CONSELHO SECCIONAL - PARANÁ",
    categoria="SUPLEMENTAR",
    endereco_profissional="AVENIDA PAULISTA, Nº 2300 andar Pilotis, Bela Vista SÃO PAULO - SP 01310300",
    telefone_profissional=None,
    situacao="REGULAR",
)
OABSchema(
    nome="SON GOKU",
    inscricao="101943",
    seccional="PR",
    subsecao="CONSELHO SECCIONAL - PARANÁ",
    categoria="SUPLEMENTAR",
    endereco_profissional="K",
    telefone_profissional=None,
    situacao="REGULAR",
)
"""

'\nOABSchema(\n    nome="JOANA D\'ARC",\n    inscricao="101943",\n    seccional="PR",\n    subsecao="CONSELHO SECCIONAL - PARANÁ",\n    categoria="SUPLEMENTAR",\n    endereco_profissional="AVENIDA PAULISTA, Nº 2300 andar Pilotis, Bela Vista SÃO PAULO - SP 01310300",\n    telefone_profissional=None,\n    situacao="REGULAR",\n)\nOABSchema(\n    nome="LUIS FILIPE ARAUJO AMARAL",\n    inscricao="101943",\n    seccional="PR",\n    subsecao="CONSELHO SECCIONAL - PARANÁ",\n    categoria="SUPLEMENTAR",\n    endereco_profissional="AVENIDA PAULISTA, Nº 2300 andar Pilotis, Bela Vista SÃO PAULO - SP 01310300",\n    telefone_profissional=None,\n    situacao="REGULAR",\n)\nOABSchema(\n    nome="SON GOKU",\n    inscricao="101943",\n    seccional="PR",\n    subsecao="CONSELHO SECCIONAL - PARANÁ",\n    categoria="SUPLEMENTAR",\n    endereco_profissional="K",\n    telefone_profissional=None,\n    situacao="REGULAR",\n)\n'

In [13]:
from pydantic import BaseModel, Field
from typing import Optional, Literal

# Define the exact types of rules that your system understands
RuleType = Literal["regex", "keyword", "position"]

# Define the exact strategies for rules of type "keyword"
KeywordStrategy = Literal["next_line", "multiline_until_stop", "conditional_null"]


class Rule(BaseModel):
    """Stores a single extraction rule generated by the Learning Loop."""

    # --- Main Discriminator Field ---
    type: RuleType = Field(
        ..., description="The main type of the rule (regex, keyword, or position)"
    )

    # --- 1. For type="regex" ---
    rule: Optional[str] = Field(
        None,
        description="The regex pattern to be executed. (Ex: 'Inscrição[^\d]*(\d{6})')",
    )

    # --- 2. For type="keyword" ---
    keyword: Optional[str] = Field(
        None, description="The 'anchor' keyword to search for in the text."
    )

    strategy: Optional[KeywordStrategy] = Field(
        None, description="The action to take after finding the keyword."
    )

    stop_keyword: Optional[str] = Field(
        None,
        description="Where to stop for 'multiline' or what to check for 'conditional'.",
    )

    # --- 3. For type="position" ---
    line_number: Optional[int] = Field(
        None,
        description="The line number to extract (e.g., 1 for the first line).",
    )

In [71]:
rule_generation_prompt_template_en = """
You are an expert automation engineer specializing in robust text extraction.
Your task is to generate a **single, robust, and independent extraction rule** for a specific data field.

The goal is to create an "atomic" rule that can find this value in future documents. The rule MUST be based on stable "anchor" keywords (like "Inscrição", "Endereço Profissional") or patterns directly related to **itself**, not based on the position of *other* fields.

**Crucial Constraint: What to AVOID**
* **DO NOT** create rules that depend on the relative position of *other* fields.
* **Bad Rule (Coupled):** "Find the text on the line after the 'inscricao' field."
* **Good Rule (Atomic):** "Find the text on the line after the keyword 'Subseção'."

---
**ANALYSIS PATHS:**

**PATH A: If `field_value` is NOT null (e.g., "JOANA D'ARC")**
1.  **Locate:** Find the `field_value` in the `full_text`.
2.  **Find Anchor:** Analyze the text *immediately* surrounding the value to find a stable, unique keyword (like "Nome", "Inscrição", etc.) or a fixed position (like "first line").
3.  **Generate Rule:** Create the rule based on this anchor. A `regex` rule is strongly preferred.

**PATH B: If `field_value` IS null**
1.  **Locate Anchor:** Find the "anchor" keyword for the field (e.g., "Telefone Profissional") in the `full_text`.
2.  **Find Stop-Anchor:** Analyze the text *immediately following* this anchor. Find the *next* field's anchor (e.g., "SITUAÇÃO REGULAR").
3.  **Generate Rule:** Create a `conditional_null` rule. This rule checks if the `stop_keyword` appears immediately after the `keyword`, implying the field is empty.

---
**INPUTS:**

**1. Full Text (`full_text`):**
{text}

**2. Field to Analyze (`field_name`):**
"{field_name}"

**3. Extracted Value (`field_value`):** (This could be `null` or `None`)
"{field_value}"

**4. Field Description (`field_description`):**
"{field_description}"

---
**OUTPUT INSTRUCTIONS:**

Return **only** a single, valid JSON object for the generated rule, strictly adhering to the following `Rule` schema.

**Rule Schema:**
{{
    "type": "The type of rule. Use 'regex' whenever possible.",
    "rule": "The Python-compatible regex pattern. It MUST include a capture group ( ). It should be anchored to a stable keyword (e.g., 'Inscrição[^\d]*(\d{{6}})') and NOT to text from other fields.",
    "keyword": "The 'anchor' keyword (use if 'regex' is not possible).",
    "strategy": "The strategy for the 'keyword' (e.g., 'next_line', 'multiline_until_stop', 'conditional_null').",
    "stop_keyword": "The stopping keyword for the strategy.",
    "line_number": "The line number (use 'position' only as a last resort)."
}}

**Example for a `regex` rule (PATH A):**
{{
    "type": "regex",
    "rule": "Inscrição[^\d]*(\d{{6}})",
    "keyword": null,
    "strategy": null,
    "stop_keyword": null,
    "line_number": null
}}

**Example for a `keyword` rule (PATH A):**
{{
    "type": "keyword",
    "rule": null,
    "keyword": "Subseção",
    "strategy": "next_line",
    "stop_keyword": null,
    "line_number": null
}}

**Example for a `conditional_null` rule (PATH B):**
{{
    "type": "keyword",
    "rule": null,
    "keyword": "Telefone Profissional",
    "strategy": "conditional_null",
    "stop_keyword": "SITUAÇÃO",
    "line_number": null
}}

**Your Turn:**
Generate the rule for the field `"{field_name}"`.
"""

In [95]:
config = {"configurable": {"thread_id": str(uuid4())}}
memory = MemorySaver()
agent_rule = create_agent(
    model=model, tools=[], response_format=Rule, checkpointer=memory
)

In [104]:
pdf_text = image_text
llm_response = response["structured_response"].model_dump()
extraction_schema = obj_schema

generated_rules = {}

for field, value in llm_response.items():
    print("Generating rule for field:", field, "with value:", value)
    if value is not None:
        # Preparar os inputs para o NOVO prompt
        field_name = field
        field_value = value
        field_description = extraction_schema[field]

        # Criar o prompt de geração de regra
        rule_prompt = rule_generation_prompt_template_en.format(
            text=pdf_text,
            field_name=field_name,
            field_value=field_value,
            field_description=field_description,
        )

        # Chamar a API do gpt-5-mini com este 'rule_prompt'
        # Esta é a sua 2ª chamada de API (o "Meta-Prompt")
        rule_json_string = agent_rule.invoke({"messages": rule_prompt}, config=config)

        # # Validar o JSON da regra usando seu modelo Pydantic
        rule_object = Rule.model_validate(rule_json_string["structured_response"])

        generated_rules[field] = rule_object
        print("Generated rule for field", field, ":", generated_rules[field])
    # break

# # Agora você tem um "Heuristic Store" de altíssima qualidade
# save_rules_to_store("carteira_oab", generated_rules)

Generating rule for field: nome with value: SON GOKU
Generated rule for field nome : type='regex' rule='^([^\n]+)\nInscrição' keyword=None strategy=None stop_keyword=None line_number=None
Generating rule for field: inscricao with value: 101943
Generated rule for field inscricao : type='regex' rule='Inscrição.*?(\\d+)' keyword=None strategy=None stop_keyword=None line_number=None
Generating rule for field: seccional with value: PR
Generated rule for field seccional : type='regex' rule='Inscrição Seccional\\s*\\n\\s*\\d+\\s*(\\w+)Subseção' keyword=None strategy=None stop_keyword=None line_number=None
Generating rule for field: subsecao with value: CONSELHO SECCIONAL - PARANÁ
Generated rule for field subsecao : type='regex' rule='Subseção\\s*\\n([^\\n]+)' keyword=None strategy=None stop_keyword=None line_number=None
Generating rule for field: categoria with value: SUPLEMENTAR
Generated rule for field categoria : type='regex' rule='([A-Z]+)\nEndereco Profissional' keyword=None strategy=Non

In [105]:
generated_rules

{'nome': Rule(type='regex', rule='^([^\n]+)\nInscrição', keyword=None, strategy=None, stop_keyword=None, line_number=None),
 'inscricao': Rule(type='regex', rule='Inscrição.*?(\\d+)', keyword=None, strategy=None, stop_keyword=None, line_number=None),
 'seccional': Rule(type='regex', rule='Inscrição Seccional\\s*\\n\\s*\\d+\\s*(\\w+)Subseção', keyword=None, strategy=None, stop_keyword=None, line_number=None),
 'subsecao': Rule(type='regex', rule='Subseção\\s*\\n([^\\n]+)', keyword=None, strategy=None, stop_keyword=None, line_number=None),
 'categoria': Rule(type='regex', rule='([A-Z]+)\nEndereco Profissional', keyword=None, strategy=None, stop_keyword=None, line_number=None),
 'situacao': Rule(type='regex', rule='SITUAÇÃO (\\w+)', keyword=None, strategy=None, stop_keyword=None, line_number=None)}

In [108]:
import re
from typing import Dict, Any


def apply_rules_to_text(rules: Dict[str, Any], text: str) -> Dict[str, str | None]:
    """Applies a set of Rule objects to a given text and extracts their values.

    Args:
        rules: dict where each key is a field name and each value is a Rule instance.
        text: full text to extract information from.

    Returns:
        dict with extracted values (or None if not found).
    """
    results = {}

    for field_name, rule in rules.items():
        value = None

        if rule.type == "regex" and rule.rule:
            match = re.search(rule.rule, text, re.MULTILINE | re.DOTALL)
            if match:
                value = match.group(1).strip()

        elif rule.type == "keyword" and rule.keyword:
            lines = text.splitlines()
            for i, line in enumerate(lines):
                if rule.keyword.lower() in line.lower():
                    if rule.strategy == "next_line" and i + 1 < len(lines):
                        value = lines[i + 1].strip()
                    break

        results[field_name] = value

    return results


extracted_values = apply_rules_to_text(generated_rules, image_text)
pprint(extracted_values)

{'categoria': None,
 'inscricao': '101943',
 'nome': "JOANA D'ARC",
 'seccional': None,
 'situacao': 'REGULAR',
 'subsecao': '101943 PR CONSELHO SECCIONAL - PARANÁ'}


In [None]:
{
    "categoria": None,
    "inscricao": "101943",
    "nome": "JOANA D'ARC",
    "seccional": None,
    "situacao": "REGULAR",
    "subsecao": "101943 PR CONSELHO SECCIONAL - PARANÁ",
}

In [None]:
{
    "categoria": "SUPLEMENTAR",
    "inscricao": "101943",
    "nome": "SON GOKU",
    "seccional": "PR",
    "situacao": "REGULAR",
    "subsecao": "CONSELHO SECCIONAL - PARANÁ",
}

In [None]:
{
    "categoria": None,
    "endereco_profissional": None,
    "inscricao": "101943",
    "nome": "SON GOKU",
    "seccional": "PRSubseção",
    "situacao": "REGULAR",
    "subsecao": None,
}