In [4]:
from google import genai
from dotenv import load_dotenv
import os
import json
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter # División de texto

In [5]:
def get_gemini_client():
    """Create and return a Gemini client.

    ``load_dotenv()`` is called first so that variables defined in a local
    ``.env`` file are loaded into the environment before the client is built.
    NOTE(review): presumably ``genai.Client()`` reads its credentials from
    those environment variables -- confirm against the SDK configuration.

    Returns:
        The object returned by ``genai.Client()``.
    """
    load_dotenv()
    return genai.Client()

In [6]:
def get_json_context(path_schema):
    """Load a JSON schema file and return it as a labelled prompt fragment.

    Args:
        path_schema: Path to a UTF-8 encoded JSON file.

    Returns:
        A string made of the header ``**JSON SCHEMA TO FOLLOW:**`` followed by
        a newline and the schema re-serialised with ``json.dumps`` defaults
        (single line, non-ASCII characters escaped).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    header = "**JSON SCHEMA TO FOLLOW:**\n"
    with open(path_schema, 'r', encoding='utf-8') as schema_file:
        schema = json.load(schema_file)
    # Round-tripping through json normalises whitespace in the schema text.
    return header + json.dumps(schema)

In [7]:
def chunk_text(texto, chunk_size=400, chunk_overlap=50):
    """Split a text into overlapping chunks.

    The text is split with ``RecursiveCharacterTextSplitter`` using the
    separators paragraph break, line break, period, space and finally the
    empty string, tried in that order.

    Args:
        texto: The text to split.
        chunk_size: Value forwarded to the splitter's ``chunk_size`` argument.
            Defaults to 400, the value previously hard-coded here.
        chunk_overlap: Value forwarded to the splitter's ``chunk_overlap``
            argument. Defaults to 50, the value previously hard-coded here.

    Returns:
        The list of chunks produced by ``splitter.split_text(texto)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    return splitter.split_text(texto)


In [None]:
#main
# Run settings, gathered in one place so they can be tuned without touching
# the pipeline logic below.
MODEL_NAME = "gemini-2.0-flash-exp"
MAX_CHUNKS = 8  # only the first MAX_CHUNKS chunks of the document are processed
SCHEMA_PATH = '../../JSONS/baseSchema.json'
FILL_PROMPT_PATH = 'jsonFillPrompt.txt'
REFINE_PROMPT_PATH = 'jsonRefinePrompt.txt'
LEGAL_DOC_PATH = '../../Documentos/CODIGO PENAL DE LA NACION ARGENTINA.pdf'
OUTPUT_PATH = './processedDocs/response.json'


def parse_llm_json(raw_text):
    """Turn a model reply into a parsed JSON value.

    The reply is first parsed as-is. If that fails, the content found between
    the first Markdown code fence (``` or ```json) and the last one is parsed
    instead, because the model tends to wrap its JSON in such a fence.

    Args:
        raw_text: Text returned by the model; may be ``None`` or empty.

    Returns:
        The decoded JSON value (typically a dict).

    Raises:
        ValueError: If the reply is empty or contains no valid JSON
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    cleaned = (raw_text or "").strip()
    if not cleaned:
        raise ValueError("The model returned an empty response")

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass  # fall through and look for a fenced code block

    fence_start = cleaned.find("```")
    fence_end = cleaned.rfind("```")
    # The payload starts on the line after the opening fence (skips "```json").
    body_start = cleaned.find("\n", fence_start) if fence_start != -1 else -1
    if body_start == -1 or fence_end <= body_start:
        raise ValueError("The model response does not contain valid JSON")
    return json.loads(cleaned[body_start + 1:fence_end])


if __name__ == "__main__":
    client = get_gemini_client()

    # Read the schema
    with open(SCHEMA_PATH, 'r', encoding='utf-8') as f:
        json_schema = json.load(f)

    # Read the (initial) prompt for the LLM
    with open(FILL_PROMPT_PATH, 'r', encoding='utf-8') as f:
        initial_instructions = f.read()

    # Read the refinement prompt
    with open(REFINE_PROMPT_PATH, 'r', encoding='utf-8') as f:
        refine_instructions = f.read()

    # Read the legal document; each page's text is followed by a newline.
    # `or ""` guards against a page for which no text could be extracted.
    reader = PdfReader(LEGAL_DOC_PATH)
    legal_doc = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    # Split into chunks
    legal_doc_chunks = chunk_text(legal_doc)
    print(f"Total de chunks: {len(legal_doc_chunks)}")
    if not legal_doc_chunks:
        # Without this guard the first-chunk prompt below fails with IndexError.
        raise ValueError(f"No text could be extracted from {LEGAL_DOC_PATH!r}")

    # Limit the run to the first MAX_CHUNKS chunks
    legal_doc_chunks = legal_doc_chunks[:MAX_CHUNKS]
    print(f"Procesando solo los primeros {len(legal_doc_chunks)} chunks")

    # STEP 1: build the initial JSON from the first chunk + the full schema
    first_chunk_prompt = (
        initial_instructions +
        "\n\n**JSON SCHEMA:**\n" +
        json.dumps(json_schema, indent=2) +
        "\n\n**DOCUMENT:**\n" +
        legal_doc_chunks[0]
    )

    print("Procesando chunk 1...")
    response = client.models.generate_content(
        model=MODEL_NAME,
        contents=first_chunk_prompt
    )

    # Parse the initial reply into a real JSON value. Keeping the raw string
    # would make json.dumps/json.dump below emit an escaped string literal
    # instead of the JSON document itself.
    print("JSON inicial creado: ", response.text)
    refined_json = parse_llm_json(response.text)

    # STEP 2: refine with the remaining chunks (WITHOUT resending the schema)
    for i, chunk in enumerate(legal_doc_chunks[1:], start=2):
        print(f"Procesando chunk {i}/{len(legal_doc_chunks)}...")

        # Simplified prompt: instructions + current JSON + new chunk
        refine_prompt = (
            refine_instructions +
            "\n\n**EXISTING JSON:**\n" +
            json.dumps(refined_json, ensure_ascii=False, indent=2) +
            "\n\n**NEW CHUNK:**\n" +
            chunk
        )

        response = client.models.generate_content(
            model=MODEL_NAME,
            contents=refine_prompt
        )

        # Update the refined JSON; on an unparseable reply keep the last valid
        # version rather than corrupting the accumulated result.
        try:
            refined_json = parse_llm_json(response.text)
        except ValueError as exc:
            print(f"Respuesta inválida en el chunk {i}; se conserva el JSON anterior ({exc})")

    # Save the final result, creating the output directory if it is missing
    print("Guardando resultado...")
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        json.dump(refined_json, f, ensure_ascii=False, indent=2)

    print("¡Proceso completado!: \n" + json.dumps(refined_json, ensure_ascii=False, indent=2))

Total de chunks: 719
Procesando solo los primeros 8 chunks
Procesando chunk 1...
JSON inicial creado:  ```json
{
  "document_id": "CP-ARG-1984",
  "metadata": {
    "title": "Código Penal de la Nación Argentina",
    "source": "Boletín Oficial",
    "url": "https://www.boletinoficial.gob.ar/",
    "document_type": "ley",
    "status": "vigente",
    "validity_dates": {
      "start_date": "1984-01-01",
      "end_date": null,
      "promulgation_date": "1984-01-01"
    },
    "jurisdiction": "nacional",
    "legal_area": [
      "penal"
    ]
  },
  "content": {
    "full_text": "CODIGO PENAL DE LA NACION ARGENTINA\nLEY 11.179 (T.O. 1984 actualizado)\nIndice Temático\nLIBRO PRIMERO\nDISPOSICIONES GENERALES\nTITULO I APLICACION DE LA LEY PENAL Arts.1 a 4\nTITULO II DE LAS PENAS Arts. 5 a 25\nTITULO III CONDENACION CONDICIONAL Arts. 26 a 29\nTITULO IV REPARACION DE PERJUICIOS Arts. 30 a 33\nTITULO V IMPUTABILIDAD Arts. 34 a 41\nTITULO VI TENTATIVA Arts. 42 a 44\nTITULO\nVII",
    "struct