In [53]:
import google.generativeai as genai
from dotenv import load_dotenv
import os, json
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [54]:
def get_gemini_model(model_name="gemini-2.0-flash-exp"):
    """Configure the Gemini SDK from the environment and return a model handle.

    Loads variables via ``load_dotenv()``, reads ``GOOGLE_API_KEY`` and passes it
    to ``genai.configure``.

    Args:
        model_name: Identifier handed to ``genai.GenerativeModel``.

    Returns:
        The ``genai.GenerativeModel`` instance for ``model_name``.

    Raises:
        RuntimeError: If ``GOOGLE_API_KEY`` is unset or empty.
    """
    load_dotenv()
    google_key = os.getenv("GOOGLE_API_KEY")
    if google_key:
        genai.configure(api_key=google_key)
        return genai.GenerativeModel(model_name)
    raise RuntimeError("Falta GOOGLE_API_KEY en .env")

In [55]:
def get_json_context(path_schema):
    """Read a JSON file and return its content as a prompt-ready string.

    The file is parsed and re-serialised with ``json.dumps`` (default settings),
    then prefixed with a fixed schema header line.

    Args:
        path_schema: Path of the JSON file to read (opened as UTF-8).

    Returns:
        The header followed by the serialised JSON text.
    """
    with open(path_schema, 'r', encoding='utf-8') as schema_file:
        schema_payload = json.load(schema_file)
    header = "**JSON SCHEMA TO FOLLOW:**\n"
    return header + json.dumps(schema_payload)

In [56]:
def chunk_text(texto, chunk_size=400, chunk_overlap=50):
    """Split text into overlapping chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        texto: Text to split. ``None`` or an empty string yields an empty list.
        chunk_size: Value passed to the splitter as ``chunk_size`` (default 400).
        chunk_overlap: Value passed to the splitter as ``chunk_overlap`` (default 50).

    Returns:
        The list of chunks produced by ``split_text``.
    """
    # Nothing to split: skip building a splitter and return no chunks.
    if not texto:
        return []
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Separators are tried in this order: paragraph, line, sentence, word, character.
        separators=["\n\n", "\n", ".", " ", ""],
    )
    return splitter.split_text(texto)


In [77]:
# ====== CONFIG ======
# True simulates responses; False sends real requests to the LLM.
DRY_RUN = False
# Gemini model identifier; can be swapped for a cheaper one.
MODEL = "gemini-2.0-flash-exp"
# Upper bound on processed chunks; keep it small while iterating.
MAX_CHUNKS = 4

# Generation settings handed to generate_content on every real call.
GENERATION_CONFIG = dict(
    max_output_tokens=2000,  # do not request more output than needed
    temperature=0.2,
)

# ====== UTILITY: rough token estimate ======
def approx_tokens(text: str) -> int:
    """Estimate the token count of ``text`` as one token per four characters.

    Deliberately crude (the real ratio varies by language); the result is
    never lower than 1.
    """
    quarter_length = len(text) // 4
    return quarter_length if quarter_length > 1 else 1

# ====== RESPUESTA SIMULADA ======
import json, re, time
def clean_llm_json(text: str) -> str:
    """Remove Markdown code fences from an LLM reply and return the inner text.

    A ```json ... ``` fence (case-insensitive tag) is unwrapped first, then any
    remaining generic ``` ... ``` fence. Text without fences is only stripped.
    """
    fence_rules = (
        (r"```json(.*?)```", re.DOTALL | re.IGNORECASE),
        (r"```(.*?)```", re.DOTALL),
    )
    for pattern, rule_flags in fence_rules:
        fenced = re.search(pattern, text, flags=rule_flags)
        if fenced is not None:
            text = fenced.group(1).strip()
    return text.strip()

def fake_initial_json(schema_dict, first_chunk):
    """Build the simulated first-pass result returned when no LLM is called.

    Args:
        schema_dict: Schema whose first six keys are echoed under ``schema_keys``.
        first_chunk: Text whose first 280 characters are stored as a preview.

    Returns:
        A dict flagged with ``_simulated`` that carries the key sample and preview.
    """
    simulated = {"_simulated": True, "status": "draft"}
    simulated["schema_keys"] = [key for key in schema_dict][:6]
    simulated["notes"] = "Simulación sin LLM. Ajustá prompts y pipeline hasta estar seguro."
    simulated["chunk_preview"] = first_chunk[:280]
    return simulated

def fake_refine(existing, new_chunk, i):
    """Simulate one incremental refinement step without calling the LLM.

    Args:
        existing: Current result. A string is parsed as JSON after fence
            removal; anything that is not (or does not parse to) a dict is
            wrapped as ``{"_simulated": True, "raw": ...}``.
        new_chunk: Chunk text; its first 200 characters are stored as a preview.
        i: Step number used to build the ``_refined_step_{i}`` key.

    Returns:
        A new dict holding the previous content plus the step entry. The
        caller's object is never mutated.
    """
    if isinstance(existing, str):
        try:
            existing = json.loads(clean_llm_json(existing))
        except Exception:
            # Best effort: keep the unparsable text instead of failing the simulation.
            existing = {"_simulated": True, "raw": existing}
    if isinstance(existing, dict):
        # Shallow copy is enough: only a new top-level key is added below.
        refined = dict(existing)
    else:
        # Lists/scalars cannot take a string key; wrap them like unparsable text.
        refined = {"_simulated": True, "raw": existing}
    refined[f"_refined_step_{i}"] = {"chunk_preview": new_chunk[:200]}
    return refined

# ====== WRAPPER DE ENVÍO ======
# Centraliza llamadas: si DRY_RUN, simula; si no, llama al modelo
def send_to_llm_initial(model, prompt, schema_dict, first_chunk):
    """Send the first prompt, or simulate the reply when ``DRY_RUN`` is set.

    Always logs the approximate prompt size first.

    Args:
        model: Object exposing ``generate_content``; unused in dry-run mode.
        prompt: Full prompt text sent to the model.
        schema_dict: Schema forwarded to the simulated response builder.
        first_chunk: First document chunk forwarded to the simulated builder.

    Returns:
        In dry-run mode, the dict from ``fake_initial_json``; otherwise the
        model's reply text with code fences removed.
    """
    estimated_in = approx_tokens(prompt)
    print(f"[send_to_llm_initial] in≈{estimated_in} tokens")
    if not DRY_RUN:
        reply = model.generate_content(prompt, generation_config=GENERATION_CONFIG)
        return clean_llm_json(reply.text)
    # Short pause before returning the simulated reply.
    time.sleep(0.3)
    return fake_initial_json(schema_dict, first_chunk)

def send_to_llm_refine(model, prompt, existing_json, new_chunk, step_i):
    """Send a refinement prompt, or simulate the reply when ``DRY_RUN`` is set.

    Always logs the approximate prompt size first.

    Args:
        model: Object exposing ``generate_content``; unused in dry-run mode.
        prompt: Full refinement prompt text.
        existing_json: Current result forwarded to the simulated refiner.
        new_chunk: Chunk being integrated, forwarded to the simulated refiner.
        step_i: Step number forwarded to the simulated refiner.

    Returns:
        In dry-run mode, the value from ``fake_refine``; otherwise the model's
        reply text with code fences removed.
    """
    estimated_in = approx_tokens(prompt)
    print(f"[send_to_llm_refine] in≈{estimated_in} tokens")
    if not DRY_RUN:
        reply = model.generate_content(prompt, generation_config=GENERATION_CONFIG)
        return clean_llm_json(reply.text)
    # Short pause before returning the simulated reply.
    time.sleep(0.2)
    return fake_refine(existing_json, new_chunk, step_i)

In [78]:
# ====== MAIN ROBUSTO: parseo estricto + merge incremental ======
import os, json, re
from pathlib import Path
from copy import deepcopy
from pypdf import PdfReader

# -------- Helpers (robustos) --------
def clean_llm_json(text: str) -> str:
    """Strip ```json ... ``` or plain ``` ... ``` fences and return the inner text.

    NOTE: this re-declares the helper of the same name from the earlier cell
    with the same logic; whichever cell ran last provides the active definition.
    """
    # Tagged fence first (case-insensitive), then any generic fence that remains.
    tagged = re.search(r"```json(.*?)```", text, flags=re.DOTALL | re.IGNORECASE)
    if tagged is not None:
        text = tagged.group(1).strip()
    generic = re.search(r"```(.*?)```", text, flags=re.DOTALL)
    if generic is not None:
        text = generic.group(1).strip()
    return text.strip()

def extract_json_block(text: str) -> str:
    """Return the most plausible JSON document embedded in an LLM reply.

    After removing code fences, every ``{`` or ``[`` is tried as the start of a
    JSON value and the longest span that actually decodes is returned. This
    keeps a top-level array of objects intact, whereas a greedy ``{...}`` match
    would cut it into invalid JSON. When nothing decodes, the function falls
    back to the greedy brace/bracket match and finally to the stripped text.

    Args:
        text: Raw reply text, possibly fenced and surrounded by prose.

    Returns:
        The extracted JSON text, or the stripped input if no candidate is found.
    """
    text = clean_llm_json(text)

    decoder = json.JSONDecoder()
    opener = re.compile(r"[\[{]")
    best = ""
    pos = 0
    while True:
        found = opener.search(text, pos)
        if found is None:
            break
        start = found.start()
        try:
            _, length = decoder.raw_decode(text[start:])
        except (ValueError, RecursionError):
            # Not a valid JSON value from here; try the next opener.
            pos = start + 1
            continue
        if length > len(best):
            best = text[start:start + length]
        # Skip past the decoded span so nested openers are not re-tried.
        pos = start + length
    if best:
        return best

    # Fallback: greedy span, left for the caller to parse (and likely reject).
    for pat in (r"\{.*\}", r"\[.*\]"):
        m = re.search(pat, text, flags=re.DOTALL)
        if m:
            return m.group(0)
    return text.strip()

def schema_to_blank_instance(schema):
    """Build an empty value shaped like ``schema`` (so raw text is never propagated).

    Mapping by ``type``: ``object`` becomes a dict built recursively from
    ``properties``, ``array`` becomes ``[]``, ``string`` becomes ``""``, and
    ``number``/``integer``/``boolean``/``null`` become ``None``. A schema with
    no recognised ``type`` but a ``properties`` key is treated as an object.
    Anything else, including non-dict input, gives ``None``.
    """
    if not isinstance(schema, dict):
        return None

    declared = schema.get("type")
    if declared == "array":
        return []
    if declared == "string":
        return ""

    if declared == "object":
        children = schema.get("properties", {})
    elif declared in ("number", "integer", "boolean", "null"):
        return None
    elif "properties" in schema:
        # No usable 'type', but the schema still looks like an object.
        children = schema["properties"]
    else:
        return None

    return {name: schema_to_blank_instance(sub) for name, sub in children.items()}

def force_json_or_blank(text_or_obj, schema):
    """Coerce an LLM result into a dict or list, always.

    Args:
        text_or_obj: Either an already-parsed dict/list (returned unchanged) or
            a value whose string form is searched for a JSON document.
        schema: Schema used to build an empty skeleton when parsing fails.

    Returns:
        The parsed dict/list when the text contains one. Otherwise the blank
        skeleton for ``schema``, or ``{}`` if that skeleton is not a dict or
        list either. Scalars such as ``42`` or ``null`` count as failures, so
        the dict/list guarantee holds.
    """
    if isinstance(text_or_obj, (dict, list)):
        return text_or_obj
    s = extract_json_block(str(text_or_obj))
    try:
        parsed = json.loads(s)
    except Exception:
        # Best effort: any parse failure falls through to the blank skeleton.
        parsed = None
    if isinstance(parsed, (dict, list)):
        return parsed
    blank = schema_to_blank_instance(schema)
    return blank if isinstance(blank, (dict, list)) else {}

def _is_blank_value(value):
    """True for None and for empty strings, lists and dicts (0 and False are real values)."""
    return value is None or (isinstance(value, (str, list, dict)) and len(value) == 0)


def _json_fingerprint(item):
    """Canonical JSON text of ``item``, used to compare list elements for duplicates."""
    return json.dumps(item, sort_keys=True, ensure_ascii=False)


def deep_merge(dst, src):
    """Deep-merge ``src`` into a copy of ``dst`` without discarding existing data.

    Rules:
      * dict + dict: merged key by key, recursively.
      * list + list: ``src`` items are appended unless an equal item (by
        canonical JSON) is already present, including earlier ``src`` items.
      * A blank ``src`` value (None, "", [], {}) never replaces a populated
        ``dst`` value. This stops an empty schema skeleton from wiping fields
        collected in earlier steps.
      * Any other case: the ``src`` value replaces the ``dst`` value.

    Args:
        dst: Accumulated value (not mutated).
        src: New value to integrate (not mutated).

    Returns:
        A new merged structure, independent of both inputs.
    """
    if not isinstance(dst, dict) or not isinstance(src, dict):
        if _is_blank_value(src) and not _is_blank_value(dst):
            return deepcopy(dst)
        return deepcopy(src)

    out = deepcopy(dst)
    for k, v in src.items():
        current = out.get(k)
        if isinstance(current, dict) and isinstance(v, dict):
            out[k] = deep_merge(current, v)
        elif isinstance(current, list) and isinstance(v, list):
            seen = {_json_fingerprint(x) for x in current}
            merged = list(current)
            for x in v:
                fingerprint = _json_fingerprint(x)
                if fingerprint not in seen:
                    seen.add(fingerprint)
                    merged.append(deepcopy(x))
            out[k] = merged
        elif k in out and _is_blank_value(v) and not _is_blank_value(current):
            # Keep what was already extracted; a blank carries no information.
            continue
        else:
            out[k] = deepcopy(v)
    return out

# -------- Fallback defaults, applied only when no earlier cell defined these names --------
globals().setdefault("DRY_RUN", True)
globals().setdefault("MAX_CHUNKS", 2)
globals().setdefault("GENERATION_CONFIG", {"max_output_tokens": 2000, "temperature": 0.2})

# =========================
# MAIN (con guardas, parse estricto y merge)
# =========================
if __name__ == "__main__":
    # 0) Announce the run mode
    print("⚠️ DRY_RUN=True (simulación). No se llama al modelo." if DRY_RUN else "✅ DRY_RUN=False (REAL). Se llama al modelo.")

    # 1) Model: honour the MODEL constant when a config cell defined it, and skip
    #    client creation in DRY_RUN, where the simulated senders never use the model
    #    (so no API key is needed to rehearse the pipeline).
    model_name = globals().get("MODEL", "gemini-2.0-flash-exp")
    model = None if DRY_RUN else get_gemini_model(model_name=model_name)

    # 2) Paths + existence checks
    PDF_PATH    = Path("../../Documentos/CODIGO PENAL DE LA NACION ARGENTINA.pdf")
    SCHEMA_PATH = Path("../../JSONS/baseSchema.json")
    assert PDF_PATH.exists(),    f"No existe el PDF: {PDF_PATH.resolve()}"
    assert SCHEMA_PATH.exists(), f"No existe el schema: {SCHEMA_PATH.resolve()}"

    # 3) Load the schema and both prompt templates
    with open(SCHEMA_PATH, 'r', encoding='utf-8') as f:
        json_schema = json.load(f)
    with open('jsonFillPrompt.txt', 'r', encoding='utf-8') as f:
        initial_instructions = f.read()
    with open('jsonRefinePrompt.txt', 'r', encoding='utf-8') as f:
        refine_instructions = f.read()

    # 4) PDF -> text. The path is passed to PdfReader directly so that this cell
    #    does not leave an open file handle behind.
    reader = PdfReader(PDF_PATH)
    # Pages with no extractable text contribute just the newline separator.
    legal_doc = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    # 5) Chunking
    legal_doc_chunks = chunk_text(legal_doc)
    print(f"Total chunks: {len(legal_doc_chunks)}")
    legal_doc_chunks = legal_doc_chunks[:MAX_CHUNKS]
    print(f"Procesando {len(legal_doc_chunks)} chunks (MAX_CHUNKS={MAX_CHUNKS}, DRY_RUN={DRY_RUN})")
    if not legal_doc_chunks:
        # Fail with a clear message instead of an IndexError on chunk [0] below.
        raise ValueError(f"No hay chunks para procesar (texto vacío o MAX_CHUNKS={MAX_CHUNKS}).")

    # 6) First request (full schema + first chunk)
    first_chunk_prompt = (
        initial_instructions
        + "\n\n**JSON SCHEMA:**\n"
        + json.dumps(json_schema, ensure_ascii=False, indent=2)
        + "\n\n**DOCUMENT:**\n"
        + legal_doc_chunks[0]
    )
    print("Procesando chunk 1...")
    initial_result = send_to_llm_initial(
        model=model,
        prompt=first_chunk_prompt,
        schema_dict=json_schema,
        first_chunk=legal_doc_chunks[0]
    )
    # Key point: never propagate raw text, only parsed JSON or a blank skeleton
    result = force_json_or_blank(initial_result, json_schema)

    # 7) Incremental refinement with MERGE (do not overwrite earlier data)
    for i, chunk in enumerate(legal_doc_chunks[1:], start=2):
        refine_prompt = (
            refine_instructions
            + "\n\n**EXISTING JSON:**\n"
            + json.dumps(result, ensure_ascii=False, indent=2)
            + "\n\n**NEW CHUNK:**\n"
            + chunk
        )
        print(f"Refinando con chunk {i}/{len(legal_doc_chunks)}...")
        refined = send_to_llm_refine(
            model=model,
            prompt=refine_prompt,
            existing_json=result,
            new_chunk=chunk,
            step_i=i
        )
        refined = force_json_or_blank(refined, json_schema)
        result  = deep_merge(result, refined)

    # 8) Save the result
    out_dir = Path("./processedDocs")
    out_dir.mkdir(parents=True, exist_ok=True)
    with (out_dir / "response.json").open("w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    # Second variant: literal backslash-n sequences inside strings become newline characters
    def _readable(o):
        """Recursively copy ``o``, replacing literal ``\\n`` sequences in strings with newlines."""
        if isinstance(o, dict):  return {k: _readable(v) for k, v in o.items()}
        if isinstance(o, list):  return [_readable(x) for x in o]
        if isinstance(o, str):   return o.replace("\\n", "\n")
        return o
    with (out_dir / "response_formatted.json").open("w", encoding="utf-8") as f:
        json.dump(_readable(result), f, ensure_ascii=False, indent=2)

    print("FIN. Guardados:\n - processedDocs/response.json\n - processedDocs/response_formatted.json")

✅ DRY_RUN=False (REAL). Se llama al modelo.
Total chunks: 719
Procesando 4 chunks (MAX_CHUNKS=4, DRY_RUN=False)
Procesando chunk 1...
[send_to_llm_initial] in≈4087 tokens
Refinando con chunk 2/4...
[send_to_llm_refine] in≈1075 tokens
Refinando con chunk 3/4...
[send_to_llm_refine] in≈1203 tokens
Refinando con chunk 4/4...
[send_to_llm_refine] in≈1329 tokens
FIN. Guardados:
 - processedDocs/response.json
 - processedDocs/response_formatted.json
