In [22]:
import google.generativeai as genai
from dotenv import load_dotenv
import os, json
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [23]:

def get_gemini_model(model_name="gemini-2.0-flash-001"):
    """Build a Gemini ``GenerativeModel`` configured from the local ``.env`` file.

    Args:
        model_name: Name of the Gemini model to instantiate.

    Returns:
        A ``genai.GenerativeModel`` for ``model_name``.

    Raises:
        RuntimeError: If ``GOOGLE_API_KEY`` is missing or empty in the environment.
    """
    # Load variables from .env into the process environment before reading the key.
    load_dotenv()
    key = os.getenv("GOOGLE_API_KEY")
    if key:
        genai.configure(api_key=key)
        return genai.GenerativeModel(model_name)
    raise RuntimeError("Falta GOOGLE_API_KEY en .env")

In [24]:
def get_json_context(path_schema):
    """Load a JSON schema file and return it as a prompt-ready string.

    Args:
        path_schema: Path of the JSON file to read (opened as UTF-8).

    Returns:
        The header ``**JSON SCHEMA TO FOLLOW:**`` followed by a newline and the
        schema re-serialised with ``json.dumps`` (compact, ASCII-escaped).
    """
    with open(path_schema, 'r', encoding='utf-8') as schema_file:
        schema = json.load(schema_file)
    # Round-tripping through json guarantees the embedded text is valid JSON.
    serialised = json.dumps(schema)
    return "**JSON SCHEMA TO FOLLOW:**\n" + serialised

In [25]:
# def chunk_text(texto):
#     splitter = RecursiveCharacterTextSplitter(
#         chunk_size=400,
#         chunk_overlap=50,
#         separators=["\n\n", "\n", ".", " ", ""]
#     )
#     chunks = splitter.split_text(texto)
#     return chunks
def chunk_text(texto):
    """Split a document into large, overlapping chunks along semantic boundaries.

    Args:
        texto: Full document text to split.

    Returns:
        The list of chunk strings produced by ``RecursiveCharacterTextSplitter``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=8000,
        chunk_overlap=500,  # a larger overlap keeps sentences from being cut in half
        # Semantic separators first. The trailing "" mirrors the splitter's
        # default separator list and acts as a character-level fallback, so a
        # run of text containing none of the other separators is still cut
        # down to chunk_size instead of being emitted as one oversized chunk.
        separators=["\n\n", "ARTICULO", "TITULO", "\n", ".", " ", ""],
    )
    return splitter.split_text(texto)

In [26]:
import google.generativeai as genai
import sys

# Report the interpreter and SDK versions used by this kernel.
print(f"Versión de Python: {sys.version}")
print(f"Versión de google-generativeai: {genai.__version__}")

# List every model that advertises support for generateContent.
print("\n--- MODELOS DISPONIBLES ---")
try:
    usable_models = (
        candidate
        for candidate in genai.list_models()
        if 'generateContent' in candidate.supported_generation_methods
    )
    for candidate in usable_models:
        print(f"- {candidate.name}")
except Exception as err:
    # Best-effort diagnostic cell: report the failure instead of aborting.
    print(f"Error listando modelos: {err}")

Versión de Python: 3.13.3 (main, Apr  8 2025, 13:54:08) [Clang 17.0.0 (clang-1700.0.13.3)]
Versión de google-generativeai: 0.8.5

--- MODELOS DISPONIBLES ---
- models/gemini-2.5-flash
- models/gemini-2.5-pro
- models/gemini-2.0-flash-exp
- models/gemini-2.0-flash
- models/gemini-2.0-flash-001
- models/gemini-2.0-flash-exp-image-generation
- models/gemini-2.0-flash-lite-001
- models/gemini-2.0-flash-lite
- models/gemini-2.0-flash-lite-preview-02-05
- models/gemini-2.0-flash-lite-preview
- models/gemini-exp-1206
- models/gemini-2.5-flash-preview-tts
- models/gemini-2.5-pro-preview-tts
- models/gemma-3-1b-it
- models/gemma-3-4b-it
- models/gemma-3-12b-it
- models/gemma-3-27b-it
- models/gemma-3n-e4b-it
- models/gemma-3n-e2b-it
- models/gemini-flash-latest
- models/gemini-flash-lite-latest
- models/gemini-pro-latest
- models/gemini-2.5-flash-lite
- models/gemini-2.5-flash-image-preview
- models/gemini-2.5-flash-image
- models/gemini-2.5-flash-preview-09-2025
- models/gemini-2.5-flash-lite-

In [27]:
# ====== CONFIG ======
# DRY_RUN: when True the pipeline simulates LLM answers; False calls the real model.
DRY_RUN = False
# MODEL: Gemini model name; can be swapped for a cheaper one.
MODEL = "gemini-2.0-flash-exp"
# MAX_CHUNKS: upper bound on chunks processed per run (keep it small at first).
MAX_CHUNKS = 4

# Previous, smaller generation settings kept for reference:
# GENERATION_CONFIG = {
#     "max_output_tokens": 2000,
#     "temperature": 0.2
# }

GENERATION_CONFIG = dict(
    max_output_tokens=8192,                 # larger output budget
    temperature=0.1,                        # low creativity for precision
    response_mime_type="application/json",  # key setting: forces native JSON output
)

# ====== UTILITY: rough token estimate ======
# Coarse heuristic: ~4 characters per token (varies by language, but good
# enough to avoid blowing the input budget).
def approx_tokens(text: str) -> int:
    """Return a rough token count for ``text`` (length / 4), never below 1."""
    estimate = int(len(text) / 4)
    return estimate if estimate > 1 else 1

# ====== RESPUESTA SIMULADA ======
import json, re, time
def clean_llm_json(text: str) -> str:
    """Strip Markdown code fences from an LLM reply and return the inner text.

    A ```json ... ``` fence is unwrapped first (case-insensitive tag); then any
    remaining plain ``` ... ``` fence is unwrapped. Text without fences is
    returned stripped of surrounding whitespace.
    """
    fence_patterns = (
        (r"```json(.*?)```", re.DOTALL | re.IGNORECASE),
        (r"```(.*?)```", re.DOTALL),
    )
    for pattern, flags in fence_patterns:
        found = re.search(pattern, text, flags=flags)
        if found:
            text = found.group(1).strip()
    return text.strip()

def fake_initial_json(schema_dict, first_chunk):
    """Build a simulated first-pass result without calling the LLM.

    Args:
        schema_dict: Schema mapping; only its first six keys are echoed back.
        first_chunk: Text of the first chunk; its first 280 characters are kept.

    Returns:
        A dict flagged with ``_simulated`` that mimics the general shape of a draft.
    """
    top_level_keys = list(schema_dict)[:6]
    preview = first_chunk[:280]
    simulated = dict(_simulated=True, status="draft")
    simulated["schema_keys"] = top_level_keys
    simulated["notes"] = "Simulación sin LLM. Ajustá prompts y pipeline hasta estar seguro."
    simulated["chunk_preview"] = preview
    return simulated

def fake_refine(existing, new_chunk, i):
    """Simulate one incremental refinement step without calling the LLM.

    Args:
        existing: Current result: a dict, a JSON string (optionally fenced),
            or any other value.
        new_chunk: Text of the chunk being "processed"; its first 200
            characters are recorded.
        i: Step number, used to name the added ``_refined_step_<i>`` key.

    Returns:
        A new dict holding the previous content plus the step marker. The
        ``existing`` argument is never modified. Values that are not (or do
        not parse to) a dict are preserved under the ``raw`` key.
    """
    if isinstance(existing, str):
        try:
            existing = json.loads(clean_llm_json(existing))
        except Exception:
            # Simulation helper: any parse failure just keeps the raw text.
            existing = {"_simulated": True, "raw": existing}
    if isinstance(existing, dict):
        # Shallow copy so the caller's dict is not mutated by the simulation.
        refined = dict(existing)
    else:
        # Lists/scalars cannot take a string key; wrap them instead of crashing.
        refined = {"_simulated": True, "raw": existing}
    refined[f"_refined_step_{i}"] = {"chunk_preview": new_chunk[:200]}
    return refined

# ====== SEND WRAPPERS ======
# Single entry point for model calls: simulate when DRY_RUN is set, otherwise call the model.
def send_to_llm_initial(model, prompt, schema_dict, first_chunk):
    """Send the first-chunk prompt, or simulate the answer when ``DRY_RUN`` is set.

    Args:
        model: Object exposing ``generate_content(prompt, generation_config=...)``.
        prompt: Full prompt text to send.
        schema_dict: Schema mapping, used only by the simulated answer.
        first_chunk: First chunk text, used only by the simulated answer.

    Returns:
        In dry-run mode, the dict from ``fake_initial_json``; otherwise the
        model's reply text with code fences removed (a string).
    """
    print(f"[send_to_llm_initial] in≈{approx_tokens(prompt)} tokens")
    if not DRY_RUN:
        response = model.generate_content(prompt, generation_config=GENERATION_CONFIG)
        return clean_llm_json(response.text)
    # Short pause to imitate request latency in the simulation.
    time.sleep(0.3)
    return fake_initial_json(schema_dict, first_chunk)

def send_to_llm_refine(model, prompt, existing_json, new_chunk, step_i):
    """Send a refinement prompt, or simulate the answer when ``DRY_RUN`` is set.

    Args:
        model: Object exposing ``generate_content(prompt, generation_config=...)``.
        prompt: Full prompt text to send.
        existing_json: Result accumulated so far, used only by the simulation.
        new_chunk: Chunk being integrated, used only by the simulation.
        step_i: Step number, used only by the simulation.

    Returns:
        In dry-run mode, the value from ``fake_refine``; otherwise the model's
        reply text with code fences removed (a string).
    """
    print(f"[send_to_llm_refine] in≈{approx_tokens(prompt)} tokens")
    if not DRY_RUN:
        response = model.generate_content(prompt, generation_config=GENERATION_CONFIG)
        return clean_llm_json(response.text)
    # Short pause to imitate request latency in the simulation.
    time.sleep(0.2)
    return fake_refine(existing_json, new_chunk, step_i)

In [28]:
# ====== MAIN ROBUSTO: parseo estricto + merge incremental ======
import os, json, re
from pathlib import Path
from copy import deepcopy
from pypdf import PdfReader

# -------- Helpers (robustos) --------
def clean_llm_json(text: str) -> str:
    """Remove ```json ... ``` or ``` ... ``` fences and return the inner text.

    The tagged fence (case-insensitive ``json``) is unwrapped first, then any
    plain fence still present. The result is stripped of surrounding whitespace.
    """
    tagged_fence = re.compile(r"```json(.*?)```", re.DOTALL | re.IGNORECASE)
    plain_fence = re.compile(r"```(.*?)```", re.DOTALL)

    tagged = tagged_fence.search(text)
    if tagged is not None:
        text = tagged.group(1).strip()

    plain = plain_fence.search(text)
    if plain is not None:
        text = plain.group(1).strip()

    return text.strip()

def extract_json_block(text: str) -> str:
    """Extract the JSON object or array embedded in an LLM reply.

    Code fences are removed first. The widest ``{...}`` span and the widest
    ``[...]`` span are then considered, in that order, and the first one that
    actually parses as JSON is returned. Checking validity matters for a
    top-level array of objects such as ``[{"a": 1}, {"b": 2}]``: the object
    span alone (``{"a": 1}, {"b": 2}``) is not valid JSON, so the array span
    is returned instead.

    Args:
        text: Raw reply text.

    Returns:
        The extracted JSON text. If no span parses, the first span found
        (object before array) is returned unchanged; if there is no span at
        all, the fence-stripped text is returned.
    """
    text = clean_llm_json(text)
    spans = []
    for pat in (r"\{.*\}", r"\[.*\]"):
        m = re.search(pat, text, flags=re.DOTALL)
        if m:
            spans.append(m.group(0))
    for span in spans:
        try:
            json.loads(span)
        except (ValueError, RecursionError):
            continue
        return span
    if spans:
        # Nothing parsed: fall back to the first span and let the caller decide.
        return spans[0]
    return text.strip()

def schema_to_blank_instance(schema):
    """Build an empty value shaped like ``schema`` (so raw text is never propagated).

    Args:
        schema: A JSON-schema-like dict; anything else yields ``None``.

    Returns:
        A dict of blank properties for objects (or for schemas that have
        ``properties`` but no recognised ``type``), ``[]`` for arrays, ``""``
        for strings, and ``None`` for every other case.
    """
    if not isinstance(schema, dict):
        return None

    def _blank_object(properties):
        # Recurse into each property to build its own blank value.
        return {name: schema_to_blank_instance(sub) for name, sub in properties.items()}

    declared = schema.get("type")
    if declared == "object":
        return _blank_object(schema.get("properties", {}))
    if declared == "array":
        return []
    if declared == "string":
        return ""
    is_scalar = declared in ("number", "integer", "boolean", "null")
    # Fallback: no usable 'type', but the schema looks like an object.
    if not is_scalar and "properties" in schema:
        return _blank_object(schema["properties"])
    return None

def force_json_or_blank(text_or_obj, schema):
    """Always return a dict or list, falling back to a blank skeleton of ``schema``.

    Args:
        text_or_obj: An already-parsed dict/list (returned as is) or any value
            whose string form may contain JSON.
        schema: JSON-schema-like dict used to build the fallback skeleton.

    Returns:
        The parsed JSON container when the text holds a valid object or array.
        Otherwise the skeleton from ``schema_to_blank_instance``; if that is
        not a container either (e.g. the schema has no usable type), ``{}``.
    """
    if isinstance(text_or_obj, (dict, list)):
        return text_or_obj
    candidate = extract_json_block(str(text_or_obj))
    try:
        parsed = json.loads(candidate)
    except (ValueError, RecursionError):
        parsed = None
    # Valid JSON can still be a scalar (null, "text", 3); only containers qualify.
    if isinstance(parsed, (dict, list)):
        return parsed
    blank = schema_to_blank_instance(schema)
    return blank if isinstance(blank, (dict, list)) else {}

def _is_blank_value(value):
    """Return True for placeholder values (None, "", [], {}) that carry no data."""
    return value is None or (isinstance(value, (str, list, dict)) and not value)


def deep_merge(dst, src):
    """Deep-merge ``src`` into ``dst`` without erasing previously collected data.

    Rules, applied key by key when both arguments are dicts:
      * dict + dict: merged recursively.
      * list + list: items of ``src`` not already present are appended;
        items are compared by their sorted-key JSON serialisation.
      * blank ``src`` value (None, "", [], {}) for a key that already exists:
        ignored, so a blank skeleton cannot wipe out real values.
      * anything else: the ``src`` value replaces the ``dst`` value.

    Args:
        dst: Accumulated result.
        src: New data to integrate.

    Returns:
        A new merged structure; neither argument is modified and no objects
        are shared with ``src``. If either argument is not a dict, a copy of
        ``src`` is returned, unless ``src`` is blank and ``dst`` is not, in
        which case a copy of ``dst`` is returned.
    """
    if not isinstance(dst, dict) or not isinstance(src, dict):
        if _is_blank_value(src) and not _is_blank_value(dst):
            return deepcopy(dst)
        return deepcopy(src)
    out = deepcopy(dst)
    for key, value in src.items():
        current = out.get(key)
        if key in out and isinstance(current, dict) and isinstance(value, dict):
            out[key] = deep_merge(current, value)
        elif key in out and isinstance(current, list) and isinstance(value, list):
            # De-duplicate using a canonical JSON fingerprint of each item.
            seen = {json.dumps(item, sort_keys=True, ensure_ascii=False) for item in current}
            for item in value:
                fingerprint = json.dumps(item, sort_keys=True, ensure_ascii=False)
                if fingerprint not in seen:
                    # Track it so repeats inside src are not appended twice.
                    seen.add(fingerprint)
                    current.append(deepcopy(item))
        elif key in out and _is_blank_value(value):
            # Keep what is already there; a placeholder adds no information.
            continue
        else:
            out[key] = deepcopy(value)
    return out

# -------- Fallback defaults, used only when the config cell has not defined them --------
# setdefault leaves any value that already exists in the notebook namespace untouched.
globals().setdefault("DRY_RUN", True)
globals().setdefault("MAX_CHUNKS", 2)
globals().setdefault("GENERATION_CONFIG", {"max_output_tokens": 800, "temperature": 0.2})


if __name__ == "__main__":
    # End-to-end pipeline: PDF -> text -> chunks -> initial JSON -> incremental
    # refinement with merge -> JSON files under ./processedDocs.

    # 0) Announce the run mode
    print("⚠️ DRY_RUN=True (simulación). No se llama al modelo." if DRY_RUN else "✅ DRY_RUN=False (REAL). Se llama al modelo.")

    # 1) Model. Honour the MODEL constant from the config cell when it exists.
    #    A dry run never calls the API, so it does not need a model or an API key.
    model_name = globals().get("MODEL", "gemini-2.0-flash-exp")
    model = None if DRY_RUN else get_gemini_model(model_name=model_name)

    # 2) Paths + existence checks (explicit exceptions: asserts vanish under -O)
    # Alternative input document:
    # PDF_PATH    = Path("../../Documentos/CODIGO PENAL DE LA NACION ARGENTINA.pdf")
    PDF_PATH    = Path("../../Documentos/Taller.pdf")
    SCHEMA_PATH = Path("../../JSONS/baseSchema.json")
    if not PDF_PATH.exists():
        raise FileNotFoundError(f"No existe el PDF: {PDF_PATH.resolve()}")
    if not SCHEMA_PATH.exists():
        raise FileNotFoundError(f"No existe el schema: {SCHEMA_PATH.resolve()}")

    # 3) Load the schema and the prompt templates
    with open(SCHEMA_PATH, 'r', encoding='utf-8') as f:
        json_schema = json.load(f)
    with open('jsonFillPrompt.txt', 'r', encoding='utf-8') as f:
        initial_instructions = f.read()
    with open('jsonRefinePrompt.txt', 'r', encoding='utf-8') as f:
        refine_instructions = f.read()

    # 4) PDF -> text. The file stays open while pages are read and is closed afterwards.
    with PDF_PATH.open("rb") as pdf_file:
        reader = PdfReader(pdf_file)
        legal_doc = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    # 5) Chunking
    legal_doc_chunks = chunk_text(legal_doc)
    print(f"Total chunks: {len(legal_doc_chunks)}")
    legal_doc_chunks = legal_doc_chunks[:MAX_CHUNKS]
    print(f"Procesando {len(legal_doc_chunks)} chunks (MAX_CHUNKS={MAX_CHUNKS}, DRY_RUN={DRY_RUN})")
    if not legal_doc_chunks:
        # Fail with a clear message instead of an IndexError further down.
        raise ValueError(
            f"No hay chunks para procesar (PDF sin texto extraíble o MAX_CHUNKS=0): {PDF_PATH.resolve()}"
        )

    # 6) First request (full schema + first chunk)
    first_chunk_prompt = (
        initial_instructions
        + "\n\n**JSON SCHEMA:**\n"
        + json.dumps(json_schema, ensure_ascii=False, indent=2)
        + "\n\n**DOCUMENT:**\n"
        + legal_doc_chunks[0]
    )
    print("Procesando chunk 1...")
    initial_result = send_to_llm_initial(
        model=model,
        prompt=first_chunk_prompt,
        schema_dict=json_schema,
        first_chunk=legal_doc_chunks[0]
    )
    # Key point: never propagate raw text, only parsed JSON
    result = force_json_or_blank(initial_result, json_schema)

    # 7) Incremental refinement with MERGE (do not overwrite earlier data)
    for i, chunk in enumerate(legal_doc_chunks[1:], start=2):
        refine_prompt = (
            refine_instructions
            + "\n\n**EXISTING JSON:**\n"
            + json.dumps(result, ensure_ascii=False, indent=2)
            + "\n\n**NEW CHUNK:**\n"
            + chunk
        )
        print(f"Refinando con chunk {i}/{len(legal_doc_chunks)}...")
        refined = send_to_llm_refine(
            model=model,
            prompt=refine_prompt,
            existing_json=result,
            new_chunk=chunk,
            step_i=i
        )
        refined = force_json_or_blank(refined, json_schema)
        result  = deep_merge(result, refined)

    # 8) Save the result
    out_dir = Path("./processedDocs")
    out_dir.mkdir(parents=True, exist_ok=True)
    with (out_dir / "response.json").open("w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    # Readable variant: only turns literal backslash-n sequences inside strings
    # into real line breaks; structure and other values are untouched.
    def _readable(o):
        """Return a copy of ``o`` with literal '\\n' in strings replaced by newlines."""
        if isinstance(o, dict):  return {k: _readable(v) for k, v in o.items()}
        if isinstance(o, list):  return [_readable(x) for x in o]
        if isinstance(o, str):   return o.replace("\\n", "\n")
        return o

    # Write the second file that the final message announces.
    with (out_dir / "response_formatted.json").open("w", encoding="utf-8") as f:
        json.dump(_readable(result), f, ensure_ascii=False, indent=2)

    print("FIN. Guardados:\n - processedDocs/response.json\n - processedDocs/response_formatted.json")

####################################################################

# if __name__ == "__main__":
#     # 0) Aviso modo
#     print(f"⚠️ DRY_RUN={DRY_RUN} (Simulación)" if DRY_RUN else f"✅ DRY_RUN={DRY_RUN} (Real)")

#     # 1) Modelo
#     model = get_gemini_model(model_name="gemini-2.0-flash-001")    

#     # 2) Rutas + asserts
#     PDF_PATH    = Path("../../Documentos/Taller.pdf")
#     SCHEMA_PATH = Path("../../JSONS/baseSchema.json")

#     if not PDF_PATH.exists(): raise FileNotFoundError(f"No existe el PDF: {PDF_PATH}")
#     if not SCHEMA_PATH.exists(): raise FileNotFoundError(f"No existe el schema: {SCHEMA_PATH}")

#     # 3) Cargar schema y prompts
#     # NOTA: Solo necesitamos el prompt inicial ("fill"), ya no el de refinamiento,
#     # porque tratamos cada chunk como una extracción independiente.
#     with open(SCHEMA_PATH, 'r', encoding='utf-8') as f:
#         json_schema = json.load(f)
#     with open('jsonFillPrompt.txt', 'r', encoding='utf-8') as f:
#         base_instructions = f.read()

#     # 4) PDF -> Texto (Agregando marcas de página para mejor contexto)
#     reader = PdfReader(PDF_PATH.open("rb"))
#     legal_doc = ""
#     for i, page in enumerate(reader.pages):
#         # El [PAGINA X] ayuda al modelo si necesita citar ubicaciones
#         legal_doc += f"\n[PAGINA {i+1}]\n" + (page.extract_text() or "")

#     # 5) Chunking (Usará tu nueva función con chunks grandes)
#     legal_doc_chunks = chunk_text(legal_doc)
#     print(f"Total chunks generados: {len(legal_doc_chunks)}")
    
#     # Recorte según MAX_CHUNKS
#     chunks_to_process = legal_doc_chunks[:MAX_CHUNKS]
#     print(f"Procesando {len(chunks_to_process)} chunks (Estrategia: Map-Reduce)...")

#     # 6) Bucle Principal: Extracción Paralela + Merge Secuencial
#     final_json = {} # Nuestro acumulador (estado final)

#     for i, chunk in enumerate(chunks_to_process):
#         print(f"--- Procesando Chunk {i+1}/{len(chunks_to_process)} ---")

#         # Construcción del Prompt:
#         # Le damos las instrucciones + el esquema + EL FRAGMENTO ACTUAL.
#         # No le pasamos el JSON anterior para no confundirlo ni gastar tokens.
#         prompt = (
#     base_instructions
#     + "\n\n**CURRENT TEXT SEGMENT TO ANALYZE:**\n"
#     + chunk
# )

#         # Llamada al LLM (Reutilizamos send_to_llm_initial que ya tenés)
#         resp_str = send_to_llm_initial(
#             model=model,
#             prompt=prompt,
#             schema_dict=json_schema,
#             first_chunk=chunk 
#         )

#         # Parseo y Merge
#         try:
#             # Limpieza básica por si el modelo metió ```json ... ```
#             cleaned = clean_llm_json(resp_str)
#             chunk_data = json.loads(cleaned)
            
#             # MAGIA: Unimos los datos de este chunk al acumulador principal
#             final_json = deep_merge(final_json, chunk_data)
#             print(f"   -> Datos del Chunk {i+1} integrados correctamente.")
            
#         except json.JSONDecodeError:
#             print(f"   ❌ Error: El modelo no devolvió un JSON válido en el chunk {i+1}.")
#         except Exception as e:
#             print(f"   ❌ Error inesperado procesando chunk {i+1}: {e}")

#     # 7) Guardado Final
#     out_dir = Path("./processedDocs")
#     out_dir.mkdir(parents=True, exist_ok=True)
    
#     output_path = out_dir / "response.json"
#     with output_path.open("w", encoding="utf-8") as f:
#         json.dump(final_json, f, ensure_ascii=False, indent=2)

#     print(f"\nFIN. Resultado guardado en: {output_path.resolve()}")


✅ DRY_RUN=False (REAL). Se llama al modelo.
Total chunks: 1
Procesando 1 chunks (MAX_CHUNKS=4, DRY_RUN=False)
Procesando chunk 1...
[send_to_llm_initial] in≈5971 tokens


ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0, model: gemini-2.0-flash-exp
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0, model: gemini-2.0-flash-exp
Please retry in 55.437058396s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_input_token_count"
  quota_id: "GenerateContentInputTokensPerModelPerMinute-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash-exp"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
}
violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash-exp"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
}
violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash-exp"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
}
, retry_delay {
  seconds: 55
}
]