In [None]:
import os
# Point the Google client libraries at a credentials file via the environment.
# NOTE(review): ".json" looks like a redacted placeholder — set the real key
# file path before running this cell.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ".json"

import vertexai
from vertexai.preview import rag

# Initialise the Vertex AI SDK for this project/region and print whatever
# rag.list_corpora() returns (presumably a quick connectivity check — confirm).
vertexai.init(project="proyecto", location="us-east4")
print(list(rag.list_corpora()))

In [None]:
!pip install pandas

In [None]:
# =========================================================
# Classification of court rulings (full text) with Vertex AI (Gemini 2.0)
# - Source: .txt files in the GCS bucket SRC_BUCKET
# - Output: gs://<DST_BUCKET>/sentencias_motivos_vertex.csv (incremental merge)
# =========================================================
import os, re, json, time, csv, unicodedata, textwrap, tempfile
from typing import Any, Dict, List, Tuple, Optional

import pandas as pd
from IPython.display import display, clear_output, Markdown

# ---- GCS ----
from google.cloud import storage

# ---- Vertex AI ----
import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig

# -------------------- CONFIG --------------------
# Project and region handed to vertexai.init() inside run_from_gcs().
PROJECT_ID = "project"
# Gemini 2.0 is usually served from us-central1; change this if you have it in another region.
LOCATION   = "us-central1"
MODEL_ID   = "gemini-2.0-flash"  # or "gemini-2.0-pro"

SRC_BUCKET = "bucket"   # the .txt files are read from here
DST_BUCKET = "bucket2"        # the CSV is uploaded here
DST_BLOB_CSV = "sentencias_motivos_vertex.csv"

# Local file to which log_block() appends every prompt/response pair.
LOG_PATH   = "/dataset/log_vertex.txt"

# Fixed catalogue of grounds ("motivos"). These exact strings are embedded in
# the prompt, used as the schema enum and written to the output CSV.
# NOTE(review): the accented entries appear mis-encoded (e.g. "√≥" where "ó"
# would be expected) — confirm the file encoding before relying on them.
MOTIVOS = [
    "acreditaci√≥n de la deuda",
    "usura",
    "abusividad del clausulado",
    "transparencia",
    "legitimaci√≥n activa",
    "prescripci√≥n",
    "validez de la firma",
    "legitimaci√≥n pasiva",
    "requerimiento previo derecho al honor",
]

# UI
VERBOSE_UI = True   # enables the LiveUI status panel
UI_WRAP = 110       # wrap width used for the JSON shown in the panel
RAW_UI_MAX = 2500  # set to None to show the whole raw response

# -------------------- Dynamic UI --------------------
def _bar(p: float, width: int = 28) -> str:
    p = min(max(p, 0.0), 1.0)
    filled = int(round(p * width))
    return "‚ñà" * filled + "‚ñë" * (width - filled)

class LiveUI:
    """Status panel for a notebook cell, redrawn in place on every update.

    Each call to :meth:`show` clears the cell output and displays a fresh
    Markdown block. When ``enabled`` is false, :meth:`show` does nothing.
    """

    def __init__(self, enabled=True):
        self.enabled = enabled

    def _truncate(self, s, max_chars=1200):
        """Return ``s`` cut to ``max_chars`` characters plus a truncation marker.

        A falsy ``s`` (e.g. ``None``) is treated as the empty string; text
        that already fits is returned unchanged.
        """
        text = s or ""
        if len(text) > max_chars:
            return text[:max_chars] + " ‚Ä¶[+trunc]"
        return text

    def show(self, archivo, fase, detalle=None, ultimo_json=None, ultimo_raw=None, progress=None):
        """Redraw the panel.

        Args:
            archivo: Name of the file being processed.
            fase: Label of the current phase.
            detalle: Optional free-text detail line.
            ultimo_json: Optional cleaned JSON text; truncated to 2000
                characters and wrapped to ``UI_WRAP`` columns.
            ultimo_raw: Optional literal model response; truncated to
                ``RAW_UI_MAX`` characters unless that constant is ``None``.
            progress: Optional fraction used for the percentage and bar.
        """
        if not self.enabled:
            return
        clear_output(wait=True)

        sections = [
            f"**üìÑ Archivo:** `{archivo}`",
            f"**‚öôÔ∏è Fase:** {fase}",
        ]

        if progress is not None:
            percent = int(progress * 100)
            sections.append(f"**Progreso:** `{percent:>3d}%`  `{_bar(progress)}`")

        if detalle:
            sections.append(f"**‚ÑπÔ∏è** {detalle}")

        if ultimo_json:
            clipped = self._truncate(ultimo_json, 2000)
            wrapped = "\n".join(textwrap.wrap(clipped, width=UI_WRAP))
            sections.append("**üß† Respuesta (JSON limpio):**\n\n```json\n" + wrapped + "\n```")

        if ultimo_raw is not None:
            if RAW_UI_MAX is not None and len(ultimo_raw) > RAW_UI_MAX:
                raw_view = ultimo_raw[:RAW_UI_MAX] + " ‚Ä¶[+trunc]"
            else:
                raw_view = ultimo_raw
            sections.append("**üìú Respuesta LITERAL del modelo:**\n\n```\n" + raw_view + "\n```")

        display(Markdown("\n\n".join(sections)))

ui = LiveUI(enabled=VERBOSE_UI)

# -------------------- Utilidades --------------------
def normalize_ws(s: str) -> str:
    """Tidy horizontal whitespace in ``s`` without touching line structure.

    Runs of two or more spaces/tabs collapse to a single space, spaces/tabs
    immediately before a newline are dropped, and the result is stripped.
    A falsy ``s`` (e.g. ``None``) yields the empty string.
    """
    text = s or ""
    collapsed = re.sub(r"[ \t]{2,}", " ", text)
    without_trailing = re.sub(r"[ \t]+\n", "\n", collapsed)
    return without_trailing.strip()

def _strip_accents(s: str) -> str:
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def _norm_text(s: str) -> str:
    """Normalise a label for catalogue lookup.

    The text is stripped, lower-cased and de-accented. The alias rules are
    then applied in order: whenever a rule's needle occurs anywhere in the
    current text, the WHOLE text is replaced by that rule's canonical form
    (later rules see the replaced text).
    """
    alias_rules = (
        ("clausulas", "clausulado"),
        ("clausula", "clausulado"),
        ("abusivas", "abusividad"),
        ("abusiva", "abusividad"),
        ("transparente", "transparencia"),
        ("transparencia material", "transparencia"),
        ("legitimacion activa de la parte actora", "legitimacion activa"),
        ("legitimacion pasiva de la parte demandada", "legitimacion pasiva"),
        ("validez de la firma digital", "validez de la firma"),
        ("requerimiento previo", "requerimiento previo derecho al honor"),
        ("derecho al honor", "requerimiento previo derecho al honor"),
    )
    normalized = _strip_accents(s.strip().lower())
    for needle, canonical in alias_rules:
        if needle in normalized:
            normalized = canonical
    return normalized

_CANON_BY_NORM = { _norm_text(m): m for m in MOTIVOS }

def canonizar_motivo(m: str) -> Optional[str]:
    """Map a free-form label to its catalogue entry, or ``None`` if no match.

    Non-string or blank input gives ``None``. Otherwise the label is
    normalised with ``_norm_text``; an exact hit in ``_CANON_BY_NORM`` wins,
    and failing that the first catalogue key contained in the normalised
    label is used.
    """
    if not (isinstance(m, str) and m.strip()):
        return None
    normalized = _norm_text(m)
    exact = _CANON_BY_NORM.get(normalized)
    if exact is not None:
        return exact
    # Fallback: substring containment, in catalogue order.
    return next(
        (canon for key, canon in _CANON_BY_NORM.items() if key in normalized),
        None,
    )

def log_block(path, header, prompt_full, response_full):
    """Append one timestamped prompt/response record to the log file at ``path``.

    Args:
        path: Log file path. Its parent directory is created if missing; a
            bare filename (no directory component) is written relative to
            the current working directory.
        header: Short label written after the timestamp.
        prompt_full: Full prompt text; ``None`` is logged as empty.
        response_full: Full response text; ``None`` is logged as empty.
    """
    parent = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so only create a parent
    # directory when the path actually has one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    ts = time.strftime("%Y-%m-%d %H:%M:%S")
    record = (
        f"{ts} | {header}\n",
        "--- PROMPT START ---\n",
        (prompt_full or "") + "\n",
        "--- PROMPT END ---\n",
        "--- RESPONSE START ---\n",
        (response_full or "") + "\n",
        "--- RESPONSE END ---\n",
    )
    with open(path, "a", encoding="utf-8") as f:
        f.writelines(record)

# -------------------- GCS helpers --------------------
_storage = storage.Client()

def gcs_list_txt(bucket_name: str, prefix: Optional[str] = None) -> List[str]:
    """Return the sorted names of blobs in ``bucket_name`` ending in ``.txt``.

    The extension check is case-insensitive; ``prefix`` is forwarded to the
    listing call unchanged.
    """
    bucket = _storage.bucket(bucket_name)
    return sorted(
        blob.name
        for blob in _storage.list_blobs(bucket, prefix=prefix)
        if blob.name.lower().endswith(".txt")
    )

def gcs_read_text(bucket_name: str, blob_name: str, encoding: str = "utf-8") -> str:
    """Download blob ``blob_name`` from ``bucket_name`` and return it as text decoded with ``encoding``."""
    return _storage.bucket(bucket_name).blob(blob_name).download_as_text(encoding=encoding)

def gcs_blob_exists(bucket_name: str, blob_name: str) -> bool:
    """Return the result of ``exists()`` for blob ``blob_name`` in ``bucket_name``."""
    target = _storage.bucket(bucket_name).blob(blob_name)
    return target.exists()

def gcs_upload_file(bucket_name: str, blob_name: str, local_path: str, content_type: Optional[str] = None):
    """Upload the local file ``local_path`` to ``gs://bucket_name/blob_name``.

    ``content_type`` is forwarded to the upload call as given.
    """
    destination = _storage.bucket(bucket_name).blob(blob_name)
    destination.upload_from_filename(local_path, content_type=content_type)

def gcs_download_to_temp(bucket_name: str, blob_name: str) -> str:
    """Download a blob to a new temporary file and return that file's path.

    The temporary file keeps the blob's extension and a ``gcs_`` prefix.
    The caller owns the returned file and must delete it when done. If the
    download fails, the temporary file is removed before the error is
    re-raised, so nothing is left behind.
    """
    bucket = _storage.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    fd, tmp_path = tempfile.mkstemp(prefix="gcs_", suffix=os.path.splitext(blob_name)[1])
    # mkstemp returns an open descriptor; close it so the download can
    # reopen the file by name.
    os.close(fd)
    try:
        blob.download_to_filename(tmp_path)
    except BaseException:
        # Do not leak the placeholder file on failure (the client library
        # may already have removed it, hence the existence check).
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return tmp_path

# -------------------- Prompt + Schema --------------------
def build_prompt_fulltext(texto: str) -> str:
    """Build the classification prompt for one full-text ruling.

    The prompt embeds the ``MOTIVOS`` catalogue as pretty-printed JSON, the
    expected output format and rules, and finally the ruling text (passed
    through ``normalize_ws``) between triple double quotes.
    """
    catalogo_json = json.dumps(MOTIVOS, ensure_ascii=False, indent=2)
    texto_limpio = normalize_ws(texto)
    return f"""Eres un clasificador jur√≠dico estricto.
Analiza la sentencia (texto completo) en espa√±ol y devuelve EXCLUSIVAMENTE un JSON con la clave "motivos".
Cada motivo debe pertenecer EXACTAMENTE a este cat√°logo y a ninguno m√°s:

{catalogo_json}

Formato preferido (admite tambi√©n lista simple):
{{
  "motivos": [
    {{"motivo": "xxx", "confianza": 0.0 a 1.0}},
    ...
  ]
}}

Reglas MUY IMPORTANTES:
- Responde en JSON V√ÅLIDO (RFC 8259) y NADA M√ÅS.
- Si NO hay motivos del cat√°logo, devuelve EXACTAMENTE: {{"motivos":[]}}.
- Devuelve nombres EXACTOS del cat√°logo; si dudas, elige el m√°s cercano del cat√°logo.
- M√°ximo 4 motivos relevantes.

TEXTO COMPLETO:
\"\"\"{texto_limpio}\"\"\""""

# Schema passed as `response_schema` on the first (strict) attempt in
# vertex_generate_json(). It describes an object with a required "motivos"
# array whose items are either a catalogue string, or an object with a
# required "motivo" (catalogue string) and an optional numeric "confianza".
# NOTE(review): confirm that "oneOf" is a keyword accepted by Vertex AI's
# response_schema. If the SDK rejects it, the strict attempt raises and
# vertex_generate_json() silently falls through to its schema-less attempt.
RESPONSE_SCHEMA: Dict[str, Any] = {
    "type": "object",
    "properties": {
        "motivos": {
            "type": "array",
            "items": {
                "oneOf": [
                    {"type": "string", "enum": MOTIVOS},
                    {
                        "type": "object",
                        "properties": {
                            "motivo": {"type": "string", "enum": MOTIVOS},
                            "confianza": {"type": "number"}
                        },
                        "required": ["motivo"]
                    }
                ]
            }
        }
    },
    "required": ["motivos"]
}

# -------------------- Llamada a Vertex --------------------
def vertex_generate_json(prompt: str,
                         model_id: str = MODEL_ID,
                         temp_primary: float = 0.0,
                         temp_fallback: float = 0.2,
                         tok_primary: int = 2048,
                         tok_fallback: int = 3072) -> str:
    """Ask the model for a JSON answer, relaxing constraints on each retry.

    Attempts, in order:
      1. JSON MIME type plus ``RESPONSE_SCHEMA`` (primary temperature/tokens).
      2. JSON MIME type without a schema (fallback temperature/tokens).
      3. Unconstrained generation (fallback temperature/tokens).

    Errors raised by attempts 1 and 2 are remembered and swallowed; errors
    raised by attempt 3 propagate to the caller.

    Returns:
        The first non-empty, stripped response text.

    Raises:
        RuntimeError: If every attempt produced empty text. The message
            carries the last swallowed error, if any.
    """
    model = GenerativeModel(model_id)

    def _ask(settings):
        # The config is built here so that a rejected setting (e.g. the
        # schema) surfaces inside the caller's try block.
        config = GenerationConfig(**settings)
        response = model.generate_content([prompt], generation_config=config)
        return (response.text or "").strip()

    guarded_attempts = (
        # 1) Strict, with schema
        {
            "response_mime_type": "application/json",
            "response_schema": RESPONSE_SCHEMA,
            "temperature": temp_primary,
            "max_output_tokens": tok_primary,
        },
        # 2) JSON MIME type, no schema
        {
            "response_mime_type": "application/json",
            "temperature": temp_fallback,
            "max_output_tokens": tok_fallback,
        },
    )

    last_err = None
    for settings in guarded_attempts:
        try:
            text = _ask(settings)
        except Exception as err:
            last_err = err
            continue
        if text:
            return text

    # 3) Free-form; not guarded, so failures here reach the caller.
    text = _ask({"temperature": temp_fallback, "max_output_tokens": tok_fallback})
    if text:
        return text
    raise RuntimeError(f"Vertex devolvi√≥ vac√≠o: {last_err if last_err else 'sin detalle'}")

# -------------------- Parser + normalization --------------------
def extract_first_json_value(s: str) -> Optional[str]:
    """Return the first balanced ``{...}`` or ``[...]`` fragment found in ``s``.

    Markdown code fences (three backticks, optionally followed by ``json``)
    are removed first. The scan starts at the first ``{`` or ``[`` and
    counts only brackets of that same kind. Brackets inside double-quoted
    JSON strings are ignored, including after escaped quotes, so a value
    such as ``{"a": "}"}`` is returned whole instead of being cut at the
    quoted brace.

    Returns:
        The fragment text (not validated as JSON), or ``None`` when ``s``
        is empty, has no opening bracket, or the bracket is never closed.
    """
    if not s:
        return None
    s = s.strip()
    s = re.sub(r'^```(?:json)?\s*', '', s, flags=re.IGNORECASE | re.MULTILINE)
    s = re.sub(r'\s*```$', '', s, flags=re.IGNORECASE)

    start = next((i for i, ch in enumerate(s) if ch in "{["), None)
    if start is None:
        return None
    open_ch = s[start]
    close_ch = "}" if open_ch == "{" else "]"

    depth = 0
    in_string = False   # inside a double-quoted JSON string
    escaped = False     # previous character was a backslash inside a string
    for i in range(start, len(s)):
        ch = s[i]
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch == open_ch:
            depth += 1
        elif ch == close_ch:
            depth -= 1
            if depth == 0:
                return s[start:i + 1]
    return None

def parsear_motivos_desde_obj(obj) -> List[Tuple[str, float]]:
    """Extract ``(canonical_motivo, confidence)`` pairs from a parsed response.

    Accepts either ``{"motivos": [...]}`` or a bare list. Each item may be a
    plain string (confidence 1.0) or a dict with ``"motivo"`` and an optional
    ``"confianza"``. Items whose label does not map to the catalogue via
    ``canonizar_motivo`` are dropped. A confidence that cannot be converted
    to a float falls back to 1.0, and the value is clamped to [0, 1].

    Returns:
        The pairs in input order; an empty list for any other input shape.
    """
    if isinstance(obj, dict):
        arr = obj.get("motivos")
    elif isinstance(obj, list):
        arr = obj
    else:
        arr = None
    if not isinstance(arr, list):
        return []

    out: List[Tuple[str, float]] = []
    for item in arr:
        if isinstance(item, str):
            motivo, confianza = item, 1.0
        elif isinstance(item, dict):
            motivo = item.get("motivo")
            confianza = item.get("confianza", 1.0)
        else:
            continue

        canon = canonizar_motivo(motivo) if motivo else None
        if not canon:
            continue

        try:
            c = float(confianza)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures are treated as "no confidence given";
            # anything else (e.g. KeyboardInterrupt) must not be swallowed.
            c = 1.0
        out.append((canon, max(0.0, min(1.0, c))))
    return out

# -------------------- Pipeline por sentencia --------------------
def clasificar_sentencia_fulltext(nombre_archivo: str, texto: str) -> List[str]:
    """Classify one ruling and return up to four catalogue grounds.

    Builds the prompt, queries the model, appends the exchange to the log
    and refreshes the UI. The first JSON fragment of the response is then
    parsed; the grounds are ordered by descending confidence (ties keep the
    model's order), de-duplicated and capped at four.

    Returns:
        The list of canonical grounds; empty when the response contains no
        JSON fragment or the fragment cannot be parsed.
    """
    prompt = build_prompt_fulltext(texto)
    raw = vertex_generate_json(prompt)
    log_block(LOG_PATH, f"{nombre_archivo} | FULLTEXT", prompt, raw)

    frag_json = extract_first_json_value(raw)
    ui.show(
        archivo=nombre_archivo,
        fase="Inferencia completada",
        ultimo_json=frag_json or "{}",
        ultimo_raw=raw,
        progress=1.0,
    )

    if not frag_json:
        return []
    try:
        obj = json.loads(frag_json)
    except Exception:
        return []

    # reverse=True keeps the original relative order of equal confidences.
    ranked = sorted(parsear_motivos_desde_obj(obj), key=lambda par: par[1], reverse=True)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    unicos = list(dict.fromkeys(motivo for motivo, _ in ranked))
    return unicos[:4]

# -------------------- MAIN (lee GCS, sube CSV a GCS con merge incremental) --------------------
def run_from_gcs(prefix: Optional[str] = None):
    """Classify every ``.txt`` ruling in the source bucket and publish the CSV.

    Steps:
      1. List the ``.txt`` blobs in ``SRC_BUCKET`` (optionally under ``prefix``).
      2. Classify each one; blobs that cannot be read are reported and skipped.
      3. If the destination CSV already exists, merge with it by ``nombre``,
         with freshly classified rows replacing older ones.
      4. Upload the merged CSV to ``gs://DST_BUCKET/DST_BLOB_CSV``.

    The previous CSV is read with every column as text and without NA
    inference. Otherwise pandas would turn numeric-looking names (e.g.
    ``"00123"``) into integers and the ``"null"`` filler into NaN, which
    breaks the de-duplication by ``nombre`` and changes stored values.

    Args:
        prefix: Optional blob-name prefix that restricts the listing.
    """
    columnas = ["nombre", "motivo1", "motivo2", "motivo3", "motivo4"]

    # Vertex
    vertexai.init(project=PROJECT_ID, location=LOCATION)

    # 1) List the .txt blobs in the source bucket
    txt_blobs = gcs_list_txt(SRC_BUCKET, prefix=prefix)
    if not txt_blobs:
        print(f"‚ö†Ô∏è No se encontraron .txt en gs://{SRC_BUCKET}/{prefix or ''}")
        return

    total = len(txt_blobs)
    rows = []  # (nombre, motivo1..motivo4)

    for idx, blob_name in enumerate(txt_blobs, start=1):
        base = os.path.basename(blob_name)
        nombre_sin_ext = os.path.splitext(base)[0]

        ui.show(archivo=base, fase=f"Procesando {idx}/{total}",
                detalle=f"Leyendo gs://{SRC_BUCKET}/{blob_name}‚Ä¶", progress=(idx-1)/total)

        try:
            texto = gcs_read_text(SRC_BUCKET, blob_name)
        except Exception as e:
            ui.show(archivo=base, fase=f"ERROR lectura {idx}/{total}",
                    detalle=f"No se pudo leer el blob: {e}", progress=(idx-1)/total)
            continue

        motivos = clasificar_sentencia_fulltext(base, texto)
        # Pad to exactly four columns with the "null" filler.
        motivos = (motivos + ["null", "null", "null", "null"])[:4]
        rows.append((nombre_sin_ext, *motivos))

        ui.show(archivo=base, fase=f"Clasificada {idx}/{total}",
                detalle=f"{nombre_sin_ext} ‚Üí {motivos}", progress=idx/total)

    # 2) DataFrame with the new rows
    df_new = pd.DataFrame(rows, columns=columnas)

    # 3) Incremental merge by 'nombre' when a previous CSV exists
    if gcs_blob_exists(DST_BUCKET, DST_BLOB_CSV):
        tmp_old = gcs_download_to_temp(DST_BUCKET, DST_BLOB_CSV)
        try:
            # dtype=str + keep_default_na=False: keep names and "null"
            # fillers exactly as they were written.
            df_old = pd.read_csv(tmp_old, dtype=str, keep_default_na=False)
        except Exception as e:
            # Best effort, as before: an unreadable previous CSV is treated
            # as empty, but this is now reported instead of being silent.
            print(f"Aviso: no se pudo leer el CSV previo ({e}); se parte solo de las filas nuevas")
            df_old = pd.DataFrame(columns=columnas)
        finally:
            if os.path.exists(tmp_old):
                os.remove(tmp_old)
        # Keep old rows whose name was not re-classified, then append the new ones.
        df_merged = pd.concat([df_old[~df_old["nombre"].isin(df_new["nombre"])], df_new], ignore_index=True)
    else:
        df_merged = df_new

    # 4) Upload the final CSV to the destination bucket
    fd, tmp_out = tempfile.mkstemp(prefix="merged_", suffix=".csv")
    # Close the descriptor so pandas can open the file by name on any platform.
    os.close(fd)
    try:
        df_merged.to_csv(tmp_out, index=False, encoding="utf-8")
        gcs_upload_file(DST_BUCKET, DST_BLOB_CSV, tmp_out, content_type="text/csv")
    finally:
        if os.path.exists(tmp_out):
            os.remove(tmp_out)

    ui.show(archivo="(todos)", fase="Completado",
            detalle=f"CSV final ‚Üí gs://{DST_BUCKET}/{DST_BLOB_CSV}", progress=1.0)
    print(f"CSV final: gs://{DST_BUCKET}/{DST_BLOB_CSV}")

run_from_gcs(prefix=None)