# Editor CSV ‚Äî BO CABA

Mini-app para **editar, etiquetar y depurar** datasets en CSV, con:
- Carga desde **CSV_INPUT** (en Drive).
- **Editar** fragmento, **etiquetar** (1/0), marcar **ambiguo**, **temas**, **notas**.
- **Eliminar registro** (no destructivo; lista de excluidos).
- **Guardado incremental** (ediciones.csv + log), **reanudaci√≥n** (state.json).
- **Exportar limpio** (merge de ediciones y exclusiones a un nuevo CSV).

**Importante**: No toca el CSV original.


### **Toma los CSV originales (val/test) y crea una muestra aleatoria (‚Äúmanifest‚Äù) para etiquetar.**
Usarlo solo la primera vez. En las siguientes pasar directo a Gradio.

In [None]:
# === Manifest TEST (sample aleatoria de ~1000 IDs) ===
import os, pandas as pd

BASE = "/content/drive/MyDrive/IA/Proyectos/An√°lisis Bolet√≠n Oficial/boletin-ml"
CSV_TEST = f"{BASE}/data/labels/etiquetas_test.csv"  # CSV original (NO el CLEAN)
MANIFEST_PATH = f"{BASE}/labels/annot/test_eval/manifest_eval_ids.csv"
N = 1000                    # objetivo de filas a etiquetar
RNG = 2025                  # semilla reproducible

def ensure_id(df: pd.DataFrame) -> pd.DataFrame:
    if 'id' not in df.columns:
        ids = []
        for i, r in df.reset_index().iterrows():
            base = r.get('origen_pdf', '') or ''
            ids.append(f"row_{i}_{os.path.splitext(base)[0]}")
        df = df.copy()
        df['id'] = ids
    return df

df = pd.read_csv(CSV_TEST, dtype=str, keep_default_na=False, encoding="utf-8-sig", sep=None, engine="python")
df = ensure_id(df)

n_take = min(N, len(df))
sample = df.sample(n=n_take, random_state=RNG, replace=False)[['id']]

os.makedirs(os.path.dirname(MANIFEST_PATH), exist_ok=True)
sample.to_csv(MANIFEST_PATH, index=False, encoding="utf-8-sig")

print(f"Manifest TEST creado: {MANIFEST_PATH} | filas: {len(sample)}")

Manifest TEST creado: /content/drive/MyDrive/IA/Proyectos/An√°lisis Bolet√≠n Oficial/boletin-ml/labels/annot/test_eval/manifest_eval_ids.csv | filas: 1000


C√≥digo para a√±adir 200 filas m√°s para etiquetar. Usar solo una vez, luego pasar a Gradio.

In [None]:
# === Anexar +200 IDs nuevos al manifest TEST (sin duplicar) ===
import os, pandas as pd

BASE = "/content/drive/MyDrive/IA/Proyectos/An√°lisis Bolet√≠n Oficial/boletin-ml"
CSV_TEST = f"{BASE}/data/labels/etiquetas_test.csv"
MANIFEST_PATH = f"{BASE}/labels/annot/test_eval/manifest_eval_ids.csv"
N_NEW = 200
RNG = 2025

def ensure_id(df: pd.DataFrame) -> pd.DataFrame:
    if 'id' not in df.columns:
        ids = []
        for i, r in df.reset_index().iterrows():
            base = r.get('origen_pdf', '') or ''
            ids.append(f"row_{i}_{os.path.splitext(base)[0]}")
        df = df.copy()
        df['id'] = ids
    return df

df = pd.read_csv(CSV_TEST, dtype=str, keep_default_na=False, encoding="utf-8-sig", sep=None, engine="python")
df = ensure_id(df)
all_ids = set(df['id'])

cur = pd.read_csv(MANIFEST_PATH, dtype=str, keep_default_na=False) if os.path.exists(MANIFEST_PATH) else pd.DataFrame(columns=['id'])
have = set(cur['id']) if 'id' in cur.columns else set()

remaining = list(all_ids - have)
n_take = min(N_NEW, len(remaining))
added = pd.DataFrame(remaining).sample(n=n_take, random_state=RNG, replace=False).rename(columns={0:'id'})

out = pd.concat([cur[['id']], added[['id']]], ignore_index=True).drop_duplicates('id')
out.to_csv(MANIFEST_PATH, index=False, encoding="utf-8-sig")

print(f"Manifest actualizado: {MANIFEST_PATH} | antes: {len(cur)} | agregados: {n_take} | ahora: {len(out)}")

Manifest: /content/drive/MyDrive/IA/Proyectos/An√°lisis Bolet√≠n Oficial/boletin-ml/labels/annot/val_eval/manifest_eval_ids.csv
Antes: 400  |  Agregados: 100  |  Ahora: 500


In [None]:
# === Unificar test 2025: H1 + H2 en un solo CSV con IDs estables ===
import os, pandas as pd, hashlib
from google.colab import drive
drive.mount('/content/drive')

BASE = "/content/drive/MyDrive/IA/Proyectos/An√°lisis Bolet√≠n Oficial/boletin-ml"

# ‚Üê AJUSTAR si tienen otro nombre:
CSV_H1 = f"{BASE}/data/labels/etiquetas_test.csv"           # 1S-2025 (ya existente)
CSV_H2 = f"{BASE}/data/labels/etiquetas_test_H2_2025.csv"   # JUL-NOV 2025 (reci√©n procesado)

CSV_OUT = f"{BASE}/data/labels/etiquetas_test_2025YTD.csv"

def read_any(path):
    return pd.read_csv(path, dtype=str, keep_default_na=False,
                       encoding="utf-8-sig", sep=None, engine="python")

def ensure_id(df: pd.DataFrame) -> pd.DataFrame:
    if 'id' not in df.columns:
        # ID determin√≠stica con √≠ndice y PDF
        ids = []
        for i, r in df.reset_index().iterrows():
            base = (r.get('origen_pdf','') or '').replace('.pdf','')
            ids.append(f"row_{i}_{base}")
        df = df.copy()
        df['id'] = ids
    return df

# Cargar y marcar semestre
h1 = ensure_id(read_any(CSV_H1)).copy()
h1['source_semestre'] = 'H1_2025'

h2 = ensure_id(read_any(CSV_H2)).copy()
h2['source_semestre'] = 'H2_2025'

# Resolver colisiones de id entre H1 y H2 (poco probable, pero por las dudas)
ids_h1 = set(h1['id'])
dup_mask = h2['id'].isin(ids_h1)
if dup_mask.any():
    # Sufijar solo los que colisionan, registrando el remapeo
    def remap(row):
        if row['id'] in ids_h1:
            return f"{row['id']}_H2"
        return row['id']
    old2new = {}
    for i, row in h2[dup_mask].iterrows():
        old2new[row['id']] = f"{row['id']}_H2"
    h2.loc[dup_mask, 'id'] = h2[dup_mask].apply(remap, axis=1)
    # Guardar mapa por trazabilidad
    MAP_PATH = f"{BASE}/labels/annot/test_2025YTD_eval/id_remap_H2.csv"
    os.makedirs(os.path.dirname(MAP_PATH), exist_ok=True)
    pd.DataFrame([{'old_id':k,'new_id':v} for k,v in old2new.items()]).to_csv(MAP_PATH, index=False, encoding='utf-8-sig')
    print(f"‚ö†Ô∏è Colisiones de id resueltas: {len(old2new)}  -> {MAP_PATH}")

# Concatenar y ordenar por fecha si existe
cols = sorted(set(h1.columns) | set(h2.columns))
h1 = h1.reindex(columns=cols, fill_value='')
h2 = h2.reindex(columns=cols, fill_value='')
full = pd.concat([h1, h2], ignore_index=True)

# (Opcional) eliminar repetidos exactos por (origen_pdf, fragmento)
if 'origen_pdf' in full.columns and 'fragmento' in full.columns:
    before = len(full)
    full = full.drop_duplicates(subset=['origen_pdf','fragmento']).reset_index(drop=True)
    print(f"Dedup exacto origen_pdf+fragmento: {before-len(full)} filas eliminadas")

# Guardar
full.to_csv(CSV_OUT, index=False, encoding='utf-8-sig')
print(f"‚úÖ Unificado listo: {CSV_OUT} | filas: {len(full)}  (H1={len(h1)}, H2={len(h2)})")

Mounted at /content/drive
Dedup exacto origen_pdf+fragmento: 0 filas eliminadas
‚úÖ Unificado listo: /content/drive/MyDrive/IA/Proyectos/An√°lisis Bolet√≠n Oficial/boletin-ml/data/labels/etiquetas_test_2025YTD.csv | filas: 4044  (H1=2492, H2=1552)


In [None]:
# === Migrar ediciones ya hechas desde test_eval ‚Üí test_2025YTD_eval ===
import os, pandas as pd

BASE = "/content/drive/MyDrive/IA/Proyectos/An√°lisis Bolet√≠n Oficial/boletin-ml"
CSV_2025YTD = f"{BASE}/data/labels/etiquetas_test_2025YTD.csv"

# Origen (lo que ven√≠as usando):
OLD_TAG = "test_eval"   # ‚Üê NO tocar (carpeta existente)
OLD_DIR = f"{BASE}/labels/annot/{OLD_TAG}"
OLD_EDIC = f"{OLD_DIR}/ediciones.csv"
OLD_DELS = f"{OLD_DIR}/deleted_ids.csv"

# Nuevo tag
NEW_TAG = "test_2025YTD_eval"
NEW_DIR = f"{BASE}/labels/annot/{NEW_TAG}"
NEW_EDIC = f"{NEW_DIR}/ediciones.csv"
NEW_DELS = f"{NEW_DIR}/deleted_ids.csv"

os.makedirs(NEW_DIR, exist_ok=True)

df = pd.read_csv(CSV_2025YTD, dtype=str, keep_default_na=False, encoding='utf-8-sig')
ids_all = set(df['id'])

migrados = 0
if os.path.exists(OLD_EDIC):
    ed_old = pd.read_csv(OLD_EDIC, dtype=str, keep_default_na=False, encoding='utf-8-sig')
    ed_old = ed_old[ed_old['id'].isin(ids_all)].copy()
    migrados = len(ed_old)
    ed_old.to_csv(NEW_EDIC, index=False, encoding='utf-8-sig')
    print(f"‚úÖ Ediciones migradas: {migrados} ‚Üí {NEW_EDIC}")
else:
    print("‚ÑπÔ∏è No se encontr√≥ ediciones.csv del tag anterior; se empieza vac√≠o.")

if os.path.exists(OLD_DELS):
    dels_old = pd.read_csv(OLD_DELS, dtype=str, keep_default_na=False, encoding='utf-8-sig')
    dels_old = dels_old[dels_old['id'].isin(ids_all)].copy()
    dels_old.to_csv(NEW_DELS, index=False, encoding='utf-8-sig')
    print(f"‚úÖ Eliminados migrados: {len(dels_old)} ‚Üí {NEW_DELS}")
else:
    print("‚ÑπÔ∏è No se encontr√≥ deleted_ids.csv del tag anterior.")

‚úÖ Ediciones migradas: 684 ‚Üí /content/drive/MyDrive/IA/Proyectos/An√°lisis Bolet√≠n Oficial/boletin-ml/labels/annot/test_2025YTD_eval/ediciones.csv
‚ÑπÔ∏è No se encontr√≥ deleted_ids.csv del tag anterior.


In [None]:
# === Manifest test_2025YTD: reusar lo etiquetado y completar aleatorio hasta N_TARGET ===
import os, pandas as pd, numpy as np

BASE = "/content/drive/MyDrive/IA/Proyectos/An√°lisis Bolet√≠n Oficial/boletin-ml"
CSV_2025YTD   = f"{BASE}/data/labels/etiquetas_test_2025YTD.csv"
NEW_TAG       = "test_2025YTD_eval"
NEW_DIR       = f"{BASE}/labels/annot/{NEW_TAG}"
NEW_EDIC      = f"{NEW_DIR}/ediciones.csv"
MANIFEST_PATH = f"{NEW_DIR}/manifest_eval_ids.csv"

N_TARGET = 1200   # ‚Üê AJUSTAR: 1200‚Äì1500 recomendado
RNG      = 2025

df = pd.read_csv(CSV_2025YTD, dtype=str, keep_default_na=False, encoding='utf-8-sig')

def has_anchor(text: str) -> bool:
    if not isinstance(text, str): return False
    patt = r"(RESUELVE|DISPONE|DECRETA|ART[√çI]CULO\\s*1\\s*¬∞|sanciona\\s+con\\s+fuerza\\s+de\\s+ley)"
    return bool(pd.Series([text]).str.contains(patt, case=False, regex=True, na=False).iloc[0])

# Base ya etiquetada (si existe)
labeled_ids = set()
if os.path.exists(NEW_EDIC):
    ed = pd.read_csv(NEW_EDIC, dtype=str, keep_default_na=False, encoding='utf-8-sig')
    labeled_ids = set(ed['id'])

ids_all = list(df['id'])
ids_pool = [i for i in ids_all if i not in labeled_ids]

# Opcional: estratos simples por bucket y ancla (si faltan columnas, cae a aleatorio)
use_strata = ('bucket' in df.columns)
if 'has_ancla' not in df.columns:
    # construimos a demanda (barato)
    df['has_ancla'] = df['fragmento'].apply(has_anchor)

# Partimos del set ya etiquetado:
manifest_ids = list(labeled_ids & set(ids_all))

# ¬øCu√°ntos faltan para N_TARGET?
faltan = max(0, N_TARGET - len(manifest_ids))
if faltan > 0:
    if use_strata:
        # Muestreo estratificado proporcional: bucket √ó has_ancla
        rest = df[df['id'].isin(ids_pool)].copy()
        rest['estrato'] = rest['bucket'].fillna('NA') + " | ancla=" + rest['has_ancla'].astype(int).astype(str)
        # tama√±o de cada estrato
        sizes = rest['estrato'].value_counts().to_dict()
        # cupos proporcionales (con al menos 1 si hay espacio)
        total_rest = len(rest)
        cupos = {e: max(1, int(faltan * (n/total_rest))) for e,n in sizes.items()}
        # ajustar por redondeo
        diff = faltan - sum(cupos.values())
        for e in list(cupos.keys())[:abs(diff)]:
            cupos[e] += 1 if diff>0 else -1
            if cupos[e] < 0: cupos[e] = 0
        # sample por estrato
        sel = []
        for e, k in cupos.items():
            sub = rest[rest['estrato']==e]
            k = min(k, len(sub))
            if k>0:
                sel.append(sub.sample(n=k, random_state=RNG)[['id']])
        add = pd.concat(sel, ignore_index=True) if sel else pd.DataFrame(columns=['id'])
        add_ids = list(add['id'].unique())
    else:
        # Aleatorio simple
        add_ids = list(pd.Series(ids_pool).sample(n=min(faltan,len(ids_pool)), random_state=RNG))
    manifest_ids = list(manifest_ids) + add_ids

# Guardar manifest (√∫nicos)
man = pd.DataFrame(sorted(set(manifest_ids)), columns=['id'])
os.makedirs(NEW_DIR, exist_ok=True)
man.to_csv(MANIFEST_PATH, index=False, encoding='utf-8-sig')

print(f"‚úÖ Manifest listo: {MANIFEST_PATH} | ya etiquetados: {len(labeled_ids)} | en manifest: {len(man)}")

  return bool(pd.Series([text]).str.contains(patt, case=False, regex=True, na=False).iloc[0])


‚úÖ Manifest listo: /content/drive/MyDrive/IA/Proyectos/An√°lisis Bolet√≠n Oficial/boletin-ml/labels/annot/test_2025YTD_eval/manifest_eval_ids.csv | ya etiquetados: 684 | en manifest: 1200


### **Aqu√≠ empieza el editor Gradio**

In [1]:
# === 1) Entorno ===
%%capture
!pip -q install gradio==4.* tqdm==4.*

from google.colab import drive
drive.mount('/content/drive')

import os, re, json
from datetime import datetime
import pandas as pd
from tqdm.auto import tqdm
import gradio as gr

print('Entorno listo ‚úÖ')

In [2]:
# === 2) Configuraci√≥n (editor por CSV con manifest) ===

BASE = "/content/drive/MyDrive/IA/Proyectos/An√°lisis Bolet√≠n Oficial/boletin-ml"

# Eleg√≠ uno cada vez que edites:
# CSV_INPUT = f"{BASE}/data/labels/etiquetas_val.csv"
# DATASET_TAG = "val_eval"
# MANIFEST_PATH = f"{BASE}/labels/annot/val_eval/manifest_eval_ids.csv"

CSV_INPUT     = f"{BASE}/data/labels/etiquetas_test_2025YTD.csv"          # ‚Üê nuevo unificado
DATASET_TAG   = "test_2025YTD_eval"                                       # ‚Üê nuevo tag
MANIFEST_PATH = f"{BASE}/labels/annot/test_2025YTD_eval/manifest_eval_ids.csv"

# Carpeta de trabajo para este dataset (nuevo tag evita arrastrar borrados previos)
ANNOT_DIR = f"{BASE}/labels/annot/{DATASET_TAG}"
CSV_EDICIONES     = f"{ANNOT_DIR}/ediciones.csv"
CSV_EDICIONES_LOG = f"{ANNOT_DIR}/ediciones_log.csv"
JSON_STATE        = f"{ANNOT_DIR}/state.json"
CSV_DELETED       = f"{ANNOT_DIR}/deleted_ids.csv"

# Export del merge limpio
import os
CSV_BASENAME = os.path.splitext(os.path.basename(CSV_INPUT))[0]
CSV_EXPORT   = f"{BASE}/data/labels/{CSV_BASENAME}_CLEAN_{DATASET_TAG}.csv"

# Usuario anotador (opcional)
ANOTADOR = "juan"

# Ignorar borrados hist√≥ricos (empezamos fresco)
IGNORE_DELETED = True

import os
os.makedirs(ANNOT_DIR, exist_ok=True)
print('Config OK ‚úÖ')

Config OK ‚úÖ


In [3]:
# === 3) Utilidades ===

def read_csv_any(path):
    return pd.read_csv(path, dtype=str, keep_default_na=False, encoding='utf-8-sig', sep=None, engine='python')

def write_csv(df, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_csv(path, index=False, encoding='utf-8-sig')

def norm_text(s: str) -> str:
    import re
    s = (s or '').replace('\x0c', ' ')
    s = re.sub(r"\s+", " ", s)
    return s.strip()

def ensure_id(df: pd.DataFrame) -> pd.DataFrame:
    if 'id' not in df.columns:
        ids = []
        for i, r in df.reset_index().iterrows():
            base = r.get('origen_pdf', '') or ''
            ids.append(f"row_{i}_{os.path.splitext(base)[0]}")
        df = df.copy()
        df['id'] = ids
    return df

def load_state():
    if os.path.exists(JSON_STATE):
        try:
            return json.load(open(JSON_STATE, 'r', encoding='utf-8'))
        except Exception:
            pass
    return {"order": [], "pos": 0, "done_ids": [], "deleted_ids": []}

def save_state(st):
    json.dump(st, open(JSON_STATE, 'w', encoding='utf-8'))

def load_ediciones():
    cols = [
        'id','fragmento_original','fragmento_editado','label','is_ambiguo',
        'temas','anotador','notas','to_review','ts_edicion','rev',
        'origen_pdf','fecha','bucket','score'
    ]
    if os.path.exists(CSV_EDICIONES):
        ed = read_csv_any(CSV_EDICIONES)
        for c in cols:
            if c not in ed.columns:
                ed[c] = ''
        ed = ed[cols]
    else:
        ed = pd.DataFrame(columns=cols)
    if os.path.exists(CSV_EDICIONES_LOG):
        ed_log = read_csv_any(CSV_EDICIONES_LOG)
    else:
        ed_log = pd.DataFrame(columns=cols)
    return ed, ed_log

def upsert_edicion(ed: pd.DataFrame, row: dict) -> pd.DataFrame:
    ed = ed.copy()
    mask = (ed['id'] == row.get('id',''))
    if mask.any():
        idx = ed.index[mask][0]
        for k, v in row.items():
            ed.at[idx, k] = v
    else:
        ed = pd.concat([ed, pd.DataFrame([row])], ignore_index=True)
    return ed

def load_deleted():
    if os.path.exists(CSV_DELETED):
        d = read_csv_any(CSV_DELETED)
        return set(d.get('id', []))
    return set()

def append_deleted(rid: str, motivo: str = ""):
    ts = datetime.now().isoformat(timespec='seconds')
    row = pd.DataFrame([{"id": rid, "motivo": motivo, "ts": ts}])
    header = not os.path.exists(CSV_DELETED)
    row.to_csv(CSV_DELETED, index=False, encoding='utf-8-sig', mode='a', header=header)

print('Utils OK ‚úÖ')

Utils OK ‚úÖ


In [4]:
# === 4) Carga CSV y estado (con manifest) ===

df = read_csv_any(CSV_INPUT)
assert 'fragmento' in df.columns, "El CSV debe tener columna 'fragmento'"
df = ensure_id(df)
if 'fragmento_original' not in df.columns:
    df['fragmento_original'] = df['fragmento']

# manifest de evaluaci√≥n (si existe)

manifest_ids = None
import os, pandas as pd
if MANIFEST_PATH and os.path.exists(MANIFEST_PATH):
    _m = pd.read_csv(MANIFEST_PATH, dtype=str, keep_default_na=False)
    manifest_ids = set(_m['id'])
    print(f"Manifest cargado: {len(manifest_ids)} ids")

st = load_state()
ed, ed_log = load_ediciones()

# ignorar borrados previos si as√≠ se pidi√≥
deleted_ids = set() if IGNORE_DELETED else load_deleted()

ids_all = list(df['id'])
order = [i for i in ids_all if (i not in deleted_ids) and (manifest_ids is None or i in manifest_ids)]

if not st.get('order'):
    st['order'] = order
else:
    # refrescar con el nuevo order (manifest) preservando posici√≥n si se puede
    st['order'] = [i for i in st['order'] if i in order] + [i for i in order if i not in st['order']]

st['pos'] = min(st.get('pos', 0), max(0, len(st['order'])-1))
save_state(st)

print(f"Registros en CSV: {len(df)} | Evaluaci√≥n (manifest): {len(st['order'])}")

Manifest cargado: 1200 ids
Registros en CSV: 4044 | Evaluaci√≥n (manifest): 1200


In [5]:
# === 5) Funciones de la app ===

def get_current_record(df, st):
    if not st['order']:
        return {"id":"","fragmento":"","msg":"No hay registros (todos eliminados?)"}
    rid = st['order'][st['pos']]
    r = df[df['id']==rid]
    if len(r)==0:
        return {"id": rid, "fragmento":"", "msg":"ID no encontrado"}
    return r.iloc[0].to_dict()

def merge_original_edit(rec, cur_ed):
    frag_orig = rec.get('fragmento_original', rec.get('fragmento',''))
    frag_show = cur_ed.get('fragmento_editado','') if cur_ed else rec.get('fragmento','')
    frag_show = frag_show or frag_orig
    return frag_orig, frag_show

def load_view():
    global df, st, ed
    rec = get_current_record(df, st)
    cur_ed = None
    if len(ed):
        m = ed[ed['id']==rec.get('id','')]
        if len(m):
            cur_ed = m.iloc[0].to_dict()
    frag_orig, frag_show = merge_original_edit(rec, cur_ed)
    meta_md = (
        f"**ID:** {rec.get('id','')}\n\n"
        f"**Fecha:** {rec.get('fecha','')}  |  **PDF:** {rec.get('origen_pdf','')}\n\n"
        f"**Bucket:** {rec.get('bucket','')}  |  **Score:** {rec.get('score','')}  |  **N¬∫ Norma:** {rec.get('numero_norma','')}\n\n"
    )
    label_val = cur_ed.get('label','') if cur_ed else rec.get('label','')
    amb_val = bool(int(cur_ed.get('is_ambiguo','0'))) if cur_ed else bool(int(rec.get('is_ambiguo','0') or 0))
    temas_val = cur_ed.get('temas','') if cur_ed else rec.get('temas','')
    notas_val = cur_ed.get('notas','') if cur_ed else rec.get('notas','')
    torev_val = bool(int(cur_ed.get('to_review','0'))) if cur_ed else bool(int(rec.get('to_review','0') or 0))
    progreso = build_progress_text()
    return meta_md, frag_show, label_val, amb_val, temas_val, notas_val, torev_val, progreso, frag_orig

def upsert_and_advance(direction, frag_text, label_sel, is_amb, temas_txt, notas_txt, to_review):
    global df, st, ed, ed_log
    rec = get_current_record(df, st)
    now = datetime.now().isoformat(timespec='seconds')
    frag_text = norm_text(frag_text or '')
    is_amb_str = '1' if is_amb else '0'
    to_review_str = '1' if to_review else '0'

    cur_ed = None
    if len(ed):
        m = ed[ed['id']==rec.get('id','')]
        if len(m):
            cur_ed = m.iloc[0].to_dict()
    prev_rev = int(cur_ed.get('rev', 0)) if cur_ed else 0
    new_rev = prev_rev + 1 if (label_sel or frag_text != rec.get('fragmento','')) else prev_rev

    if new_rev != 0:
        row = {
            'id': rec.get('id',''),
            'fragmento_original': rec.get('fragmento_original', rec.get('fragmento','')),
            'fragmento_editado': frag_text or rec.get('fragmento',''),
            'label': str(label_sel or ''),
            'is_ambiguo': is_amb_str,
            'temas': temas_txt or '',
            'anotador': ANOTADOR,
            'notas': notas_txt or '',
            'to_review': to_review_str,
            'ts_edicion': now,
            'rev': str(new_rev),
            'origen_pdf': rec.get('origen_pdf',''),
            'fecha': rec.get('fecha',''),
            'bucket': rec.get('bucket',''),
            'score': rec.get('score',''),
        }
        ed = upsert_edicion(ed, row)
        write_csv(ed, CSV_EDICIONES)
        ed_log = pd.concat([ed_log, pd.DataFrame([row])], ignore_index=True)
        write_csv(ed_log, CSV_EDICIONES_LOG)
        done = set(st.get('done_ids', []))
        done.add(rec.get('id',''))
        st['done_ids'] = list(done)
        save_state(st)

    if direction != 0:
        new_pos = st['pos'] + direction
        new_pos = max(0, min(new_pos, len(st['order'])-1))
        st['pos'] = new_pos
        save_state(st)

    return load_view()

def save_and_next(frag_text, label_sel, is_amb, temas_txt, notas_txt, to_review):
    return upsert_and_advance(+1, frag_text, label_sel, is_amb, temas_txt, notas_txt, to_review)

def save_and_prev(frag_text, label_sel, is_amb, temas_txt, notas_txt, to_review):
    return upsert_and_advance(-1, frag_text, label_sel, is_amb, temas_txt, notas_txt, to_review)

def reset_fragment():
    global df, st
    rec = get_current_record(df, st)
    frag_orig = rec.get('fragmento_original', rec.get('fragmento',''))
    meta_md, _, label_val, amb_val, temas_val, notas_val, torev_val, progreso, _ = load_view()
    return meta_md, frag_orig, label_val, amb_val, temas_val, notas_val, torev_val, progreso, frag_orig

def skip_move(direction):
    global st
    if not st['order']:
        return "No hay registros", "", "", False, "", "", False, "", ""
    new_pos = st['pos'] + direction
    st['pos'] = max(0, min(new_pos, len(st['order'])-1))
    save_state(st)
    return load_view()

def goto_position(pos_text):
    global st
    try:
        p = int(str(pos_text).strip()) - 1
    except Exception:
        p = st['pos']
    st['pos'] = max(0, min(p, len(st['order'])-1))
    save_state(st)
    return load_view()

def delete_current():
    global df, st
    rec = get_current_record(df, st)
    rid = rec.get('id','')
    if not rid:
        return load_view()
    append_deleted(rid, motivo="manual")
    st['order'] = [i for i in st['order'] if i != rid]
    st['pos'] = min(st['pos'], max(0, len(st['order'])-1))
    save_state(st)
    return load_view()

def export_clean():
    base = read_csv_any(CSV_INPUT)
    base = ensure_id(base)
    dels = load_deleted()
    base = base[~base['id'].isin(dels)].copy()
    if os.path.exists(CSV_EDICIONES):
        e = read_csv_any(CSV_EDICIONES)
        e = e[['id','fragmento_editado','label','is_ambiguo','temas','notas','to_review']]
        base = base.merge(e, on='id', how='left')
        base['fragmento_final'] = base['fragmento_editado'].fillna('')
        base.loc[base['fragmento_final'].eq(''), 'fragmento_final'] = base['fragmento']
        for c in ['label','is_ambiguo','temas','notas','to_review']:
            if c not in base.columns: base[c] = ''
    else:
        base['fragmento_final'] = base['fragmento']
        for c in ['label','is_ambiguo','temas','notas','to_review']:
            if c not in base.columns: base[c] = ''
    cols_front = [c for c in ['id','fecha','origen_pdf','bucket','score','numero_norma'] if c in base.columns]
    cols_text = ['fragmento','fragmento_final']
    cols_labels = [c for c in ['label','is_ambiguo','temas','notas','to_review'] if c in base.columns]
    out_cols = cols_front + cols_text + cols_labels
    for c in base.columns:
        if c not in out_cols:
            out_cols.append(c)
    out = base[out_cols]
    write_csv(out, CSV_EXPORT)
    return f"Export listo: {CSV_EXPORT} (filas: {len(out)})"


def build_progress_text():
    """Muestra progreso + conteo de positivos/negativos y % positivos."""
    global st, ed
    total = len(st['order'])
    done_ids = set(st.get('done_ids', []))
    done_in_order = [i for i in st['order'] if i in done_ids]
    pos = neg = 0
    if len(ed):
        e = ed[ed['id'].isin(done_in_order)]
        pos = (e['label'] == '1').sum()
        neg = (e['label'] == '0').sum()
    unl = total - len(done_in_order)
    denom = pos + neg
    rate = (pos / denom) if denom > 0 else 0.0
    return f"Progreso: {len(done_in_order)}/{total} | +: {pos} | -: {neg} | sin etiqueta: {unl} | %+ (sobre etiquetados): {rate:.1%}"

print('Funciones OK ‚úÖ')

Funciones OK ‚úÖ


In [6]:
# === 6) App Gradio ===
with gr.Blocks(title=f"Editor CSV ‚Äî {DATASET_TAG.upper()}") as demo:
    gr.Markdown(f"## Editor CSV ‚Äî Dataset: **{DATASET_TAG}**\nOrigen: `{CSV_INPUT}`")
    with gr.Row():
        meta = gr.Markdown("Cargando...")
        progreso = gr.Markdown()
    with gr.Row():
        with gr.Column(scale=2):
            frag = gr.Textbox(label="Fragmento (editable)", lines=20, interactive=True)
            frag_orig_hidden = gr.Textbox(label="Original (oculto)", visible=False)
            with gr.Row():
                btn_delete = gr.Button("üóëÔ∏è Eliminar registro (excluir)")
        with gr.Column(scale=1):
            label = gr.Radio(choices=["1","0",""], label="Pertinente (1) / No (0)", value="")
            is_amb = gr.Checkbox(label="Ambiguo", value=False)
            temas = gr.Textbox(label="Temas (opcional)")
            notas = gr.Textbox(label="Notas")
            to_review = gr.Checkbox(label="Revisar despu√©s", value=False)
            pos_input = gr.Textbox(label="Ir a posici√≥n (1-N)")
            with gr.Row():
                btn_prev = gr.Button("‚üµ Guardar + Anterior")
                btn_next = gr.Button("Guardar + Siguiente ‚ü∂")
            with gr.Row():
                btn_reset = gr.Button("Reset a original")
                btn_skip  = gr.Button("Saltar (no guardar)")
            with gr.Row():
                btn_goto  = gr.Button("Ir a posici√≥n")
                btn_export= gr.Button("Exportar limpio (merge)")

    demo.load(load_view, outputs=[meta, frag, label, is_amb, temas, notas, to_review, progreso, frag_orig_hidden])
    btn_next.click(save_and_next, inputs=[frag, label, is_amb, temas, notas, to_review],
                   outputs=[meta, frag, label, is_amb, temas, notas, to_review, progreso, frag_orig_hidden])
    btn_prev.click(save_and_prev, inputs=[frag, label, is_amb, temas, notas, to_review],
                   outputs=[meta, frag, label, is_amb, temas, notas, to_review, progreso, frag_orig_hidden])
    btn_skip.click(lambda: skip_move(+1), outputs=[meta, frag, label, is_amb, temas, notas, to_review, progreso, frag_orig_hidden])
    btn_reset.click(reset_fragment, outputs=[meta, frag, label, is_amb, temas, notas, to_review, progreso, frag_orig_hidden])
    btn_goto.click(goto_position, inputs=pos_input, outputs=[meta, frag, label, is_amb, temas, notas, to_review, progreso, frag_orig_hidden])
    btn_export.click(export_clean, outputs=progreso)
    btn_delete.click(delete_current, outputs=[meta, frag, label, is_amb, temas, notas, to_review, progreso, frag_orig_hidden])

demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://7d55e7d23de511eabd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


