# 04_Evaluation_Comparison — Evaluación y comparación

- Inferencia en test para los 3 enfoques.
- Métricas: Exact Match por campo (macro), % validación contable, F1 (si aplica), latencia/doc, tamaño modelo, facilidad despliegue (0–5).
- Selección del ganador con score:
  score = 0.6*ExactMatchPromedio + 0.2*ValidaciónContable - 0.1*LatenciaZ - 0.1*TamañoZ
- Guardar CSV y PDF en `model/reports/` y errores en JSON.



In [3]:
import os, re, json, time, random
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from pathlib import Path

random.seed(42); np.random.seed(42)
ROOT = Path(os.getcwd())
# Normalizar raíz para evitar notebooks/data si se ejecuta desde notebooks/
if (ROOT.name == 'notebooks') and (ROOT.parent / 'data').exists():
    ROOT = ROOT.parent
else:
    for p in [ROOT] + list(ROOT.parents):
        if (p / 'data').exists():
            ROOT = p
            break
DATA = ROOT / 'data'
MODEL = ROOT / 'model'
REPORTS = MODEL / 'reports'
OCR_DIR = DATA / 'ocr'
SPLITS = DATA / 'splits'
GT_DIR = DATA / 'gt'

REPORTS.mkdir(parents=True, exist_ok=True)

def load_ids(name):
    return [x.strip() for x in (SPLITS/f'{name}.txt').read_text(encoding='utf-8').splitlines() if x.strip()]

def read_gt(doc_id):
    p = GT_DIR / f'{doc_id}.json'
    with open(p,'r',encoding='utf-8') as f:
        return json.load(f)

def normalize_money(x):
    if x is None: return None
    if isinstance(x,(int,float)): return float(x)
    s = str(x).strip().replace(' ', '')
    s = s.replace('.', '').replace(',', '.') if s.count(',')==1 and s.count('.')>1 else s
    s = re.sub(r'[^0-9\.-]', '', s)
    try: return float(s)
    except: return None

def exact_match(a,b):
    return 1.0 if (a==b) else 0.0

# Heurística baseline basada en OCR+reglas (y clasificador si existe)

def heuristic_extract(doc_id):
    ocr_path = OCR_DIR / f'{doc_id}.json'
    if not ocr_path.exists():
        return {}
    with open(ocr_path,'r',encoding='utf-8') as f:
        doc = json.load(f)
    lines = []
    for page in doc.get('pages', []):
        for ln in page.get('lines', []):
            t = ln.get('text','')
            if t: lines.append(t)
    text = '\n'.join(lines)

    # Reglas simples
    fecha = None
    m = re.search(r'(\d{4}[-/.]\d{2}[-/.]\d{2})', text)
    if m: fecha = m.group(1).replace('/', '-').replace('.', '-')

    nit = None
    m = re.search(r'(\d{6,10}[-–]\d)', text)
    if m: nit = m.group(1).replace('–','-')

    def find_value_after(keyword):
        for ln in lines:
            if keyword in ln.upper():
                m = re.search(r'([0-9\.,]+)', ln)
                if m:
                    return normalize_money(m.group(1))
        return None

    total = find_value_after('TOTAL')
    subtotal = find_value_after('SUBTOTAL')

    iva_porcentaje = None
    m = re.search(r'IVA\s*(\d{1,2})\s*%|IVA\s*%\s*(\d{1,2})', text.upper())
    if m:
        iva_porcentaje = int([g for g in m.groups() if g][0])
    iva_valor = find_value_after('IVA')

    razon = None
    # Heurística: línea con RAZON o CLIENTE/PROVEEDOR cercana a NIT
    for ln in lines:
        up = ln.upper()
        if 'RAZON' in up or 'PROVEEDOR' in up or 'CLIENTE' in up:
            if len(ln.split())>=2:
                razon = ln
                break

    return {
        'fecha': fecha,
        'nit': nit,
        'razon_social': razon,
        'subtotal': subtotal,
        'iva_porcentaje': iva_porcentaje,
        'iva_valor': iva_valor,
        'total': total
    }

# Evaluación en test

FIELDS = ['fecha','nit','razon_social','subtotal','iva_porcentaje','iva_valor','total']

def validate_accounting(fields):
    subtotal = normalize_money(fields.get('subtotal'))
    iva_val = normalize_money(fields.get('iva_valor'))
    total = normalize_money(fields.get('total'))
    if None in [subtotal, iva_val, total]:
        return False
    return abs(subtotal + iva_val - total) < 1.01


def evaluate_method(name, predictor):
    test_ids = load_ids('test') if (SPLITS/'test.txt').exists() else []
    rows = []
    latencies = []
    for doc_id in test_ids:
        gt = read_gt(doc_id)
        t0 = time.time()
        pred = predictor(doc_id)
        latencies.append(time.time()-t0)
        row = {'doc_id': doc_id}
        for f in FIELDS:
            gtv = gt.get('campos',{}).get(f)
            pv = pred.get(f)
            if f in ['subtotal','iva_valor','total']:
                gtvn = normalize_money(gtv)
                pvnn = normalize_money(pv)
                row[f'EM_{f}'] = 1.0 if (gtvn is not None and pvnn is not None and abs(gtvn - pvnn) < 1e-2) else 0.0
            else:
                row[f'EM_{f}'] = exact_match(gtv, pv)
        row['accounting_ok'] = validate_accounting(pred)
        rows.append(row)
    df = pd.DataFrame(rows)
    em_cols = [c for c in df.columns if c.startswith('EM_')]
    em_macro = df[em_cols].mean().mean() if not df.empty else 0.0
    acc_pct = df['accounting_ok'].mean() if not df.empty else 0.0
    lat_mean = float(np.mean(latencies)) if latencies else 0.0
    return df, {'name': name, 'em_macro': em_macro, 'accounting_pct': acc_pct, 'latency_s': lat_mean}

# Definir predictores de cada método (aquí baseline y placeholders para Donut/LayoutLMv3)

def predict_paddle_rules(doc_id):
    return heuristic_extract(doc_id)

def predict_donut(doc_id):
    # TODO: cargar DONUT y parsear JSON; placeholder usa baseline
    return heuristic_extract(doc_id)

def predict_layoutlmv3(doc_id):
    # TODO: cargar LayoutLMv3 y etiquetar tokens; placeholder usa baseline
    return heuristic_extract(doc_id)

res_tables = {}
summary_rows = []
for name, pred in [('Paddle+Reglas','paddle'), ('Donut','donut'), ('LayoutLMv3','layout')]:
    func = predict_paddle_rules if pred=='paddle' else predict_donut if pred=='donut' else predict_layoutlmv3
    df, info = evaluate_method(name, func)
    res_tables[name] = df
    summary_rows.append(info)

summary = pd.DataFrame(summary_rows)
# tamaños y facilidad despliegue (estimados)
size_map = {'Paddle+Reglas': 50, 'Donut': 400, 'LayoutLMv3': 450}  # MB aprox
deploy_map = {'Paddle+Reglas': 5, 'Donut': 2, 'LayoutLMv3': 2}
summary['size_mb'] = summary['name'].map(size_map)
summary['deploy_ease'] = summary['name'].map(deploy_map)

# Calcular score final
from scipy.stats import zscore
summary['lat_z'] = zscore(summary['latency_s']) if len(summary)>1 else 0
summary['size_z'] = zscore(summary['size_mb']) if len(summary)>1 else 0
summary['score'] = 0.6*summary['em_macro'] + 0.2*summary['accounting_pct'] - 0.1*summary['lat_z'] - 0.1*summary['size_z']

# Ganador
best = summary.sort_values('score', ascending=False).iloc[0]
# Empate → mayor deploy_ease
tied = summary[summary['score']==best['score']]
if len(tied)>1:
    best = tied.sort_values('deploy_ease', ascending=False).iloc[0]

summary.to_csv(REPORTS/'model_comparison.csv', index=False)

# PDF
with PdfPages(REPORTS/'model_comparison.pdf') as pdf:
    fig, ax = plt.subplots(figsize=(6,4))
    ax.bar(summary['name'], summary['em_macro'], color='steelblue')
    ax.set_title('Exact Match Macro por Método')
    ax.set_ylim(0,1)
    plt.tight_layout(); pdf.savefig(fig); plt.close(fig)

    fig, ax = plt.subplots(figsize=(6,4))
    ax.bar(summary['name'], summary['accounting_pct'], color='seagreen')
    ax.set_title('% Validación Contable')
    ax.set_ylim(0,1)
    plt.tight_layout(); pdf.savefig(fig); plt.close(fig)

    fig, ax = plt.subplots(figsize=(6,4))
    ax.bar(summary['name'], summary['score'], color='indianred')
    ax.set_title('Score final')
    plt.tight_layout(); pdf.savefig(fig); plt.close(fig)

best_info = {'best_model': best['name'], 'metrics': best.to_dict()}
with open(REPORTS/'best_model.json','w',encoding='utf-8') as f:
    json.dump(best_info, f, ensure_ascii=False, indent=2)

summary, best_info


(            name  em_macro  accounting_pct  latency_s  size_mb  deploy_ease  \
 0  Paddle+Reglas  0.142857             0.0   0.000260       50            5   
 1          Donut  0.142857             0.0   0.000063      400            2   
 2     LayoutLMv3  0.142857             0.0   0.000000      450            2   
 
       lat_z    size_z     score  
 0  1.376268 -1.404879  0.088575  
 1 -0.406329  0.561951  0.070152  
 2 -0.969940  0.842927  0.098416  ,
 {'best_model': 'LayoutLMv3',
  'metrics': {'name': 'LayoutLMv3',
   'em_macro': 0.14285714285714285,
   'accounting_pct': 0.0,
   'latency_s': 0.0,
   'size_mb': 450,
   'deploy_ease': 2,
   'lat_z': -0.9699396886474567,
   'size_z': 0.8429272304235246,
   'score': 0.09841553153667892}})