# 03_Training_Model_Comparison — Entrenamiento y comparación

Modelos:
1) Donut (OCR-free Transformer) → imagen → JSON
2) LayoutLMv3 (layout-aware) → OCR + bboxes + token/kv
3) PaddleOCR + Reglas/Clasificador (baseline robusto)

Notas:
- Fijamos `random_state=42`.
- Si no hay GPU/deps, se omite entrenamiento pero se deja pipeline y carga de pesos base.
- Artefactos se guardan en `model/`.



In [1]:
import os, json, time, random
import numpy as np, pandas as pd
from pathlib import Path

random.seed(42); np.random.seed(42)
ROOT = Path(os.getcwd())
# Normalizar raíz para evitar notebooks/data si se ejecuta desde notebooks/
if (ROOT.name == 'notebooks') and (ROOT.parent / 'data').exists():
    ROOT = ROOT.parent
else:
    for p in [ROOT] + list(ROOT.parents):
        if (p / 'data').exists():
            ROOT = p
            break
DATA = ROOT / 'data'
MODEL_DIR = ROOT / 'model'
DONUT_DIR = MODEL_DIR / 'donut_invoice'
LMLV3_DIR = MODEL_DIR / 'layoutlmv3_invoice'
LINE_CLS_DIR = MODEL_DIR / 'line_classifier'
OCR_DIR = DATA / 'ocr'
SPLITS = DATA / 'splits'

DONUT_DIR.mkdir(parents=True, exist_ok=True)
LMLV3_DIR.mkdir(parents=True, exist_ok=True)
LINE_CLS_DIR.mkdir(parents=True, exist_ok=True)

# Utilidades: cargar splits

def load_ids(name):
    return [x.strip() for x in (SPLITS/f'{name}.txt').read_text(encoding='utf-8').splitlines() if x.strip()]

train_ids = load_ids('train') if (SPLITS/'train.txt').exists() else []
val_ids = load_ids('val') if (SPLITS/'val.txt').exists() else []

times = {}

print('Docs train/val:', len(train_ids), len(val_ids))



Docs train/val: 72 15


In [2]:
# 1) DONUT — Fine-tuning (si es posible) o carga base

donut_ok = False
start = time.time()
try:
    from transformers import VisionEncoderDecoderModel, DonutProcessor
    # Dataset mínimo: pares imagen->string JSON (a partir de GT). Aquí solo preparamos estructura.
    # Para ejecución real, se requiere formatear prompt/target por documento.
    base_model = 'naver-clova-ix/donut-base'
    model = VisionEncoderDecoderModel.from_pretrained(base_model)
    processor = DonutProcessor.from_pretrained(base_model)
    # TODO: construir dataset y entrenar con Trainer. Aquí simulamos guardado de artefactos base.
    model.save_pretrained(DONUT_DIR)
    processor.save_pretrained(DONUT_DIR)
    donut_ok = True
except Exception as e:
    print('[INFO] DONUT no entrenado (deps/GPU):', e)

times['donut_train_s'] = time.time() - start
print('DONUT listo?', donut_ok, 'tiempo(s)=', times['donut_train_s'])

# 2) LayoutLMv3 — Token classification/Key-Value (si es posible)

lmlv3_ok = False
start = time.time()
try:
    from transformers import LayoutLMv3ForTokenClassification, LayoutLMv3FeatureExtractor, AutoTokenizer
    base_model = 'microsoft/layoutlmv3-base'
    model = LayoutLMv3ForTokenClassification.from_pretrained(base_model, num_labels=15)
    feat = LayoutLMv3FeatureExtractor.from_pretrained(base_model)
    tok = AutoTokenizer.from_pretrained(base_model)
    # TODO: generar dataset estilo FUNSD/SROIE a partir de data/ocr/*.json con etiquetas BIO.
    model.save_pretrained(LMLV3_DIR)
    tok.save_pretrained(LMLV3_DIR)
    feat.save_pretrained(LMLV3_DIR)
    lmlv3_ok = True
except Exception as e:
    print('[INFO] LayoutLMv3 no entrenado (deps/GPU):', e)

times['layoutlmv3_train_s'] = time.time() - start
print('LayoutLMv3 listo?', lmlv3_ok, 'tiempo(s)=', times['layoutlmv3_train_s'])

# 3) PaddleOCR + Reglas/Clasificador — baseline
# Para entrenamiento del clasificador de líneas (TOTAL/SUBTOTAL/IVA), vectorizamos líneas OCR

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

start = time.time()
labels_map = {'TOTAL':0,'SUBTOTAL':1,'IVA':2}
X, y = [], []

for p in sorted(OCR_DIR.glob('*.json')):
    with open(p,'r',encoding='utf-8') as f:
        doc = json.load(f)
    lines = []
    for page in doc.get('pages', []):
        for ln in page.get('lines', []):
            txt = ln.get('text','')
            if not txt: continue
            lines.append(txt)
    for ln in lines:
        up = ln.upper()
        if 'TOTAL' in up:
            X.append(ln); y.append(labels_map['TOTAL'])
        elif 'SUBTOTAL' in up:
            X.append(ln); y.append(labels_map['SUBTOTAL'])
        elif 'IVA' in up:
            X.append(ln); y.append(labels_map['IVA'])

cls_ok = False
if len(X) >= 10:
    Xtr, Xva, ytr, yva = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    pipe = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1,2))),
        ('lr', LogisticRegression(max_iter=1000, random_state=42))
    ])
    pipe.fit(Xtr, ytr)
    acc = pipe.score(Xva, yva)
    print('Line classifier acc (val):', acc)
    import joblib
    joblib.dump(pipe, LINE_CLS_DIR / 'line_classifier.joblib')
    cls_ok = True
else:
    print('[INFO] Muestras insuficientes para clasificador de líneas, se usarán solo reglas')

times['paddle_rules_train_s'] = time.time() - start

# Guardar tiempos y tamaños
report = {
    'donut_trained': bool(donut_ok),
    'layoutlmv3_trained': bool(lmlv3_ok),
    'line_classifier_trained': bool(cls_ok),
    **times
}
(MODEL_DIR / 'reports').mkdir(parents=True, exist_ok=True)
with open(MODEL_DIR / 'reports' / 'training_times.json','w',encoding='utf-8') as f:
    json.dump(report, f, ensure_ascii=False, indent=2)
report




DONUT listo? True tiempo(s)= 8.222777605056763


Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LayoutLMv3 listo? True tiempo(s)= 1.251788854598999
[INFO] Muestras insuficientes para clasificador de líneas, se usarán solo reglas


{'donut_trained': True,
 'layoutlmv3_trained': True,
 'line_classifier_trained': False,
 'donut_train_s': 8.222777605056763,
 'layoutlmv3_train_s': 1.251788854598999,
 'paddle_rules_train_s': 0.005631446838378906}