# 02_Cleaning_and_Preparation — Splits, PDFs→Imágenes, Preprocesado y OCR

Objetivos:
- Generar splits `train/val/test` (70/15/15) por `doc_id` y guardar en `data/splits/`.
- Convertir PDFs a imágenes a 300 dpi en `data/raw/` como `<doc_id>_p01.png`, etc.
- Preprocesar imágenes (deskew, CLAHE y binarización Otsu, en ese orden) y guardar el resultado en `data/processed/` (paso opcional: el OCR se ejecuta sobre las imágenes de `data/raw/`).
- Ejecutar OCR base (PaddleOCR o, si no está disponible, Tesseract) y guardar `data/ocr/<doc_id>.json` con líneas, palabras y bboxes normalizados a 0–1000 (para LayoutLMv3).
- Validar consistencia básica de GT.



In [3]:
import os
import re
import io
import cv2
import json
import glob
import time
import random
import numpy as np
import pandas as pd
from pathlib import Path
from PIL import Image

# Fix both RNG seeds so that any sampling done in this notebook is repeatable.
np.random.seed(42)
random.seed(42)

# Locate the project root, i.e. the directory that holds `data/`.
# When launched from `notebooks/`, the parent's `data/` wins so that a stray
# `notebooks/data` directory is never picked up.
_cwd = Path(os.getcwd())
if _cwd.name == 'notebooks' and (_cwd.parent / 'data').exists():
    ROOT = _cwd.parent
else:
    # First of cwd and its ancestors containing `data/`; cwd itself if none does.
    ROOT = next(
        (cand for cand in (_cwd, *_cwd.parents) if (cand / 'data').exists()),
        _cwd,
    )

# Project layout, all relative to the data directory.
DATA = ROOT / 'data'
RAW, PROC, OCR_DIR, SPLITS, GT_DIR = (
    DATA / sub for sub in ('raw', 'processed', 'ocr', 'splits', 'gt')
)
MANIFEST = DATA / 'manifest.csv'

# Output directories written by this notebook are created up front.
for _out_dir in (SPLITS, PROC, OCR_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Utilidades

def load_manifest():
    """Read the CSV at ``MANIFEST`` and return it as a pandas DataFrame."""
    manifest_df = pd.read_csv(MANIFEST)
    return manifest_df

def save_splits(doc_ids, train_ratio=0.7, val_ratio=0.15):
    """Split document ids into train/val/test and write one id per line.

    The ids are de-duplicated and converted to ``str`` before splitting, so a
    document listed several times (e.g. one manifest row per page) can never
    end up in two different splits, and numeric ids can be written to text.
    For input that is already a list of unique strings the resulting split is
    the same as before this normalisation was added.

    Args:
        doc_ids: iterable of document identifiers.
        train_ratio: fraction of the unique ids assigned to ``train``.
        val_ratio: fraction of the unique ids assigned to ``val``; whatever
            is left after train and val goes to ``test``.

    Returns:
        Tuple ``(train, val, test)`` of lists of string ids. The same lists
        are written to ``train.txt``, ``val.txt`` and ``test.txt`` in
        ``SPLITS``.
    """
    # Sorting before shuffling with a fixed seed makes the split independent
    # of the order in which ids are passed in.
    unique_ids = sorted({str(doc_id) for doc_id in doc_ids})
    rng = np.random.RandomState(42)
    rng.shuffle(unique_ids)

    n = len(unique_ids)
    n_train = int(n * train_ratio)
    n_val = int(n * val_ratio)
    train = unique_ids[:n_train]
    val = unique_ids[n_train:n_train + n_val]
    test = unique_ids[n_train + n_val:]

    for name, split in (('train', train), ('val', val), ('test', test)):
        (SPLITS / f'{name}.txt').write_text('\n'.join(split), encoding='utf-8')
    return train, val, test

# PDF → Imágenes (300 dpi)

def pdf_to_images(pdf_path: Path, out_prefix: str, dpi=300):
    """Render every page of a PDF to ``RAW/<out_prefix>_pNN.png``.

    pdf2image is tried first; if importing or running it fails for any reason,
    PyMuPDF (``fitz``) is used instead. This is deliberately best-effort: when
    both backends fail a warning is printed and nothing is written.

    Args:
        pdf_path: path of the PDF to render.
        out_prefix: file-name prefix for the page images.
        dpi: rendering resolution passed to the backend.

    Returns:
        List of written image paths in page order (1-based, two-digit page
        suffix), or an empty list if the PDF could not be converted.
    """
    pages = []
    try:
        from pdf2image import convert_from_path
        pages = convert_from_path(str(pdf_path), dpi=dpi)
    except Exception as e1:
        try:
            import fitz  # PyMuPDF
            pages = []
            # The context manager closes the document even if a page fails.
            with fitz.open(str(pdf_path)) as doc:
                for page in doc:
                    pix = page.get_pixmap(dpi=dpi)
                    pages.append(
                        Image.frombytes('RGB', (pix.width, pix.height), pix.samples)
                    )
        except Exception as e2:
            # Report both failures so the root cause is not lost.
            print(f'[WARN] No se pudo convertir PDF {pdf_path}: {e2} (pdf2image: {e1})')
            return []

    # Make sure the destination exists before writing page images.
    RAW.mkdir(parents=True, exist_ok=True)
    out_paths = []
    for page_no, img in enumerate(pages, start=1):
        out_path = RAW / f'{out_prefix}_p{page_no:02d}.png'
        img.save(out_path)
        out_paths.append(out_path)
    return out_paths

# Preprocesado: deskew, Otsu, CLAHE

def deskew(image):
    """Rotate a BGR page image so that its content is axis-aligned.

    The skew is estimated from the minimum-area rectangle around the "ink"
    pixels. Ink is found with an inverse Otsu threshold, which assumes dark
    content on a lighter background (NOTE(review): confirm this holds for all
    inputs; light-on-dark pages would need the non-inverted threshold).

    Args:
        image: BGR image as a NumPy array (as returned by ``cv2.imread``).

    Returns:
        A new image of the same size, rotated about its centre by the
        estimated skew angle. Border pixels are replicated.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Select content pixels; a plain `gray > 0` test would also select the
    # page background and make the rectangle span the whole page.
    _, ink = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    ys, xs = np.nonzero(ink)

    angle = 0.0
    if xs.size > 0:
        # minAreaRect expects (x, y) points.
        points = np.column_stack((xs, ys)).astype(np.float32)
        raw_angle = cv2.minAreaRect(points)[-1]
        # The reported angle range differs between OpenCV releases. A
        # rectangle's orientation is only defined modulo 90 degrees, so fold
        # the value into (-45, 45] regardless of the convention in use.
        angle = raw_angle % 90.0
        if angle > 45.0:
            angle -= 90.0

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(
        image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
    )
    return rotated

# CLAHE operator created once and reused for every image.
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))

def preprocess_image(path: Path):
    """Deskew, contrast-equalise and binarise one image, saving it in ``PROC``.

    Steps, in order: ``deskew`` -> grayscale -> CLAHE -> Otsu threshold. The
    binary result is converted back to 3-channel BGR and written under the
    same file name as the input.

    Args:
        path: path of the image to process.

    Returns:
        Path of the written file, or ``None`` if the input could not be read
        or the output could not be written.
    """
    img = cv2.imread(str(path))
    if img is None:
        return None
    img = deskew(img)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = clahe.apply(gray)
    _, th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    out = cv2.cvtColor(th, cv2.COLOR_GRAY2BGR)
    out_path = PROC / Path(path).name
    # cv2.imwrite signals failure through its return value; do not report a
    # path for a file that was never written.
    if not cv2.imwrite(str(out_path), out):
        print(f'[WARN] No se pudo escribir {out_path}')
        return None
    return out_path

# OCR base → JSON compatible con LayoutLMv3 (palabras y bboxes normalizados 0-1000)

def _norm_bbox(x1, y1, x2, y2, w, h):
    """Scale pixel corners to integers on a 0-1000 grid, clamped to that range."""
    def scale(value, size):
        return max(0, min(1000, int(1000 * value / size)))
    return [scale(x1, w), scale(y1, h), scale(x2, w), scale(y2, h)]


def _ocr_page_paddle(ocr, pth, w, h):
    """Run PaddleOCR on one page; return ``(words, lines)``.

    Each detection's text goes into both lists: ``words`` with its bbox and
    confidence, ``lines`` as one entry per detection.
    """
    words, lines = [], []
    res = ocr.ocr(str(pth), cls=True)
    for block in res or []:
        # A page without detected text can be reported as None.
        if not block:
            continue
        for box, (text, conf) in block:
            xs = [pt[0] for pt in box]
            ys = [pt[1] for pt in box]
            bbox = _norm_bbox(min(xs), min(ys), max(xs), max(ys), w, h)
            words.append({'text': text, 'bbox': bbox, 'conf': float(conf)})
            lines.append({'text': text})
    return words, lines


def _ocr_page_tesseract(img, w, h):
    """Run Tesseract on one page image; return ``(words, lines)``."""
    import pytesseract
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, lang='spa')
    n = len(data['text'])
    confs = data.get('conf', ['0'] * n)
    block_nums = data.get('block_num', [0] * n)
    par_nums = data.get('par_num', [0] * n)
    line_nums = data.get('line_num', [1] * n)

    words = []
    lines_map = {}
    for i, txt in enumerate(data['text']):
        if not txt or str(txt).strip() == '' or txt == '-1':
            continue
        x, y, bw, bh = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
        conf = confs[i]
        # Non-numeric or negative confidences are stored as 0.0.
        conf_val = float(conf) if str(conf).replace('.', '', 1).isdigit() else 0.0
        words.append({'text': txt, 'bbox': _norm_bbox(x, y, x + bw, y + bh, w, h), 'conf': conf_val})
        # Key by block and paragraph as well as line number, so that lines
        # sharing a line number in different blocks are kept apart.
        line_key = (block_nums[i], par_nums[i], line_nums[i])
        lines_map.setdefault(line_key, []).append(txt)
    lines = [{'text': ' '.join(tokens)} for _, tokens in sorted(lines_map.items())]
    return words, lines


def ocr_document_images(doc_id: str, page_paths: list):
    """OCR every page of a document and write ``OCR_DIR/<doc_id>.json``.

    PaddleOCR is used when it can be imported and initialised; otherwise each
    page is processed with Tesseract (``pytesseract``). Bounding boxes are
    stored as ``[x1, y1, x2, y2]`` integers normalised to 0-1000 relative to
    the page size.

    Args:
        doc_id: identifier used for the output file name and stored in the JSON.
        page_paths: image paths of the document's pages, in page order.

    Returns:
        Path of the JSON file. It contains ``{'doc_id', 'pages'}``, where each
        page has ``page_path``, ``width``, ``height``, ``words`` and ``lines``.
        Unreadable pages and pages where Tesseract fails are skipped (the
        latter with a printed warning).
    """
    ocr = None
    try:
        from paddleocr import PaddleOCR
        ocr = PaddleOCR(lang='es', use_angle_cls=True, show_log=False)
    except Exception:
        # Deliberate best-effort: any import/initialisation problem selects
        # the Tesseract fallback below.
        ocr = None

    results = []
    for pth in page_paths:
        img = cv2.imread(str(pth))
        if img is None:
            continue
        h, w = img.shape[:2]
        if ocr is not None:
            words, lines = _ocr_page_paddle(ocr, pth, w, h)
        else:
            try:
                words, lines = _ocr_page_tesseract(img, w, h)
            except Exception as e2:
                print('[WARN] OCR falló en', pth, e2)
                continue
        results.append({'page_path': str(pth), 'width': w, 'height': h, 'words': words, 'lines': lines})

    out_path = OCR_DIR / f'{doc_id}.json'
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump({'doc_id': doc_id, 'pages': results}, f, ensure_ascii=False)
    return out_path

# Validaciones básicas de GT

def basic_gt_validation(gt_obj: dict):
    """Run a basic consistency check on one ground-truth object.

    The only rule checked: if ``campos['iva_porcentaje']`` is missing, empty
    or zero, then a numeric ``campos['iva_valor']`` must not be non-zero.

    Args:
        gt_obj: parsed ground-truth JSON. Anything that is not a dict, or
            whose ``campos`` entry is missing, ``None`` or not a dict, is
            treated as having nothing to validate.

    Returns:
        ``{'warning': <message>}`` when the rule is violated, otherwise ``{}``.
    """
    if not isinstance(gt_obj, dict):
        return {}
    campos = gt_obj.get('campos')
    # A JSON `"campos": null` arrives here as None; `.get` on it would fail.
    if not isinstance(campos, dict):
        return {}

    iva_pct = campos.get('iva_porcentaje')
    iva_val = campos.get('iva_valor')
    pct_missing_or_zero = iva_pct in (None, '', 0, 0.0)
    has_amount = isinstance(iva_val, (int, float)) and abs(float(iva_val)) > 0
    if pct_missing_or_zero and has_amount:
        return {'warning': 'iva_porcentaje=0 pero iva_valor>0'}
    return {}



In [4]:
# Main pipeline: splits, PDF rendering, preprocessing, OCR and GT checks

manifest = load_manifest()

# Splits are written only once; an existing train.txt means they are kept.
if (SPLITS / 'train.txt').exists():
    print('Splits ya existen')
else:
    train, val, test = save_splits(manifest['doc_id'].tolist(), 0.7, 0.15)
    print('Splits generados:', len(train), len(val), len(test))

# Render each PDF found in RAW to page images (prefix = PDF file stem).
pdfs = list(RAW.glob('*.pdf'))
for pdf_file in pdfs:
    rendered = pdf_to_images(pdf_file, pdf_file.stem, dpi=300)
    if rendered:
        print(f'PDF {pdf_file.name} -> {len(rendered)} páginas')

# Preprocess every raster image in RAW (optional step).
IMAGE_PATTERNS = ('*.png', '*.jpg', '*.jpeg')
img_paths = [found for pattern in IMAGE_PATTERNS for found in RAW.glob(pattern)]
processed = sum(1 for src in img_paths if preprocess_image(src) is not None)
print('Imágenes preprocesadas:', processed)

# Group page images by document: a trailing `_p<digits>` in the stem is the
# page suffix, everything before it is the document id.
doc_to_pages = {}
raw_images = sorted(found for pattern in IMAGE_PATTERNS for found in RAW.glob(pattern))
for page_file in raw_images:
    doc_key = re.sub(r'_p\d+$', '', page_file.stem)
    doc_to_pages.setdefault(doc_key, []).append(page_file)

# OCR each document that does not have an OCR JSON yet.
count_ocr = 0
for doc_id, pages in doc_to_pages.items():
    if (OCR_DIR / f'{doc_id}.json').exists():
        continue
    ocr_document_images(doc_id, pages)
    count_ocr += 1
print('Documentos con OCR generado (nuevos):', count_ocr)

# Collect ground-truth files for which the basic validation reports something.
bad = []
for gt_file in GT_DIR.glob('*.json'):
    gt = json.loads(gt_file.read_text(encoding='utf-8'))
    issue = basic_gt_validation(gt)
    if issue:
        bad.append({'gt': str(gt_file), **issue})

print('Advertencias GT:', len(bad))
'OK' if not bad else pd.DataFrame(bad).head(10)



Splits ya existen
PDF factura_0101.pdf -> 1 páginas
PDF factura_0102.pdf -> 1 páginas
PDF factura_0103.pdf -> 2 páginas
Imágenes preprocesadas: 104
Documentos con OCR generado (nuevos): 0
Advertencias GT: 0


'OK'