In [None]:
# Imports del paquete
from parser.parsers import Parser, _guess_mime
from parser.metrics import percent_docs_ok, layout_loss, table_consistency

# Extras para visualizar
from pathlib import Path
import pandas as pd
from textwrap import shorten

# Test document path (point it at your own file); a .docx / .png / .jpg also works.
SAMPLE = "docs/Resume Martin Jurado_CDAO_24.pdf"
# Cell output: whether the sample path exists on disk.
sample_path = Path(SAMPLE)
sample_path.exists()


In [None]:
# Parse the sample document with a default-configured parser.
parser = Parser()
doc = parser.parse(SAMPLE)

# Print a short summary: document id, detected MIME type and page count.
for label, value in (
    ("doc_id:", doc.doc_id),
    ("mime  :", doc.mime),
    ("#pages:", len(doc.pages)),
):
    print(label, value)


In [None]:
def doc_to_blocks_df(doc, preview_width=90):
    """Flatten the blocks of a parsed document into one DataFrame row per block.

    Parameters
    ----------
    doc
        Object exposing ``pages``. Each page must expose ``page_number`` and
        ``blocks``; each block must support ``.get("type")`` and
        ``.get("text", "")``.
    preview_width : int, default 90
        Maximum width handed to ``textwrap.shorten`` when building
        ``text_preview``.

    Returns
    -------
    pandas.DataFrame
        Columns ``page``, ``block_idx``, ``type``, ``text_preview`` and
        ``n_chars``. ``text_preview`` and ``n_chars`` are missing for blocks
        whose text is not a ``str``.
    """
    # Passing the columns explicitly keeps the schema stable when there are no
    # blocks at all; otherwise the empty frame has no columns and a later
    # groupby(["page", "type"]) raises KeyError.
    columns = ["page", "block_idx", "type", "text_preview", "n_chars"]

    rows = []
    for page in doc.pages:
        for block_idx, block in enumerate(page.blocks):
            text = block.get("text", "")
            has_text = isinstance(text, str)
            rows.append({
                "page": page.page_number,
                "block_idx": block_idx,
                "type": block.get("type"),
                # Short preview of the text so the table stays readable.
                "text_preview": shorten(text, width=preview_width, placeholder="...") if has_text else None,
                # Length of the original (un-shortened) text.
                "n_chars": len(text) if has_text else None,
            })
    return pd.DataFrame(rows, columns=columns)

# One row per block of the parsed document.
df_blocks = doc_to_blocks_df(doc)

if len(df_blocks) > 0:
    # First blocks, followed by the number of blocks of each type per page.
    display(df_blocks.head(20))
    blocks_per_page_type = df_blocks.groupby(["page", "type"]).size().unstack(fill_value=0)
    display(blocks_per_page_type.head(20))
else:
    print("El parser no generó bloques (posible PDF escaneado sin OCR).")



In [None]:
# Number of blocks of each type per page.
# Guarded because a frame with no blocks may lack the "page"/"type" columns,
# in which case the groupby would raise KeyError instead of showing an empty result.
if len(df_blocks) == 0:
    block_type_counts = pd.DataFrame()
else:
    block_type_counts = (
        df_blocks.groupby(["page", "type"]).size().unstack(fill_value=0).head(20)
    )
block_type_counts

In [None]:
# Layout metric for the parsed document, rounded to three decimals.
layout_loss_value = round(layout_loss(doc), 3)
print("layout_loss:", layout_loss_value)

# Table metric without a golden table count (golden_tables_per_doc=None).
table_consistency_value = table_consistency(doc, golden_tables_per_doc=None)
print("table_consistency:", table_consistency_value)


In [None]:
# Same table metric, this time with golden_tables_per_doc=0; shown as the cell output.
table_consistency_zero_golden = table_consistency(doc, golden_tables_per_doc=0)
table_consistency_zero_golden


In [None]:
# Parse several files and measure %docs_ok.
FILES = [
    "docs/Resume Martin Jurado_CDAO_24.pdf",
    # "samples/otro.docx",
    # "samples/imagen.png",
]

docs = []
for path in FILES:
    try:
        parsed = parser.parse(path)
    except Exception as exc:
        # Best effort: report the failure and keep a None placeholder so the
        # failed file is still passed to percent_docs_ok.
        print("[ERR]", path, exc)
        parsed = None
    docs.append(parsed)

docs_ok_pct = round(percent_docs_ok(docs), 2)
print("%docs_ok:", docs_ok_pct)


In [None]:
# MIME type the parser package guesses for the sample path.
guessed_mime = _guess_mime(SAMPLE)
print(guessed_mime)

In [None]:
# Parser with explicit OCR and text clean-up options.
parser = Parser(
    ocr_lang="spa+eng",        # OCR in Spanish + English, if you need it
    ocr_resolution=220,        # DPI used to rasterize pages in the fallback
    normalize_whitespace=True, # clean up whitespace
    dehyphenate=True,          # rejoin words split by a hyphen
    enable_pdf_ocr_fallback=True,
    # tesseract_cmd=r"C:\Program Files\Tesseract-OCR\tesseract.exe",  # (Windows)
)

# Parse the shared SAMPLE path instead of repeating the literal, so that
# pointing SAMPLE at another file retargets this cell as well.
doc = parser.parse(SAMPLE)
len(doc.pages), doc.meta

In [None]:
# Re-parse the sample with the current parser. Uses SAMPLE rather than a
# repeated path literal so the notebook has a single place to change the file.
doc = parser.parse(SAMPLE)
len(doc.pages), doc.meta


In [None]:
import pandas as pd
from textwrap import shorten

def doc_to_blocks_df(doc, preview_width=100):
    """Flatten the blocks of a parsed document into one DataFrame row per block.

    Parameters
    ----------
    doc
        Object exposing ``pages``. Each page must expose ``page_number`` and
        ``blocks``; each block must support ``.get("type")`` and
        ``.get("text", "")``.
    preview_width : int, default 100
        Maximum width handed to ``textwrap.shorten`` when building
        ``text_preview``.

    Returns
    -------
    pandas.DataFrame
        Columns ``page``, ``block_idx``, ``type``, ``text_preview`` and
        ``n_chars``. ``text_preview`` and ``n_chars`` are missing for blocks
        whose text is not a ``str``.
    """
    # Passing the columns explicitly keeps the schema stable when there are no
    # blocks at all; otherwise the empty frame has no columns and a later
    # groupby(["page", "type"]) raises KeyError.
    columns = ["page", "block_idx", "type", "text_preview", "n_chars"]

    rows = []
    for page in doc.pages:
        for block_idx, block in enumerate(page.blocks):
            text = block.get("text", "")
            has_text = isinstance(text, str)
            rows.append({
                "page": page.page_number,
                "block_idx": block_idx,
                "type": block.get("type"),
                # Short preview of the text so the table stays readable.
                "text_preview": shorten(text, width=preview_width, placeholder="...") if has_text else None,
                # Length of the original (un-shortened) text.
                "n_chars": len(text) if has_text else None,
            })
    return pd.DataFrame(rows, columns=columns)

# One row per block of the re-parsed document.
df_blocks = doc_to_blocks_df(doc)

if len(df_blocks) > 0:
    # First blocks, followed by the number of blocks of each type per page.
    display(df_blocks.head(20))
    blocks_per_page_type = df_blocks.groupby(["page", "type"]).size().unstack(fill_value=0)
    display(blocks_per_page_type.head(20))
else:
    print("El parser no generó bloques (posible PDF escaneado sin OCR).")


In [None]:
from parser.metrics import percent_docs_ok, layout_loss, table_consistency
# Layout metric for the re-parsed document, rounded to three decimals.
layout_loss_value = round(layout_loss(doc), 3)
print("layout_loss:", layout_loss_value)

# Table metric without a golden table count (golden_tables_per_doc=None).
table_consistency_value = table_consistency(doc, golden_tables_per_doc=None)
print("table_consistency:", table_consistency_value)
