# ingest

> Building blocks for ingesting data into thedu

We will build a simple pipeline that ingests documents into the thedu database so they can be searched.

In [None]:
#| default_exp ingest

In [None]:
#| export
import chonkie
from fastcore.all import *
import pymupdf
from pymupdf import Page

In [None]:
#| export
def _read_page(page: Page # PyMuPDF Page object
              ) -> AttrDictDefault:
    """Capture one page's geometry, text and embedded resources as plain data.

    One text page is extracted up front and handed to every `get_text` call.
    Rectangles are stored as tuples, or as `None` when the rectangle is falsy.
    The collected dict is passed through `dict2obj` before being returned.
    """
    textpage = page.get_textpage()

    def as_tuple(rect):
        # Falsy rectangles (including a missing one) are recorded as None.
        if not rect: return None
        return tuple(rect)

    # One plain dict per annotation; `border` and `uri` fall back to None when absent.
    annotations = []
    for annot in page.annots():
        annotations.append({
            "xref": annot.xref,
            "type": annot.type[1],  # second element of the annotation's `type`
            "rect": as_tuple(annot.rect),
            "info": annot.info,
            "colors": annot.colors,
            "border": getattr(annot, "border", None),
            "uri": getattr(annot, "uri", None),
        })

    # Form widgets are only collected when the page exposes a callable `widgets`.
    widgets = []
    if callable(getattr(page, "widgets", None)):
        for widget in page.widgets():
            widgets.append({
                "xref": widget.xref,
                "field_name": widget.field_name,
                "field_type": widget.field_type_string,
                "rect": as_tuple(widget.rect),
                "value": widget.field_value,
            })

    page_rect = getattr(page, "rect", None)
    snapshot = {
        "number": page.number,
        "rect": as_tuple(page_rect),
        # `mediabox` falls back to the page rectangle when the attribute is missing.
        "mediabox": as_tuple(getattr(page, "mediabox", page_rect)),
        "rotation": page.rotation,
        "xref": getattr(page, "xref", None),
        # Text in three formats, all served from the shared text page.
        "text_plain": page.get_text("text", textpage=textpage),
        "text_rawdict": page.get_text("rawdict", textpage=textpage),
        "text_json": page.get_text("json", textpage=textpage),
        # Graphics and resources.
        "links": page.get_links(),
        "annotations": annotations,
        "widgets": widgets,
        "images": page.get_images(full=True),
        "drawings": page.get_drawings(),
    }
    return dict2obj(snapshot)

def read_pdf(pth: str # path to PDF file
            ) -> L:
    """Read a PDF file and return a list of page data.

    Each element is the result of `_read_page` for one page, in document order.
    The document is closed before this function returns.
    """
    # Open the document as a context manager so it is closed even if a page
    # fails to parse. The list is fully built inside the `with` block, so the
    # returned data never needs the open document.
    with pymupdf.open(pth) as doc:
        return L([_read_page(p) for p in doc])

In [None]:
#| hide
# Run nbdev's export.
import nbdev
nbdev.nbdev_export()