# data

> Building blocks for ingesting and querying data with thedu, plus some additional utilities

We will build a simple ingestion pipeline that ingests PDF documents into the thedu database for searching.

In [None]:
#| default_exp data

In [None]:
#| export
from chonkie import Pipeline
from fastcore.all import AttrDictDefault, L, dict2obj, concat, patch
from fastlite import Database
import os
import pymupdf

In [None]:
#| export
def _read_page(page: pymupdf.Page # PyMuPDF Page object
              ) -> AttrDictDefault:
    '''Gather a page's geometry, text, links, annotations, widgets, images and drawings into one attribute-style dict.'''
    def _as_tuple(rect):
        # Falsy rects (including None) are recorded as None, anything else as a plain tuple
        if not rect: return None
        return tuple(rect)

    # One text page is built up front and handed to all three `get_text` calls below
    textpage = page.get_textpage()

    annotations = []
    for annot in page.annots():
        annotations.append(dict(
            xref=annot.xref,
            type=annot.type[1],  # second entry of `annot.type`
            rect=_as_tuple(annot.rect),
            info=annot.info,
            colors=annot.colors,
            border=getattr(annot, 'border', None),
            uri=getattr(annot, 'uri', None),
        ))

    widgets = []
    # Only iterate form widgets when the page object exposes a callable `widgets`
    if callable(getattr(page, 'widgets', None)):
        for widget in page.widgets():
            widgets.append(dict(
                xref=widget.xref,
                field_name=widget.field_name,
                field_type=widget.field_type_string,
                rect=_as_tuple(widget.rect),
                value=widget.field_value,
            ))

    page_rect = getattr(page, 'rect', None)
    snapshot = dict(
        number=page.number,
        rect=_as_tuple(page_rect),
        mediabox=_as_tuple(getattr(page, 'mediabox', page_rect)),  # falls back to the page rect
        rotation=page.rotation,
        xref=getattr(page, 'xref', None),
        # text in three formats, all extracted from the shared text page
        text_plain=page.get_text('text', textpage=textpage),
        text_rawdict=page.get_text('rawdict', textpage=textpage),
        text_json=page.get_text('json', textpage=textpage),
        # graphics and resources
        links=page.get_links(),
        annotations=annotations,
        widgets=widgets,
        images=page.get_images(full=True),
        drawings=page.get_drawings(),
    )
    return dict2obj(snapshot)

def read_pdf(pth: str|os.PathLike # path to PDF file
            ) -> AttrDictDefault:
    '''Read a PDF file and return its title (`name`), page count, metadata, table of contents and per-page data.'''
    # Open through a context manager so the document is closed even when reading a page raises.
    # Everything is collected inside the `with` block, while the document is still open.
    with pymupdf.open(pth) as doc:
        return dict2obj(dict(name=doc.metadata['title'], num_pages=doc.page_count, metadata=doc.metadata,
                             toc=doc.get_toc(), pages=L([_read_page(p) for p in doc])))

#### Some utilities for PDF processing

In [None]:
#| export
def pymupdf2txt(doc):
    'Join the `text_plain` of every page in `doc.pages`, separating pages with a blank line.'
    page_texts = (page.text_plain for page in doc.pages)
    return '\n\n'.join(page_texts)

In [None]:
#| export
def pdf_pipe():
    'Build the default pipeline: recursive then semantic chunking, overlap refinement, then embeddings.'
    pipe = Pipeline()
    # Two chunking steps: recursive (gpt2 tokenizer), followed by semantic
    pipe = pipe.chunk_with('recursive', tokenizer='gpt2', chunk_size=2048)
    pipe = pipe.chunk_with('semantic', chunk_size=1024)
    # Two refinement steps: overlap context, then embeddings
    pipe = pipe.refine_with('overlap', context_size=128)
    pipe = pipe.refine_with('embeddings', embedding_model='minishlab/potion-retrieval-32M')
    return pipe

@patch
def pdf_ingest(
    self: Database,                # thedu database connection
    pdf_doc: str|dict|os.PathLike, # a pdf document or path. Use `read_pdf` to read from path
    chunk_embed_pipe:Pipeline=None,# chunking and embedding pipeline. If None, use default chonkie pipeline
    docs_tbl: str = 'docs',        # docs table name
    content_tbl: str = 'content',  # content table name
):
    'Ingest a PDF document into thedu: read it if a path is given, chunk and embed its text, then store the chunks.'
    if isinstance(pdf_doc, (str, os.PathLike)): pdf_doc = read_pdf(pdf_doc)
    if isinstance(pdf_doc, dict):
        # Raised explicitly rather than with `assert` so the check survives `python -O`;
        # the exception type stays AssertionError so existing callers are unaffected.
        if 'pages' not in pdf_doc: raise AssertionError('Invalid PDF document dictionary. Use `read_pdf` to read from path.')
        # A plain dict has the key but not the attribute access (`.pages`, `.name`, ...) used below
        if not hasattr(pdf_doc, 'pages'): pdf_doc = dict2obj(pdf_doc)
    if not chunk_embed_pipe: chunk_embed_pipe = pdf_pipe()
    # Run the pipeline over the plain text of all pages and store the resulting chunks
    self.store(chunk_embed_pipe.run(pymupdf2txt(pdf_doc)).chunks, name=pdf_doc.name, metadata=pdf_doc.metadata,
               doc_tbl=docs_tbl, content_tbl=content_tbl)

In [None]:
#| export
def clean(q:str  # query to be passed for fts search
          ):
    '''Strip every `*` from the query; a blank (empty or whitespace-only) query gives None.'''
    if not q.strip(): return None
    return q.replace('*', '')

def add_wc(q:str  # query to be passed for fts search
           ):
    '''Add wild card * to each word in the query.

    Words are separated by any run of whitespace; an empty or blank query gives an empty string.'''
    # `split()` with no argument drops the empty strings that `split(' ')` produces for
    # repeated, leading or trailing spaces, which would otherwise become bare `*` terms.
    return ' '.join(f'{w}*' for w in q.split())

def mk_wider(q:str  # query to be passed for fts search
             ):
    '''Widen the query by joining words with OR operator.

    Words are separated by any run of whitespace; an empty or blank query gives an empty string.'''
    # `split()` with no argument avoids the empty operands (`a OR  OR b`) that `split(' ')`
    # yields for repeated, leading or trailing spaces.
    return ' OR '.join(q.split())

def kw(q:str  # query to be passed for fts search
       ):
    '''Extract keywords from the query using YAKE library.

    Returns the distinct words of the extracted keywords joined by single spaces, in first-seen order.'''
    # Imported inside the function, so `yake` is only required when keyword extraction is used
    from yake import KeywordExtractor as KW
    # An extracted keyword may contain several words, so split each one. De-duplicate with
    # `dict.fromkeys` (keeps first-seen order) instead of `set`, whose string ordering can
    # differ between interpreter runs and made the resulting query non-reproducible.
    words = dict.fromkeys(w for k, _ in KW().extract_keywords(q) for w in k.split(' '))
    return ' '.join(words)

def pre(q:str,          # query to be passed for fts search
        wc=True,        # add wild card to each word
        wide=True,      # widen the query with OR operator
        extract_kw=True # extract keywords from the query
        ):
    '''Preprocess the query for fts search.

    Steps, in order: clean, optional keyword extraction, optional wild cards, optional OR-widening.
    Returns an empty string when nothing is left to search for.'''
    q = clean(q)
    # `clean` returns None for blank queries, so test for that before calling `.strip()`
    if not q or not q.strip(): return ''
    if extract_kw:
        q = kw(q)
        # Keyword extraction can leave nothing behind; stop here rather than decorating an empty query
        if not q.strip(): return ''
    if wc: q = add_wc(q)
    if wide: q = mk_wider(q)
    return q

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()