This module builds a simple ingestion pipeline that loads PDF documents into a litesearch database so they can be searched.

In [None]:
#| default_exp data

In [None]:
#| export
from chonkie import Pipeline, Chunk
from fastcore.all import L, concat, patch, ifnone
from fastlite import Database
import os
import pymupdf
from pymupdf import Document, Pixmap, csRGB, Page

In [None]:
#| export
@patch
def get_texts(self: Document, st=0, end=-1, **kw):
    '''Return an `L` with the text of each page in the range `st`..`end`.

    `st` and `end` are slice bounds over the document's pages (`end` is exclusive).
    `end=-1` (the default) or `None` means "through the last page"; passing `-1`
    straight to the slice would silently drop the final page and return nothing
    for a one-page document. Extra keyword arguments are forwarded to `Page.get_text`.
    '''
    stop = None if end in (None, -1) else end
    return L(self[st:stop]).map(lambda p: p.get_text(**kw))

@patch
def get_links(self: Document, st=0, end=-1):
    '''Return a flat `L` of the links found on the pages in the range `st`..`end`.

    `st` and `end` are slice bounds over the document's pages (`end` is exclusive).
    `end=-1` (the default) or `None` means "through the last page"; passing `-1`
    straight to the slice would silently drop the final page.
    '''
    stop = None if end in (None, -1) else end
    return L(self[st:stop]).map(lambda p: p.get_links()).concat()

@patch
def ext_im(self: Document, it=None):
    '''Extract a single image described by the image tuple `it`.

    `it[0]` is the image xref and `it[1]` the xref of its soft mask.
    Returns `None` when `it` is falsy, otherwise a dict: either one built here
    with `ext`, `colorspace` and `image` keys, or the result of
    `Document.extract_image` for the plain case.
    '''
    if not it:
        return None
    assert isinstance(it, tuple) and len(it) > 2, 'Invalid image tuple'
    xref, smask = it[0], it[1]

    if smask > 0:
        # Image with a mask: rebuild the picture from base image + mask.
        base = Pixmap(self.extract_image(xref)['image'])
        if base.alpha:
            base = Pixmap(base, 0)  # remove alpha channel
        mask_pix = Pixmap(self.extract_image(smask)['image'])
        try:
            merged = Pixmap(base, mask_pix)
        except (RuntimeError, ValueError, KeyError):
            # Could not combine with the mask: fall back to the unmasked base image.
            merged = Pixmap(self.extract_image(xref)['image'])
        fmt = 'pam' if base.n > 3 else 'png'
        return {'ext': fmt, 'colorspace': merged.colorspace.n, 'image': merged.tobytes(fmt)}

    obj_source = self.xref_object(xref, compressed=True)
    if '/ColorSpace' in obj_source:
        # Explicit colorspace definition: convert to an RGB PNG.
        raw = Pixmap(self, xref)
        rgb = Pixmap(csRGB, raw)
        return {'ext': 'png', 'colorspace': 3, 'image': rgb.tobytes('png')}

    return self.extract_image(xref)

@patch
def ext_imgs(self: Document, st=0, end=-1):
    '''Return a flat `L` of the images extracted from the pages in the range `st`..`end`.

    Each entry is whatever `Document.ext_im` returns for one item of
    `Page.get_images(full=True)`.

    `st` and `end` are slice bounds over the document's pages (`end` is exclusive).
    `end=-1` (the default) or `None` means "through the last page"; passing `-1`
    straight to the slice would silently drop the final page.
    '''
    stop = None if end in (None, -1) else end
    # `ext_im` is attached to `Document` by `@patch`; it must be called as a method,
    # because the decorator does not leave a callable module-level `ext_im` behind.
    per_page = lambda p: [self.ext_im(it) for it in p.get_images(full=True)]
    return L(self[st:stop]).map(per_page).concat()


#### Some utilities for pdf processing

In [None]:
#| export
def text_pipe():
    'Build and return the default text pipeline: two chunking steps followed by two refinement steps.'
    pipe = Pipeline()
    # Chunking stages.
    pipe = pipe.chunk_with('recursive', tokenizer='gpt2', chunk_size=2048)
    pipe = pipe.chunk_with('semantic', chunk_size=1024)
    # Refinement stages: overlap context, then embeddings.
    pipe = pipe.refine_with('overlap', context_size=128)
    pipe = pipe.refine_with('embeddings', embedding_model='minishlab/potion-retrieval-32M')
    return pipe

def chunk_fn(f=None):
    'Return the chunking function `f`; if `f` is None, return the default `text_pipe().run`.'
    # Build the default pipeline lazily. Passing `text_pipe().run` as a function argument
    # evaluated it eagerly, constructing a pipeline even when the caller supplied `f`.
    return text_pipe().run if f is None else f

def chunk(lns, f=None):
    'Run the chunking function (`f`, or the default when None) on `lns` and return its `.chunks`.'
    run = chunk_fn(f)
    result = run(lns)
    return result.chunks

@patch
def pg2chunks(
    self: Page,  # pdf page whose text is chunked
    fn=None,     # chunking fn. If None, use default chunk fn
):
    'Return the text chunks for a single page.'
    page_text = self.get_text()
    return chunk(page_text, fn)

@patch()
def content(self: Chunk, xtra: dict = None):
    '''Return this chunk as a dict with `content`, `embedding` and `metadata` keys.

    `metadata` holds the chunk's token count, start/end indices and context,
    plus any extra key/value pairs given in `xtra`.
    '''
    metadata = dict(
        tokens=self.token_count,
        start_index=self.start_index,
        end_index=self.end_index,
        context=self.context,
        **(xtra or dict()),
    )
    record = dict(content=self.text, embedding=self.embedding.tobytes(), metadata=metadata)
    return record

@patch
def to_content(self: Document, fn=None):
    '''Return a flat `L` of content dicts, one per text chunk, covering every page.

    Each dict comes from `Chunk.content` and carries the 1-based page number (`pg_no`)
    together with document-level metadata (`doc_name`, `doc_page_count`,
    `doc_metadata`, `doc_toc`). `fn` is the chunking function; the default chunk fn
    is used when it is None.
    '''
    # Resolve the chunking function once, so the default pipeline is built a single
    # time instead of once per page.
    # NOTE(review): assumes the default pipeline's `run` can be reused across pages.
    fn = chunk_fn(fn)
    m = dict(doc_name=self.metadata['title'], doc_page_count=self.page_count,
             doc_metadata=self.metadata, doc_toc=self.get_toc())
    # `renumerate` yields (page, index) pairs; page numbers are reported 1-based.
    return L(self.pages()).renumerate().map(
        lambda p: L(p[0].pg2chunks(fn)).map(
            lambda c: c.content(dict(pg_no=p[1]+1, **m)))).concat()
@patch
def pdf_ingest(
    self: Database,        # litesearch database connection
    path: os.PathLike,     # pdf path
    fn=None,               # chunking function. If None, use default chunk fn
    tbl: str = 'content',  # content table name
):
    'Ingest a PDF document into litesearch: chunk every page and insert the rows into `tbl`.'
    store = self.mk_store(tbl)
    # Open the document in a `with` block so it is closed even if chunking or
    # insertion raises; the rows are inserted before the document is closed.
    with pymupdf.open(path) as doc:
        store.insert_all(doc.to_content(fn))

In [None]:
#| export
def clean(q:str  # query to be passed for fts search
          ) -> str:
    '''Remove every `*` from the query; return an empty string for empty queries.

    A query counts as empty when it is None, blank, or blank once the `*`
    characters are removed (e.g. `'*'` or `' * '`).
    '''
    if not q:
        return ''
    cleaned = q.replace('*', '')
    # Check blankness after stripping `*`, so a query made only of wildcards is empty too.
    return cleaned if cleaned.strip() else ''

def add_wc(q:str  # query to be passed for fts search
           ) -> str:
    '''Add wild card `*` to each word in the query.

    Words are separated by any run of whitespace, so repeated, leading or trailing
    spaces do not produce stray `*` terms. Returns an empty string for empty queries.
    '''
    if not q:
        return ''
    # `split()` (no argument) drops empty tokens; `split(' ')` would turn 'a  b' into 'a* * b*'.
    return ' '.join(f'{w}*' for w in q.split())

def mk_wider(q:str  # query to be passed for fts search
             ) -> str:
    '''Widen the query by joining its words with the OR operator.

    Words are separated by any run of whitespace, so repeated, leading or trailing
    spaces do not produce empty operands. Returns an empty string for empty queries.
    '''
    if not q:
        return ''
    # `split()` (no argument) drops empty tokens; `split(' ')` would turn 'a  b' into 'a OR  OR b'.
    return ' OR '.join(q.split())

def kw(q:str  # query to be passed for fts search
       ) -> str:
    '''Extract keywords from the query using YAKE library.

    Returns the distinct words of the extracted keywords joined by single spaces,
    in the order they are first produced by the extractor.
    '''
    from yake import KeywordExtractor as KW
    words = (w for phrase, _score in KW().extract_keywords(q) for w in phrase.split())
    # `dict.fromkeys` removes duplicates while keeping first-seen order; a `set` would
    # yield the same words in an order that can vary between interpreter runs.
    return ' '.join(dict.fromkeys(words))

def pre(q:str,          # query to be passed for fts search
        wc=True,        # add wild card to each word
        wide=True,      # widen the query with OR operator
        extract_kw=True # extract keywords from the query
        ) -> str:
    '''Prepare a query for fts search: clean it, then apply the enabled steps in order
    (keyword extraction, wild cards, OR-widening). Empty queries yield an empty string.'''
    q = clean(q)
    if not q:
        return ''
    # Steps run in this fixed order; each is applied only when its flag is truthy.
    steps = ((extract_kw, kw), (wc, add_wc), (wide, mk_wider))
    for enabled, step in steps:
        if enabled:
            q = step(q)
    return q

In [None]:
#| hide
# Export the notebook's `#| export` cells via nbdev.
import nbdev
nbdev.nbdev_export()