# core

> Building blocks for thedu

### Introduction
We often have to jump through a whole bunch of hoops to get documents processed and ready to be searched.
`thedu` plans to make this as easy as possible by providing simple building blocks to set up a database with FTS5 and vector search capabilities.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from fastlite import *
from apswutils.utils import cursor_row2dict

In [None]:
#| export
@patch
def query(self: Database, sql: str, params: Optional[Union[Iterable, dict]] = None) -> Generator[dict, None, None]:
    '''Run `sql` with optional `params` and lazily yield each result row, converted by `cursor_row2dict`'''
    # A dict is passed through for named bindings; anything else (or nothing) becomes a tuple of positional bindings
    if isinstance(params, dict): bindings = params
    else: bindings = tuple(params or tuple())
    cur = self.execute(sql, bindings)
    cur.row_trace = cursor_row2dict
    yield from cur

> Simple Docs table setup

In [None]:
# | export
@patch
def mk_docs(self:Database, # database connection
            name:str='docs',  # table name
            **kw,  # additional args to pass to fastlite create
        ):  # returns the docs table
    "Make a docs table (if it does not exist yet) and make sure full-text search is enabled on it"
    # `if_not_exists=True` makes repeated calls safe: an existing table is reused rather than recreated
    _docs = self.t[name].create(id=int, name=str, path= str, uploaded_at=float, metadata=str, if_not_exists=True,
                                defaults=dict(uploaded_at='CURRENT_TIMESTAMP'), not_null=['name'], pk='id', **kw)
    # Only set up FTS (on `name` and `metadata`, with triggers to keep it in sync) when it is not already there
    if not _docs.detect_fts(): _docs.enable_fts(['name','metadata'], create_triggers=True, tokenize='porter', replace=True)
    return _docs

In [None]:
# | export
@patch
def mk_content(self:Database,  # database connection
               name:str=None,  # table name
               **kw,  # additional args to pass to fastlite create
               ):
    "Make a sql table for content storage with FTS5 and vector search capabilities"
    _content = self.t[name].create(id=int, doc_id=str, doc_name=str, chunk_id=int, start_index=int, end_index=int,
        tokens=int, model=str, content=str, embedding=bytes, metadata=str, uploaded_at=float, if_not_exists=True, pk='id',
        context=str, defaults=dict(uploaded_at='CURRENT_TIMESTAMP'), not_null=['doc_id','doc_name','content'], **kw)
    if not _content.detect_fts():
        _content.enable_fts(['doc_id','doc_name','content'], create_triggers=True, tokenize='porter', replace=True)
    return _content

In [None]:
# | export
@patch
def quick_store(self:Database,  # database connection
                docs_tbl:str='docs',  # docs table name
                content_tbl:str='content',  # content table name
                ):
    "Quickly set up both docs and content tables in the database"
    return self.mk_docs(docs_tbl), self.mk_content(content_tbl)

In [None]:
#| export
def setup_db(pth_or_uri:str='thedu.db',  # the database name or URL
             wal:bool=True,  # use WAL mode
             sem_search:bool=True,  # enable usearch extensions
             **kw,  # additional args to pass to apswutils database
             ) -> Database:
    '''Set up a database connection and, when `sem_search` is true, load the usearch SQLite extension.

    The parent directory of a plain filesystem path is created if it is missing.
    Refer to the usearch docs on SQLite plugins here: <https://unum-cloud.github.io/USearch/sqlite/index.html>'''

    if isinstance(pth_or_uri, (str, Path)):
        target = str(pth_or_uri)
        # `:memory:` and `file:` URIs are not plain filesystem paths, so no directory is created for them
        if target != ':memory:' and not target.startswith('file:'):
            # `parents=True` so nested locations such as `data/db/thedu.db` work on a fresh checkout
            Path(target).parent.mkdir(parents=True, exist_ok=True)
    _db = Database(pth_or_uri, **kw)
    if wal: _db.enable_wal()
    if not sem_search: return _db
    # Imported lazily so that usearch is only required when semantic search is requested
    from usearch import sqlite_path
    _db.conn.enableloadextension(True)
    # Always switch extension loading back off, even when loading the extension fails
    try: _db.conn.loadextension(sqlite_path())
    finally: _db.conn.enableloadextension(False)
    return _db

In [None]:
#| export
class Chunk:
    "Shape of a text chunk accepted by `store`; it only declares the attributes, it does not implement them"
    id:str  # chunk identifier
    text:str  # chunk text, stored in the `content` column
    start_index:int  # stored in the integer `start_index` column
    end_index:int  # stored in the integer `end_index` column
    token_count:int  # stored in the integer `tokens` column
    context:str  # stored in the `context` column
    embedding:'np.ndarray'  # `store` serialises it with `.tobytes()`

In [None]:
#| export
@patch
def store(self: Database, # database connection
          chunks: list[Chunk], # chunks to be stored
          name:str, # name of the document
          path:os.PathLike|str=None, # path to the document
          metadata:dict=None, # metadata dictionary
          doc_tbl:str='docs', # doc table name
          content_tbl:str='content' # content table name
          ):
    '''Store text chunks into the vector + FTS store.

    Inserts one row for the document into `doc_tbl`, then one row per chunk into `content_tbl`,
    each linked to the document through `doc_id`. Both tables are created on demand.'''
    # Prefer ujson when it is installed, otherwise fall back to the standard library
    try: import ujson as json
    except ImportError: import json
    doc,content = self.quick_store(doc_tbl, content_tbl)
    # Keep a missing path as NULL instead of storing the literal text 'None'
    doc_path = None if path is None else str(path)
    dr = doc.insert(dict(name=name, path=doc_path, metadata=json.dumps(metadata or {})))
    rows = [dict(doc_id=dr['id'], doc_name=name, content=c.text,
                 # a chunk without an embedding is stored with a NULL embedding
                 embedding=None if c.embedding is None else c.embedding.tobytes(),
                 tokens=c.token_count, start_index=c.start_index, end_index=c.end_index, context=c.context)
            for c in (chunks or [])]
    content.insert_all(rows)

In [None]:
# | export
@patch
def search(self: Database, q, emb, columns:list=None, where:str=None, where_args:dict=None, lim=50, docs_tbl='docs', content_tbl='content', rerank=False):
    if not q.strip(): return None
    doc, content = self.quick_store(docs_tbl, content_tbl)
    fts = dict2obj(L(content.search(q, order_by='rank', columns=columns, limit=lim, where=where, where_args=where_args)))
    vecs = L(dict2obj(content(select=','.join(columns), where=f'embedding is not null {'AND ' + where if where else ''}',
                   where_args=dict(qvec=emb, **(where_args or {})), order_by='distance_cosine_f32(embedding, :qvec)',limit=lim)))
    if not rerank: return dict(fts=fts, vec=vecs)
    ranked = (fts + vecs).groupby('content')
    return L(ranked.items()).map(lambda kv: first(kv[1]))

In [None]:
#| hide
def test_mk_docs():
    "Check that `quick_store` creates the `docs` and `content` tables with FTS enabled on both"
    db = setup_db(':memory:')
    db.quick_store()
    expected = {'docs','content'}
    table_names = {t.name for t in db.t}
    assert expected & table_names == expected
    assert db.t.docs.detect_fts()
    assert db.t.content.detect_fts()
test_mk_docs()

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()  # run nbdev's export step for this notebook project