### Introduction
We often have to jump through a whole bunch of hoops to get documents processed and ready to be searched.
`litesearch` plans to make this as easy as possible by providing simple building blocks to set up a database with FTS5 and vector search capabilities.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from fastlite import *
from apswutils.utils import cursor_row2dict
import numpy as np

In [None]:
#| export
@patch
def query(self: Database, sql: str, params: Optional[Union[Iterable, dict]] = None) -> Generator[dict, None, None]:
    '''Run `sql` and lazily yield its result rows, each converted by `cursor_row2dict`'''
    # A dict is passed through as named bindings; any other iterable (or None) becomes a tuple of positional bindings
    if isinstance(params, dict): bindings = params
    else: bindings = tuple(params or ())
    rows = self.execute(sql, bindings)
    rows.row_trace = cursor_row2dict
    yield from rows

> Simple Docs table setup

In [None]:
# | export
@patch
def mk_store(self:Database,  # database connection
               name:str='content',  # table name
               **kw,  # additional args to pass to fastlite create
               ):
    "Create the `name` content table if it does not exist, make sure it has a full-text index, and return it"
    schema = dict(id=int, content=str, embedding=bytes, metadata=str, uploaded_at=float)
    store = self.t[name].create(**schema, if_not_exists=True, pk='id',
        defaults={'uploaded_at': 'CURRENT_TIMESTAMP'}, not_null=['content'], **kw)
    # Nothing more to do when a full-text index is already detected on the table
    if store.detect_fts(): return store
    store.enable_fts(['content','metadata'], create_triggers=True, tokenize='porter', replace=True)
    return store

In [None]:
#| export
def setup_db(pth_or_uri:str=':memory:',  # the database name or URL
             wal:bool=True,  # use WAL mode
             sem_search:bool=True,  # enable usearch extensions
             **kw,  # additional args to pass to apswutils database
             ) -> Database:
    '''Set up a database connection and, when `sem_search` is set, load the usearch SQLite extension. You can refer to the usearch docs on sqlite plugins here: <https://unum-cloud.github.io/USearch/sqlite/index.html>'''

    if isinstance(pth_or_uri, (str, Path)):
        _loc = str(pth_or_uri)
        # Only plain filesystem paths get a parent directory: ':memory:', the empty name and
        # `file:` URIs must not be turned into directories on disk.
        if _loc and _loc != ':memory:' and not _loc.startswith('file:'):
            # parents=True so nested locations such as 'data/index/docs.db' work on first use
            Path(_loc).parent.mkdir(parents=True, exist_ok=True)
    _db = Database(pth_or_uri, **kw)
    if wal: _db.enable_wal()
    if not sem_search: return _db
    # Lazy initialization: apply usearch fix only when semantic search is enabled
    from .postfix import usearch_fix
    usearch_fix()
    from usearch import sqlite_path
    _db.conn.enableloadextension(True)
    # Always switch extension loading back off, even when loading the usearch extension fails
    try: _db.conn.loadextension(sqlite_path())
    finally: _db.conn.enableloadextension(False)
    return _db

In [None]:
# | export
@patch
def search(self: Database,      # database connection
           q:str,               # query string
           emb:bytes,    # embedding vector
           columns:list=None,   # columns to return
           where:str=None,      # additional where clause
           where_args:dict=None,# args for where clause
           lim=50,              # limit on number of results
           tbl='content',       # table name
           emb_col='embedding', # embedding column name
           rrf=True,            # need to rerank results with reciprocal rank fusion
           dtype=np.float16     # embedding dtype
           ):
    '''Search `tbl` with both a full-text query `q` and a cosine-distance ordering against `emb`.

    Returns `None` when `q` is blank. With `rrf` false, returns `dict(fts=..., vec=...)` holding the two
    result lists separately. Otherwise returns at most `lim` rows, de-duplicated on their `content` value and
    ordered by reciprocal rank fusion of the two rankings (so `content` must be among `columns`).'''
    if not q.strip(): return None
    content = self.mk_store(tbl)
    if not columns: columns = ['content', 'metadata', 'embedding']
    fts = content.search(q, order_by='rank', columns=columns, limit=lim, where=where, where_args=where_args, quote=True)
    # Suffix of the `distance_cosine_*` SQL function (presumably provided by the usearch extension);
    # any dtype other than int8/float16/float64 falls back to the f32 variant
    df='i8' if dtype==np.int8 else 'f16' if dtype==np.float16 else 'f64' if dtype==np.float64 else 'f32'
    vecs = content(select=','.join(columns), where=f'{emb_col} is not null' + (' AND ' + where if where else ''),
                   where_args=dict(qvec=emb, **(where_args or {})), order_by=f'distance_cosine_{df}({emb_col}, :qvec)',limit=lim)
    if not rrf: return dict(fts=list(fts), vec=vecs)
    # Reciprocal rank fusion: every ranking contributes 1/(k + rank) to a row's score, so rows that do well
    # in either ranking (or appear in both) rise to the top instead of FTS hits always preceding vector hits.
    rrf_k = 60  # conventional RRF smoothing constant
    scores, hits = {}, {}
    for ranking in (dict2obj(L(fts)), L(dict2obj(vecs))):
        seen = set()
        for rank, hit in enumerate(ranking, start=1):
            key = hit.content
            # Only the best-ranked occurrence of a given content counts within one ranking
            if key in seen: continue
            seen.add(key)
            scores[key] = scores.get(key, 0.0) + 1.0 / (rrf_k + rank)
            hits.setdefault(key, hit)
    # sorted() is stable, so equal scores keep first-seen order (FTS hits before vector-only hits)
    return [hits[key] for key in sorted(scores, key=scores.get, reverse=True)][:lim]

NameError: name 'patch' is not defined

In [None]:
#| hide
def test_mk_docs():
    "Smoke test: `mk_store` on a fresh in-memory database creates the `content` table with FTS detected"
    mem_db = setup_db(':memory:')
    mem_db.mk_store()
    tables = mem_db.t
    assert 'content' in tables, 'content table not created'
    assert tables.content.detect_fts()

test_mk_docs()

In [None]:
#| hide
# Run nbdev's export step for this project
import nbdev
nbdev.nbdev_export()