### Introduction
We often have to jump through a whole bunch of hoops to get documents processed and ready to be searched.
`litesearch` plans to make this as easy as possible by providing simple building blocks to set up a database with FTS5 and vector search capabilities.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from fastcore.all import first, dict2obj, L, Path, Generator
from fastlite import Database, patch, Optional, Union, Iterable
from apswutils.utils import cursor_row2dict
import numpy as np

In [None]:
#| export
@patch
def query(self: Database, sql: str, params: Optional[Union[Iterable, dict]] = None) -> Generator[dict, None, None]:
    '''Run `sql` and lazily yield the result rows, each one converted by `cursor_row2dict`.

    A dict `params` is handed to `execute` unchanged; any other iterable is turned into a tuple,
    and a missing or empty `params` becomes an empty tuple.
    '''
    if isinstance(params, dict): bindings = params
    else: bindings = tuple(params) if params else ()
    rows = self.execute(sql, bindings)
    rows.row_trace = cursor_row2dict  # convert every row as it is fetched from the cursor
    yield from rows

> Simple Docs table setup

In [None]:
# | export
@patch
def get_store(self:Database,    # database connection
            name:str='store',   # table name
            hash:bool=False,    # whether to create hash index on content
            **kw,               # additional args to pass to fastlite create
):
    "Create the content table `name` if it is missing, enable FTS on `content` and `metadata` when no FTS table is detected, and return the table"
    schema = dict(content=str, embedding=bytes, metadata=str, uploaded_at=float,
                  defaults=dict(uploaded_at='CURRENT_TIMESTAMP'), pk='id')
    # hashed stores take their `id` from a hash of `content`; plain stores use an integer `id` and require `content`
    extra = dict(hash_id='id', hash_id_columns=['content']) if hash else dict(id=int, not_null=['content'])
    schema.update(extra)
    tbl = self.t[name].create(**schema, if_not_exists=True, **kw)
    if tbl.detect_fts(): return tbl
    tbl.enable_fts(['content','metadata'], create_triggers=True, tokenize='porter', replace=True)
    return tbl

In [None]:
#| export
def database(pth_or_uri:str=':memory:',     # the database name or URL
             wal:bool=True,                 # use WAL mode
             sem_search:bool=True,          # enable usearch extensions
             **kw,                          # additional args to pass to apswutils database
             ) -> Database:
    '''Set up a database connection and load usearch extensions.

    For a plain filesystem path the parent directory is created first (including any missing
    intermediate directories). `':memory:'` and `file:` URIs are passed to `Database` untouched.
    With `sem_search=False` the connection is returned without loading the usearch extension.
    '''
    if isinstance(pth_or_uri, (str, Path)):
        target = str(pth_or_uri)
        # ':memory:' and 'file:' URIs do not name a plain filesystem location, so there is no directory to prepare
        if target != ':memory:' and not target.startswith('file:'):
            Path(target).parent.mkdir(parents=True, exist_ok=True)
    _db = Database(pth_or_uri, **kw)
    if wal: _db.enable_wal()
    if not sem_search: return _db
    from usearch import sqlite_path
    _db.conn.enableloadextension(True)
    # always switch extension loading back off, even when loading the usearch extension fails
    try: _db.conn.loadextension(sqlite_path())
    finally: _db.conn.enableloadextension(False)
    return _db

In [None]:
# | export
def _rrf_fuse(rankings, key='content', k=60):
    'Merge ranked row lists with reciprocal rank fusion: a row scores the sum of 1/(k+rank) over the lists it appears in.'
    scores, rows = {}, {}
    for ranking in rankings:
        seen = set()  # a key only counts once per list, at its best rank
        for rank, row in enumerate(ranking, start=1):
            ident = getattr(row, key)
            if ident in seen: continue
            seen.add(ident)
            scores[ident] = scores.get(ident, 0.0) + 1.0 / (k + rank)
            rows.setdefault(ident, row)  # keep the first row seen for each key
    # sorted() is stable, so rows with equal scores keep first-seen order (fts hits before vector hits)
    return [rows[ident] for ident in sorted(scores, key=scores.get, reverse=True)]

@patch
def search(self: Database,  # database connection
           q:str,  # query string
           emb:bytes,  # embedding vector
           columns:list=None,  # columns to return
           where:str=None,  # additional where clause
           where_args:dict=None,  # args for where clause
           limit:int|None=50,  # limit on number of results
           offset:int|None=None,  # offset for results
           table_name='store',  # table name
           emb_col='embedding',  # embedding column name
           emb_metric:str='cosine',  # embedding distance metric (cosine,sqeuclidean,inner,divergence)
           rrf=True,  # rerank the combined results with reciprocal rank fusion
           dtype=np.float16,  # embedding dtype
           ):
    '''Search the litesearch store with fts and vector search combined.

    Returns `None` when `q` is blank. With `rrf=False` returns `dict(fts=[...], vec=[...])` holding the
    two result lists separately. Otherwise returns one list in which rows are de-duplicated on `content`
    (so `content` must be among `columns`), ordered by reciprocal rank fusion score and cut to `limit`.
    `limit`, `offset`, `where` and `where_args` are applied to both the fts and the vector query.
    '''
    if not q.strip(): return None
    content = self.get_store(table_name)
    if not columns: columns = ['content', 'metadata', 'embedding']
    fts = content.search(q, order_by='rank', columns=columns, limit=limit, offset=offset,
                         where=where, where_args=where_args, quote=True)
    # suffix of the distance function name; any dtype other than int8/float16/float64 is treated as float32
    df='i8' if dtype==np.int8 else 'f16' if dtype==np.float16 else 'f64' if dtype==np.float64 else 'f32'
    vecs = content(select=','.join(columns), where=f'{emb_col} is not null' + (' AND ' + where if where else ''),
        where_args=dict(qvec=emb, **(where_args or {})), order_by=f'distance_{emb_metric}_{df}({emb_col}, :qvec)', limit=limit, offset=offset)
    if not rrf: return dict(fts=list(fts), vec=vecs)
    fused = _rrf_fuse([dict2obj(L(fts)), L(dict2obj(vecs))])
    return fused[:limit]

Let's test it out. We will create a database, run embedding comparisons, create a store and run a search.

In [None]:
# all defaults: ':memory:' database, WAL requested, usearch extension loaded
db = database()

The fastlite database is set up with usearch extensions. Let's run some distance calculations.

In [None]:
# three constant 100-dim float32 vectors: all ones, all zeros, all 0.25
embs = {
    label: np.full((100,), fill, dtype=np.float32).tobytes()
    for label, fill in (('v1', 1), ('v2', 0), ('v3', 0.25))
}

def dist_q(metric):
    "Run the `distance_{metric}_f32` SQL function on each pair of the vectors in `embs`"
    sql = f'''
		select
			distance_{metric}_f32(:v1,:v2) as {metric}_v1_v2,
			distance_{metric}_f32(:v1,:v3) as {metric}_v1_v3,
			distance_{metric}_f32(:v2,:v3) as {metric}_v2_v3
	'''
    return db.q(sql, embs)

for fn in ('sqeuclidean', 'divergence', 'inner', 'cosine'):
    print(dist_q(fn))

[{'sqeuclidean_v1_v2': 100.0, 'sqeuclidean_v1_v3': 56.25, 'sqeuclidean_v2_v3': 6.25}]
[{'divergence_v1_v2': 34.657352447509766, 'divergence_v1_v3': 12.046551704406738, 'divergence_v2_v3': 8.66433334350586}]
[{'inner_v1_v2': 1.0, 'inner_v1_v3': -24.0, 'inner_v2_v3': 1.0}]
[{'cosine_v1_v2': 1.0, 'cosine_v1_v3': 0.0, 'cosine_v2_v3': 1.0}]


In [None]:
db.get_store()
if 'store' in db.t: print('store is created')
print('detected fts table: ',db.t.store.detect_fts())
# build the query vector as float16, the dtype `search` assumes by default (np.zeros alone would give float64)
print('Search results:', len(db.search('h',np.zeros((100,),dtype=np.float16).tobytes()))) # there is no data yet, so should be 0

store is created
detected fts table:  store_fts
Search results: 0


We can also create a store with a hash index on content. This is useful for code search applications.

In [None]:
# (content, fill value) pairs; every embedding is a constant 100-dim float16 vector
samples = [('hello world', 1), ('hi there', 0.5), ('goodbye now', 0)]
rows = [dict(content=text, embedding=np.full((100,), fill, dtype=np.float16).tobytes())
        for text, fill in samples]
st = db.get_store(name='my_store', hash=True)
st.insert_all(rows, upsert=True, hash_id='id')
st(select='id,content')

[{'id': '250ce2bffa97ab21fa9ab2922d19993454a0cf28', 'content': 'hello world'},
 {'id': 'c89f43361891bfab9290bcebf182fa5978f89700', 'content': 'hi there'},
 {'id': '882293d5e5c3d3e04e8e0c4f7c01efba904d0932', 'content': 'goodbye now'}]

Let's run a search again.

In [None]:
# rrf=False returns the full-text hits and the vector hits as two separate lists
hello_vec = np.full((100,), 0.25, dtype=np.float16).tobytes()
db.search(q='hello', emb=hello_vec, columns=['content'], table_name='my_store', limit=2, rrf=False)

{'fts': [{'content': 'hello world'}],
 'vec': [{'content': 'hello world'}, {'content': 'hi there'}]}

Now, let's try the same but with a broader query.

In [None]:
# rrf is left at its default (True), so a single merged list comes back
zero_vec = np.full((100,), 0, dtype=np.float16).tobytes()
db.search(q='goodbye OR hi', emb=zero_vec, columns=['content'], table_name='my_store', limit=2)

[{'content': 'goodbye now'}, {'content': 'hello world'}]

You can use different kinds of embedding metrics as well. The default is `cosine`. Let's try the `divergence` distance.

In [None]:
# same query as before, but the vector hits are ordered by the `divergence` distance function
zero_vec = np.full((100,), 0, dtype=np.float16).tobytes()
db.search(q='goodbye OR hi', emb=zero_vec, columns=['content'], table_name='my_store', limit=2,
          emb_metric='divergence')

[{'content': 'goodbye now'}, {'content': 'hi there'}]

In [None]:
#| hide
import nbdev
nbdev.nbdev_export()