# core

> Building blocks for thedu

### Introduction
We often have to jump through a whole bunch of hoops to get documents processed and ready to be searched.
`thedu` plans to make this as easy as possible by providing simple building blocks to set up a database with FTS5 and vector search capabilities.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from fastlite import *
from apsw.fts5 import Table as FTable

In [None]:
#| export
def setup_db(pth_or_uri:str='thedu.db',  # the database name or URL
             wal:bool=True,  # use WAL mode
             sem_search:bool=True,  # enable usearch extensions
             **kw,  # additional args to pass to apswutils database
             ) -> Database:
    '''Set up a database connection and, when `sem_search` is true, load the usearch extension. You can refer to the usearch docs on sqlite plugins here: <https://unum-cloud.github.io/USearch/sqlite/index.html>'''

    # Make sure the directory that will hold the database file exists, including any
    # missing intermediate directories. `file:` URIs are skipped because treating the
    # URI text as a filesystem path would create bogus directories.
    if isinstance(pth_or_uri, (str, Path)) and not str(pth_or_uri).startswith('file:'):
        Path(pth_or_uri).parent.mkdir(parents=True, exist_ok=True)
    _db = Database(pth_or_uri, **kw)
    if wal: _db.enable_wal()
    if not sem_search: return _db
    # Imported lazily so usearch is only required when semantic search is requested
    from usearch import sqlite_path
    _db.conn.enableloadextension(True)
    # Always switch extension loading back off, even if loading the plugin fails
    try: _db.conn.loadextension(sqlite_path())
    finally: _db.conn.enableloadextension(False)
    return _db

> Simple Docs table setup

In [None]:
# | export
class Docs:
    "Field declarations passed to `db.create` for the docs table in `mk_docs`"
    name: str
    path: str
    uploaded_at: float
class Content:
    "Field declarations passed to `db.create` for the content table in `mk_docs`"
    id: int
    doc_id: str
    doc_name: str
    content: str
    embedding: bytes
    metadata: str
    uploaded_at: float
def mk_docs(db:Database=None, # database connection; when omitted a new `setup_db()` connection is opened
        ) -> Database:
    "Make a sql table with FTS5 and vector search capabilities"
    # Open the default connection at call time rather than in the signature, so that
    # importing this module does not create a database file as a side effect.
    if db is None: db = setup_db()
    # `hash_id_columns` must be a real tuple: `('name')` is just the string 'name'
    db.docs = db.create(Docs, if_not_exists=True, transform=True, defaults=dict(uploaded_at='CURRENT_TIMESTAMP'),
                        not_null=['name','path'], hash_id='id', hash_id_columns=('name',))
    # NOTE(review): neither foreign key names a target column in `docs` - confirm that
    # `doc_name` resolves to the intended column.
    db.content = db.create(Content, if_not_exists=True, transform=True, defaults=dict(uploaded_at='CURRENT_TIMESTAMP'),
                           foreign_keys=[('doc_id', 'docs'), ('doc_name', 'docs')], not_null=['doc_id','doc_name','content'])
    fts_name = f'{db.content.name}_fts'
    # Reuse the FTS5 table if it already exists, otherwise create it over the content table
    if db.conn.table_exists('main', fts_name): db.search = FTable(db.conn, fts_name)
    else:
        db.search = FTable.create(db.conn, fts_name, columns=['content', 'metadata'], generate_triggers=True,
                                  content=db.content.name,
                                  tokenize=['json', 'include_keys', '0', 'simplify', 'casefold', 'true' , 'strip', 'true', 'unicodewords'])
    return db

In [None]:
#| hide
def test_mk_docs():
    "Check that `mk_docs` on an in-memory database yields the expected tables and an FTS5 search handle"
    db = mk_docs(setup_db(':memory:'))
    expected = {'docs','content','content_fts'}
    table_names = {t.name for t in db.t}
    # Every expected table must be present; extra tables are allowed
    assert expected <= table_names
    assert isinstance(db.search, FTable)
test_mk_docs()

In [None]:
#| hide
# Run nbdev's export step for this notebook
import nbdev
nbdev.nbdev_export()