### Simple RAG with litesearch
> We will build a simple RAG pipeline with litesearch. Do not be deceived: we are doing a whole bunch of heavy lifting under the hood with very little code.

In [1]:
import asyncio
from fastcore.all import *
from fastlite import *
import numpy as np
import re
from selectolax.lexbor import LexborHTMLParser as HTMLParser
from litesearch import *
import pymupdf

Applying usearch macOS fix if required...
usearch dylib path:  /home/karthik/code/litesearch/.venv/bin/usearch_binaries/usearch_sqlite.dylib
Not on macOS, skipping usearch fix.


Let's set the db up. This db has usearch loaded, so you can run cosine distance calculations using SIMD (meaning fast, real fast).

In [2]:
# Create the database handle used by every later cell.
db:Database=setup_db('breugel.db')

In [3]:
# Sanity check of the SQL distance function: a 512-element float16 vector of ones
# against a 512-element float16 vector of zeros, both passed as raw bytes.
db.q(
    'select distance_cosine_f16(:vec1,:vec2)',
    dict(
        vec1=np.ones(512, np.float16).tobytes(),
        vec2=np.zeros(512, np.float16).tobytes(),
    ),
)

[{'distance_cosine_f16(:vec1,:vec2)': 1.0}]

There are many more functions you can run now. Check out: https://unum-cloud.github.io/USearch/sqlite/index.html

#### Ingest PDF documents
> We will ingest sample PDF documents from Bruegel. We will restrict it to a small sample (10 PDFs by default), but there are about 1800 in total. This is to showcase real-world uses of litesearch.
> We will read the PDF document using the `read_pdf` function from the `litesearch.ingest` module.
> We will then scrape the URLs from the PDF, recursively collect all the PDFs behind those links, and ingest them as well.

In [4]:
#| export
def arun(coro):
    '''Run coroutine `coro` to completion from synchronous code and return its result.

    With no event loop running in the current thread this is plain `asyncio.run(coro)`.
    When a loop is already running, `asyncio.run` cannot be called from this thread,
    so the coroutine is run on a fresh loop inside a worker thread while the caller
    blocks on the result. Exceptions raised by `coro` propagate to the caller.
    '''
    import concurrent.futures
    # Only the presence of a running loop matters; the loop object itself is not needed.
    try: asyncio.get_running_loop()
    except RuntimeError: return asyncio.run(coro)
    # We're in a running loop -> use a temporary loop in a single worker thread
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()


class BruegelDataset:
    ''' Dataset for Bruegel PDF documents.'''
    URL = 'https://www.bruegel.org/system/files/2024-06/Bruegel_factsheet_2024_0.pdf'
    URI_SCHEMA = r'^http://data\.europa\.eu/eli/(?P<typedoc>[^/]+)/(?P<year>\d{4})/(?P<natural_number>\d+)/(?P<date>\d{4}-\d{2}-\d{2})/(?P<lang>[a-z]{2,3})/pdfa2a$'
    def __init__(self, dest:Path=Path('bruegel_dataset'), lang='eng', sample_size=10):
        self.dest = dest
        self.lang= lang
        self.sample_size = sample_size
        self.pdfs = self()


    @staticmethod
    def _is_pdf_link(l:str): return l.strip() and (l.lower().endswith('.pdf') or 'pdf' in l.lower())
    @staticmethod
    def url2name(url: str) -> str | None:
        if re.match(BruegelDataset.URI_SCHEMA, url):
            m = re.match(BruegelDataset.URI_SCHEMA, url)
            return f"{m['typedoc']}_{m['year']}_{m['natural_number']}_{m['date']}_{m['lang']}.pdf"
        return url.split('/')[-1] if url.split('/')[-1] != '' else url.rstrip('/').split('/')[-1]+'.html'

    def _get_meta(self, r):
        try:
            meta = HTMLParser(r).tags('meta')
            nodes = L([dict2obj(m.attributes) for m in meta]).filter(
                lambda m: 'about' in m
            ).filter(
                lambda m: ('property' in m and m['property'].lower() in ['eli:is_embodied_by','eli:title'])
            ).filter(
                lambda m: ('resource' in m and 'pdfa2a' in m['resource'].lower()) or 'content' in m
            )
            if self.lang: nodes = nodes.filter(lambda m: f'/{self.lang}' in m['about'])
            nodes = nodes.groupby('about')
            for k in nodes: nodes[k] = merge(*nodes[k])
            return L([(nodes[n]['resource'], nodes[n]['content']) for n in nodes])
        except: pass

    def read_link(self, l, dest=None):
        try: return self.save_pdf(l, dest).read_text()
        except: return ''

    def save_pdf(self, lnk:str, dest:Path=None) -> Path | None:
        try:
            if not lnk : return None
            if not dest: dest = self.dest
            p = dest / self.url2name(lnk)
            if not (p.exists() and p.stat().st_size > 1024): p = urlsave(lnk, p)
            return p
        except Exception as ex: print(ex); return None

    def __call__(self) -> L:
        '''Make a dataset of documents from a pdf url and all linked pdfs.'''

        def get_linked_pdfs(doc, filter=True):
            links = doc.get_links().attrgot('uri')
            if filter: links = links.filter(self._is_pdf_link)
            return links

        pth = self.dest / self.url2name(self.URL)
        if not pth.exists(): pth = urlsave(self.URL, self.dest / self.url2name(self.URL))
        main_doc = pymupdf.open(pth)
        pdf_lp = Path(self.dest / 'pdf_links.txt')
        if not pdf_lp.exists():
            links = get_linked_pdfs(main_doc, filter=False)
            print(f'Found {len(links)} linked pdfs.')
            all_c = parallel(self.read_link, links, threadpool=True, dest=self.dest/'links')
            pdf_links = parallel(self._get_meta, all_c, threadpool=True).concat().unique()
            print(f'Found {len(pdf_links)} external pdf links.')
            if len(pdf_links) > 0: pdf_lp.mk_write('\n'.join([f'{l[0]},{l[1]}' for l in pdf_links]))
        pdf_subset = L(pdf_lp.readlines())
        if self.lang: pdf_subset = pdf_subset.filter(lambda p: self.lang in p)
        pdf_subset = pdf_subset[:self.sample_size]
        url2tit = {l.split(',')[0]: l.split(',')[1] for l in pdf_subset}
        pdf_list = L([l.split(',')[0] for l in pdf_subset])
        name2url = {self.url2name(l): l for l in pdf_list}
        pdf_set = set(pdf_list.map(self.url2name))
        downloaded_pdfs = globtastic(self.dest / 'pdfs', file_glob='*.pdf', func=Path).filter(lambda m: m.stat().st_size > 1024).map(lambda p: p.name)
        fetch_list = [name2url.get(d) for d in pdf_set.difference(downloaded_pdfs)]
        if fetch_list:
            print(f'Downloading {len(fetch_list)} new pdfs...')
            parallel(self.save_pdf, fetch_list, dest=self.dest/'pdfs', n_workers=10)
        return globtastic(self.dest / 'pdfs', file_glob='*.pdf', func=Path).filter(lambda m: m.name in name2url).map(lambda m: AttrDict(path=m, title=url2tit[name2url[m.name]]))

In [5]:
# Constructing the dataset runs the whole crawl/download pipeline; use up to 100 cached link entries.
b = BruegelDataset(sample_size=100)

Downloading 1 new pdfs...


Let's load the dataset. It takes a while the first time around, as it downloads the links from about 10 PDFs. Check your examples folder for `bruegel_dataset`.

In [6]:
# Report how many PDFs ended up in the dataset.
print(f'no. of pdfs:  {len(b.pdfs)}')

no. of pdfs:  24


Let's read the first pdf.

In [7]:
# Take the first dataset entry (an AttrDict with `path` and `title`).
doc = first(b.pdfs)

Now let's load the PDFs into the db. The db is already patched with a whole bunch of syntactic sugar. Check out `litesearch.data` for more info.

In [8]:
# db.pdf_ingest(first(b.pdfs).path)

The above will work, but it will load a new chunking pipe every time. A more efficient way is to get the default pipe or create your own. We use chonkie for chunking and embedding. To reduce the number of documents, we're also filtering out non-English docs. Check out their docs for more information: https://docs.chonkie.ai/oss

In [9]:
# Build the chunking function once so that every ingest call below shares it.
fn = chunk_fn()

In [10]:
# Ingest every dataset PDF with the shared chunking function; the echoed list is just each call's return value.
[db.pdf_ingest(p.path, fn) for p in b.pdfs]

[32m2025-11-16 10:11:55.902[0m | [34m[1mDEBUG   [0m | [36mchonkie.chunker.base[0m:[36m__init__[0m:[36m32[0m - [34m[1mInitialized RecursiveChunker[0m
[32m2025-11-16 10:11:55.902[0m | [34m[1mDEBUG   [0m | [36mchonkie.chunker.recursive[0m:[36mchunk[0m:[36m368[0m - [34m[1mStarting recursive chunking for text of length 847[0m
[32m2025-11-16 10:11:55.903[0m | [1mINFO    [0m | [36mchonkie.chunker.recursive[0m:[36mchunk[0m:[36m370[0m - [1mCreated 1 chunks using recursive chunking[0m
[32m2025-11-16 10:11:57.869[0m | [34m[1mDEBUG   [0m | [36mchonkie.chunker.base[0m:[36m__init__[0m:[36m32[0m - [34m[1mInitialized SemanticChunker[0m
[32m2025-11-16 10:11:57.870[0m | [34m[1mDEBUG   [0m | [36mchonkie.chunker.semantic[0m:[36mchunk[0m:[36m531[0m - [34m[1mStarting semantic chunking for text of length 847[0m
[32m2025-11-16 10:11:57.871[0m | [34m[1mDEBUG   [0m | [36mchonkie.chunker.semantic[0m:[36mchunk[0m:[36m535[0m - [34m[1m

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

This can be made even faster with chonkie pipelines, where you can pass a bunch of texts together, chunk all docs in batches first, and then push them to the db. Check out https://docs.chonkie.ai/oss/pipelines

Cool, let's search through these contents

In [11]:
# Handle to the `content` table that the search cells below query.
content = db.table('content') # can also use db.t.content

In [15]:
# Pre-configure the table's search: order by rank, return id/content/metadata, at most 50 rows.
fts_search = bind(content.search, order_by='rank', columns=['id','content', 'metadata'], limit=50)

In [16]:
# Query text reused by the FTS, vector-search and rerank cells below.
q = 'economic growth in Europe'

In [17]:
# Run the raw query through full-text search and materialise the rows.
fts_r = list(fts_search(q))  # FTS search
print(len(fts_r), 'results got')

4 results got


As you can see, there is no exact term `economic growth in Europe` anywhere in the texts. Let's widen the search. FTS allows globs like `*` and the `OR` operator. There is so much you can tune with apsw FTS. We provide strong defaults (always customisable) with the `pre` function.

In [18]:
# Same search, but with the query rewritten by `pre` first.
fts_r = dict2obj(L(fts_search(pre(q))))
print(len(fts_r), 'results got')

50 results got


This is just scratching the surface; apsw puts FTS on steroids. Look forward to more updates to litesearch along these lines. To know more about apsw FTS, refer to: https://rogerbinns.github.io/apsw/example-fts.html

Okay, we've got some results. Let's get some vector search results.

In [19]:
from chonkie import AutoEmbeddings

Chonkie exposes model2vec embeddings which are static embeddings. Static embeddings are 500 times smaller and 50 times faster with a very small performance loss. Potion-retrieval-32M is a good model imo. Learn more at: https://github.com/MinishLab/model2vec

In [20]:
# Embedding callable for the potion-retrieval-32M model; later cells call it on the query text.
embedding_fn = AutoEmbeddings().get_embeddings('minishlab/potion-retrieval-32M').embed

In [21]:
# Vector search: among rows that have an embedding, order by cosine distance
# to the query embedding and keep the 50 closest.
vec_r = L(dict2obj(content(
    select='id, content, metadata',
    where='embedding is not null',
    where_args=dict(qvec=embedding_fn(q).tobytes()),
    order_by='distance_cosine_f32(embedding, :qvec)',
    limit=50,
)))
# Report the size of the vector-search result set (not the FTS one).
print(len(vec_r), 'results got')

50 results got


Cool, we see results. These are ways to get the results separately. The underlying fastlite and apswutils wrappers over sqlite are powerful and allow you to manipulate the db in efficient ways. Check out: https://github.com/AnswerDotAI/fastlite and https://github.com/AnswerDotAI/apswutils/tree/main

#### litesearch provides a search method which reranks the results from both FTS and vector search using Reciprocal Rank Fusion (RRF)
> You can always turn it off.

In [22]:
# Combined search: pass both the rewritten FTS query and the query embedding; rrf=True enables the RRF reranking.
res=db.search(pre(q), embedding_fn(q).tobytes(), columns=['id', 'content', 'metadata'], rrf=True)

In [23]:
# Show the top result of the combined search.
print(first(res))

{'id': 17518, 'content': 'Property Rights (IPR) appli\xad\ncations \nInnovations - \nNumber of innovations \nresulting from the projects \nfunded by the Programme \n(by type of innovation) \nincluding from awarded IPRs \nEconomic growth - \nCreation, growth & market \n', 'metadata': '{"tokens": 79, "start_index": 1763, "end_index": 1947, "context": "Economic growth - \\nCreation, growth & market \\n", "pg_no": 71, "doc_name": "CL2021R0695EN0010010.0001.3bi_cp 1..1", "doc_page_count": 73, "doc_metadata": {"format": "PDF 1.7", "title": "CL2021R0695EN0010010.0001.3bi_cp 1..1", "author": "Publications Office", "subject": " ", "keywords": "", "creator": "Arbortext Advanced Print Publisher 10.0.1465/W Unicode", "producer": "3-Heights(TM) PDF to PDF-A Converter Shell 4.7.24.2 (http://www.pdf-tools.com)", "creationDate": "D:20240320171849+05\'00\'", "modDate": "D:20240326180552+01\'00\'", "trapped": "", "encryption": null}, "doc_toc": [[1, "Consolidated text: Regulation\x92(EU) 2021/695 of the

You do not need to, but you can now flashrank these results if needed. The Potion-retrieval model is a bge distilled model which is a fusion cross encoder with strong retrieval capabilities.

In [24]:
from flashrank import Ranker, RerankRequest

It's a good idea to understand what the max token count is for the documents in the db, so that we can set the max length of the ranker accordingly. If you set it too high, it does affect performance. So, the ideal is to set it to max tokens + 100-200.

In [25]:
# Largest `tokens` value stored in the chunk metadata; used to size the ranker below.
max_token = int(db.q("select max(json_extract(metadata, '$.tokens')) as m from content")[0]['m'])

In [26]:
# Leave 150 tokens of headroom over the largest stored chunk.
ranker = Ranker(max_length=max_token+150)

INFO:flashrank.Ranker:Downloading ms-marco-TinyBERT-L-2-v2...
ms-marco-TinyBERT-L-2-v2.zip: 100%|██████████| 3.26M/3.26M [00:00<00:00, 12.0MiB/s]


In [27]:
# Rerank the combined results with flashrank; each passage carries the chunk text and its id.
res1=ranker.rerank(RerankRequest(q, res.map(lambda r: dict(text=r.content, id=r.id))))

In [28]:
# Show the top flashrank result.
print(first(res1))

{'text': "\nTo address future societal challenges, embrace the opportunities of new tech\xad\nnologies and contribute to environmentally friendly and sustainable economic \ngrowth, jobs, competitiveness and the well-being of Europe's citizens, there is \nthe need to further strengthen Europe's capacity to innovate by: strengthening \n", 'id': 17262, 'score': np.float32(0.9951793)}


Now, let's compare flashrank results and litesearch rerank results.

In [30]:
# Print the ids side by side: litesearch order on the left, flashrank order on the right.
for fused, reranked in zip(res, res1):
    print(fused['id'], reranked['id'])

17518 17262
15288 14858
15289 15289
14800 17519
14632 15284
14857 17183
17202 16316
17519 14237
14858 14639
14642 17182
14633 14633
15625 10732
15284 14642
16326 16298
14801 12750
15278 13128
15624 13139
14638 14780
14639 14028
17514 16751
17398 11831
17182 17203
17203 17518
17298 14801
16298 15278
15195 12447
17262 13126
16751 14439
16327 14029
16306 15288
10732 17514
16312 14632
16316 15625
17516 15639
17183 8396
17399 2028
17517 1410
17299 17137
13128 11972
14237 14800
14244 15600
11759 17136
11758 17202
11830 16324
11760 14638
5337 16312
11862 14857
11971 13970
12029 14244
11853 15195
15746 13781
14439 16672
14780 16315
14029 15746
15745 16327
8396 15602
11831 17399
16673 17299
13970 12817
13781 16673
15602 17398
2028 16280
12447 12501
13139 12837
11972 13131
16315 17517
19971 12496
12837 16306
16324 15745
13126 17298
10657 10657
1410 5337
12817 11759
17137 17516
13131 15624
15600 16326
12750 11862
12496 19969
16280 19971
14028 11830
19969 11971
17136 11760
12501 11853
15639 11758


So there you have it: a simple RAG pipeline.