### Simple RAG with Thedu
> We will build a simple rag with thedu. Do not be deceived. We are doing a whole bunch of heavylifting under the hood with very little code.

In [1]:
#| export
from fastcore.all import *
from fastlite import *
import numpy as np
import re
from selectolax.parser import HTMLParser
from thedu import *
from yake import KeywordExtractor

Let's set the db up. This db has usearch loaded. So, you can run cosine distance calculations using simd(means fast, real fast)

In [2]:
db:Database=setup_db('breugel.db')

In [3]:
db.q('select distance_cosine_f16(:vec1,:vec2)', dict(vec1=np.ones(512, np.float16).tobytes(), vec2=np.zeros(512, np.float16).tobytes()))

[{'distance_cosine_f16(:vec1,:vec2)': 1.0}]

There are way more functions you can run now. Checkout: https://unum-cloud.github.io/USearch/sqlite/index.html

#### Ingest a PDF document
> We will ingest a sample PDF document from Bruegel.
> We will read the PDF document using `read_pdf` function from thedu.ingest module.
> We will then scrape the urls from the pdf and then recursively get all pdf's off of those links and ingest them as well.'

In [4]:
#| export
class BruegelDataset:
    ''' Dataset for Bruegel PDF documents.'''
    URL = 'https://www.bruegel.org/system/files/2024-06/Bruegel_factsheet_2024_0.pdf'
    URI_SCHEMA = r'^http://data\.europa\.eu/eli/(?P<typedoc>[^/]+)/(?P<year>\d{4})/(?P<natural_number>\d+)/(?P<date>\d{4}-\d{2}-\d{2})/(?P<lang>[a-z]{2,3})/pdfa2a$'
    def __init__(self, dest:Path=Path('bruegel_dataset')):
        self.dest = dest
        self.pdfs = self()


    @staticmethod
    def _is_pdf_link(l:str): return l.strip() and (l.lower().endswith('.pdf') or 'pdf' in l.lower())
    @staticmethod
    def url2name(url: str) -> str | None:
        if re.match(BruegelDataset.URI_SCHEMA, url):
            m = re.match(BruegelDataset.URI_SCHEMA, url)
            return f"{m['typedoc']}_{m['year']}_{m['natural_number']}_{m['date']}_{m['lang']}.pdf"
        return url.split('/')[-1] if url.split('/')[-1] != '' else url.rstrip('/').split('/')[-1]+'.html'

    def _get_meta(self, r):
        try:
            meta = HTMLParser(r).tags('meta')
            nodes = L([dict2obj(m.attributes) for m in meta]).filter(
                lambda m: 'about' in m
            ).filter(
                lambda m: ('property' in m and m['property'].lower() in ['eli:is_embodied_by','eli:title'])
            ).filter(
                lambda m: ('resource' in m and 'pdfa2a' in m['resource'].lower()) or 'content' in m
            ).groupby('about')
            for k in nodes: nodes[k] = merge(*nodes[k])
            return L([(nodes[n]['resource'], nodes[n]['content']) for n in nodes])
        except: pass


    def read_link(self, l, dest=None):
        try:
	        p = self.save_pdf(l, dest)
	        return p.read_text()
        except: return ''

    def save_pdf(self, l:str, dest:Path=None) -> Path | None:
        try:
            if not l : return None
            if not dest: dest = self.dest
            p = dest / self.url2name(l)
            if not (p.exists() and p.stat().st_size > 1024): p = urlsave(l,p)
            return p
        except Exception as ex: print(ex); return None

    def __call__(self) -> L:
        '''Make a dataset of documents from a pdf url and all linked pdfs.'''

        def get_linked_pdfs(doc, filter=True):
            links = doc.map(lambda p: p.links.map(lambda l: l.uri)).concat().unique()
            if filter: links = links.filter(self._is_pdf_link)
            return links

        pth = self.dest / self.url2name(self.URL)
        if not pth.exists(): pth = urlsave(self.URL, self.dest / self.url2name(self.URL))
        main_doc = read_pdf(pth)
        pdf_lp = Path(self.dest / 'pdf_links')
        if not pdf_lp.exists():
            links = get_linked_pdfs(main_doc, filter=False)
            print(f'Found {len(links)} linked pdfs.')
            all_c = parallel(self.read_link, links, threadpool=True, dest=self.dest/'links')
            pdf_links = parallel(self._get_meta, all_c, threadpool=True).concat().unique()
            print(f'Found {len(pdf_links)} external pdf links.')
            pdf_lp.mk_write('\n'.join([f'{l[0]},{l[1]}' for l in pdf_links]))
        url2tit = {l.split(',')[0]: l.split(',')[1] for l in pdf_lp.readlines()}
        pdf_list = L([l.split(',')[0] for l in pdf_lp.readlines()])
        name2url = {self.url2name(l): l for l in pdf_list}
        pdf_set = set(pdf_list.map(self.url2name))
        downloaded_pdfs = globtastic(self.dest / 'pdfs', file_glob='*.pdf', func=Path).filter(lambda m: m.stat().st_size > 1024).map(lambda p: p.name)
        fetch_list = [name2url.get(d) for d in pdf_set.difference(downloaded_pdfs)]
        if fetch_list:
            print(f'Downloading {len(fetch_list)} new pdfs...')
            parallel(self.save_pdf, fetch_list, dest=self.dest/'pdfs')
        return globtastic(self.dest / 'pdfs', file_glob='*.pdf', func=Path).map(lambda m: AttrDict(path=m, title=url2tit[name2url[m.name]]))

In [5]:
b = BruegelDataset()

Downloading 1 new pdfs...


#### Patching is a great way to add functionalities to existing classes. It works on all python classes
Here we are modifying the title in metadata with a better value. We get it as part of the breugel web scraping.

In [126]:
#| export
@patch
def read_pdfs(self:BruegelDataset, fn=noop):
    def update_meta(p):
        if not fn(p): return None
        d = read_pdf(p.path)
        d['name'] = d.metadata.title
        d.metadata.title = p.title
        return d
    return maps(update_meta, self.pdfs)

Let's load the dataset. it takes a bit the first time around as it downloads about 800 pdfs. check your examples folder for `breugel_dataset`

In [8]:
print('no. of pdfs: ', len(b.pdfs))

no. of pdfs:  817


Let's read the first pdf.

In [128]:
doc = first(b.read_pdfs(lambda p: p.path.stem.endswith('_eng')))

Now let's load the pdfs into the db. The db is already patched with a whole bunch of syntactic sugars. Checkout `thedu.data` for more info

In [None]:
db.pdf_ingest(first(b.read_pdfs()))

The above will work, but will load a new chunking pipe everytime. An efficient way is to get the default pipe of create your own. We use chonkie for chunking and embedding. To reduce the number of documents we're also filtering out non english docs. Checkout their docs for more information: https://docs.chonkie.ai/oss

In [None]:
pipe = pdf_pipe()
only_eng = lambda p: p.path.stem.endswith('_eng')
[db.pdf_ingest(p, pipe) for p in b.read_pdfs(fn=only_eng) if p]

This can be made even faster with chonie pipelines where you can pass a bunch of texts together and chunk all docs in batches first and push them to db. Checkout https://docs.chonkie.ai/oss/pipelines

Cool, let's search through these contents

In [6]:
content = db.table('content') # can also use db.t.content
doc = db.t.docs

In [7]:
fts_search = bind(content.search, order_by='rank', columns=['id','doc_id', 'content'], limit=50)

In [8]:
q = 'economic growth in Europe'

In [9]:
fts_r = [r for r in fts_search(q)] # FTS search
print(len(fts_r), 'results got')

0 results got


As you can see, there is no exact term `economic growth in Europe` anywhere in the texts. Let's make the search wide. FTS allows globs like * and use OR. There is so much you can tune with apsw fts. We provide strong defaults (always customisable) with the `pre` function.

In [10]:
fts_r = dict2obj(L(fts_search(pre(q))))
print(len(fts_r), 'results got')

50 results got


This is just scratching the surface, apsw puts fts on steroids. Look forward for more updates to thedu along these lines. To know more about apsw fts, refer: https://rogerbinns.github.io/apsw/example-fts.html

Okay, We've got some results. Let's get some vec search results.

In [11]:
from chonkie import AutoEmbeddings

Chonkie exposes model2vec embeddings which are static embeddings. Static embeddings are 500 times smaller and 50 times faster with a very small performance loss. Potion-retrieval-32M is a good model imo. Learn more at: https://github.com/MinishLab/model2vec

In [12]:
embedding_fn = AutoEmbeddings().get_embeddings('minishlab/potion-retrieval-32M').embed

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
vec_r = L(dict2obj(content(select='id, doc_id, content', where='embedding is not null', where_args=dict(qvec=embedding_fn(q).tobytes()), order_by='distance_cosine_f32(embedding, :qvec)',limit=50)))
print(len(fts_r), 'results got')

50 results got


Cool, we see results. Now, these are ways to get these results separately. The underlying fastlite and apswutils wrappers over sqlite are powerful and allow you to manipulate the db in efficient ways. Checkout: https://github.com/AnswerDotAI/fastlite and https://github.com/AnswerDotAI/apswutils/tree/main

#### Thedu provides a search method which reranks the results from both FTS and vector search using Reciprocal Rank Fusion (RRF)
> You can always turn it off.

In [21]:
res=db.search(pre(q), embedding_fn(q).tobytes(), columns=['id','doc_id', 'content'], rerank=True)

In [27]:
[print(r['id'], r['content']) for r in res[:10]]

29855 Economic growth - 
Creation, growth & market 
shares of companies having 
developed innovations in the 

27089 economic growth and economic cohesion in an inclusive manner, in 
particular addressing weaknesses of the economy of the Member States, 
boosting the growth potential of the economy of the Member State 
concerned, stimulating job creation, and mitigating the adverse 
effects of the crisis; 

26418 European relevance structured in six pillars: 
(a) green transition; 
(b) digital transformation; 
(c) smart, sustainable and inclusive growth, including economic 
cohesion, j
29854 Property Rights (IPR) appli­
cations 
Innovations - 
Number of innovations 
resulting from the projects 
funded by the Programme 
(by type of innovation) 
including from awarded IPRs 
Economic growth - 
Creation, growth & market 

26649 contribute to strengthening the growth potential, job creation, and 
economic, social and institutional resilience of the Member State, 
contributing to the implemen

[None, None, None, None, None, None, None, None, None, None]

You do not need to, but you can now flashrank these results if needed. The Potion-retrieval model is a bge distilled model which is a fusion cross encoder withs trong retrieval capabilities.

In [17]:
from flashrank import Ranker, RerankRequest

In [18]:
max_token = int(db.q('select max(tokens) as m from content')[0]['m'])

In [19]:
ranker = Ranker(max_length=max_token+150)

In [22]:
res1=ranker.rerank(RerankRequest(q, res.map(lambda r: dict(text=r.content, id=r.id, meta=dict(doc_id=r.doc_id)))))

In [23]:
[print(r['id'], r['text']) for r in res1[:10]]

29595 
To address future societal challenges, embrace the opportunities of new tech­
nologies and contribute to environmentally friendly and sustainable economic 
growth, jobs, competitiveness and the well-being of Europe's citizens, there is 
the need to further strengthen Europe's capacity to innovate by: strengthening 

26649 contribute to strengthening the growth potential, job creation, and 
economic, social and institutional resilience of the Member State, 
contributing to the implementation of the European Pillar of Social 
Rights, including through the promotion of policies for children 
and the youth, and to mitigating the economic and social impac
26418 European relevance structured in six pillars: 
(a) green transition; 
(b) digital transformation; 
(c) smart, sustainable and inclusive growth, including economic 
cohesion, j
28129 6.2 Number of enterprises supported by stage (early, growth/expansion) 
6.3 Number of enterprises supported by Member State and region at NUTS 2 


[None, None, None, None, None, None, None, None, None, None]

Now, let's compare flashrank results and thedu rerank results.

In [25]:
[print(r1['id'], r2['id']) for r1,r2 in zip(res,res1)]

29855 29595
27089 26649
26418 26418
29854 28129
26649 27089
27088 21287
29595 27084
26590 29518
27084 29855
29537 24455
26591 28150
26648 26425
26428 29517
29538 26006
26419 17580
28129 26428
29850 24862
26425 27973
29518 28399
26424 29127
29068 28716
26006 22493
27430 24473
24862 26419
29517 26591
26993 26570
27076 29537
27973 24860
29852 24868
17580 29854
22493 25783
28160 29068
29732 27076
29470 24159
11996 27088
28399 26993
30544 29850
26014 27444
28150 14899
28145 30544
29127 29470
28341 11997
29469 2557
21287 26590
28161 29469
29733 26424
24473 29538
28716 11996
11997 26648
24455 28145
26570 3194
14899 25728
25728 26014
27406 25531
28992 28149
3194 27406
25531 27430
24159 28161
24868 29733
28149 24542
24566 28992
24863 27030
24860 29732
27030 28341
2557 24566
31356 24220
24542 24863
28448 28160
33126 16562
12052 31356
16562 33126
11463 29852
25783 28448
33125 12052
24220 11463
27444 33125


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

So you have it. a simple rag pipeline.