## Load Dataset

In [1]:
from rag.load import load_benchmark_corpus


# Load the evaluation benchmark together with the corpus it is evaluated against.
# `corpus` is used as a mapping further down: its `.values()` are the raw texts
# that get split into chunks (presumably keyed by source document — confirm in
# `rag.load`).
benchmark, corpus = load_benchmark_corpus()

## Split Into Chunks

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Split every corpus text into retrieval chunks.
text_splitter = RecursiveCharacterTextSplitter(
    # Separators are listed from coarsest to finest: paragraph break, line
    # break, sentence-ending punctuation, clause punctuation, a single space,
    # and finally the empty string as a last resort.
    separators=['\n\n', '\n', '!', '?', '.', ':', ';', ',', ' ', ''],
    chunk_size=500,
    chunk_overlap=0,  # consecutive chunks share no text
    add_start_index=True,  # adds a 'start_index' entry to each chunk's metadata
)
# `create_documents` is documented to take a list of strings; materialize the
# dict view rather than relying on it merely being sized and iterable.
documents = text_splitter.create_documents(list(corpus.values()))
# Preview the first few chunks as a sanity check.
documents[:3]

[Document(metadata={'start_index': 2}, page_content='At Fiverr we care about your privacy.\nWe do not sell or rent your personal information to third parties for their direct marketing purposes without your explicit consent.'),
 Document(metadata={'start_index': 173}, page_content='We do not disclose it to others except as disclosed in this Policy or required to provide you with the services of the Site and mobile applications, meaning - to allow you to buy, sell, share the information you want to share on the Site; to contribute on the forum; pay for products; post reviews and so on; or where we have a legal obligation to do so.'),
 Document(metadata={'start_index': 530}, page_content='We collect information that you provide us or voluntarily share with other users, and also some general technical information that is automatically gathered by our systems, such as IP address, browser information and cookies to enable you to have a better user experience and a more personalized browsing

## Embed Chunks

In [3]:
from sentence_transformers import SentenceTransformer
from transformers import BitsAndBytesConfig

from rag.embed import compute_similarities, get_query_strings


# Load the embedding model with 8-bit bitsandbytes quantization.
eight_bit_config = BitsAndBytesConfig(load_in_8bit=True)
model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-8B",
    model_kwargs=dict(quantization_config=eight_bit_config),
)

# Gather both sides of the comparison: the benchmark's query strings and the
# plain text of every chunk.
query_strings = get_query_strings(benchmark)
chunk_texts = [chunk.page_content for chunk in documents]

# Score queries against chunks with the project helper (the exact layout of the
# result is defined in `rag.embed` — TODO confirm).
similarities = compute_similarities(
    model,
    queries=query_strings,
    documents=chunk_texts,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [4]:
from rag.metrics import print_evaluations, similarities_to_ranks

# Convert the similarity scores into ranks (semantics defined in `rag.metrics`
# — TODO confirm), then print precision/recall at each cutoff and the AUC.
retrieval_ranks = similarities_to_ranks(similarities)
print_evaluations(benchmark, documents, retrieval_ranks)

precision @ 1 :  0.1909, recall @ 1 :  0.1033
precision @ 2 :  0.1503, recall @ 2 :  0.1618
precision @ 4 :  0.1388, recall @ 4 :  0.2662
precision @ 8 :  0.1060, recall @ 8 :  0.3915
precision @ 16:  0.0793, recall @ 16:  0.5567
precision @ 32:  0.0581, recall @ 32:  0.6823
precision @ 64:  0.0491, recall @ 64:  0.8487
AUC: 0.051332957284483435
