## Load Dataset

In [1]:
from lib.load import extract_data, load_benchmark_corpus


extract_data()
benchmark, corpus = load_benchmark_corpus()

## Split Into Chunks

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '!', '?', '.', ':', ';', ',', ' ', ''],
    chunk_size=500,
    chunk_overlap=0,
    add_start_index=True,
)
documents = text_splitter.create_documents(corpus.values())
documents[:3]

[Document(metadata={'start_index': 2}, page_content='At Fiverr we care about your privacy.\nWe do not sell or rent your personal information to third parties for their direct marketing purposes without your explicit consent.'),
 Document(metadata={'start_index': 173}, page_content='We do not disclose it to others except as disclosed in this Policy or required to provide you with the services of the Site and mobile applications, meaning - to allow you to buy, sell, share the information you want to share on the Site; to contribute on the forum; pay for products; post reviews and so on; or where we have a legal obligation to do so.'),
 Document(metadata={'start_index': 530}, page_content='We collect information that you provide us or voluntarily share with other users, and also some general technical information that is automatically gathered by our systems, such as IP address, browser information and cookies to enable you to have a better user experience and a more personalized browsing

## Embed Chunks

In [3]:
from sentence_transformers import SentenceTransformer
from transformers import BitsAndBytesConfig

model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-8B",
    model_kwargs={
        "quantization_config": BitsAndBytesConfig(load_in_8bit=True)}
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
document_embeddings = model.encode(
    [document.page_content for document in documents],
    show_progress_bar=True,
)

query_embeddings = model.encode(
    [test['query'] for test in benchmark],
    prompt_name="query",
    show_progress_bar=True,
)
similarities = model.similarity(query_embeddings, document_embeddings)

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [10]:
import torch

from lib.metrics import precision_recall


def evaluate_rag(similarities, topk):
    precision = recall = 0
    count = 0
    indices = torch.argsort(similarities, descending=True)[:, :topk]
    for test_idx, document_idxs in enumerate(indices):
        # Compute spans
        spans_true = []
        for snippet in benchmark[test_idx]["snippets"]:
            spans_true.append(snippet["span"])
        spans_pred = []
        for idx in document_idxs:
            document = documents[idx]
            start = document.metadata["start_index"]
            length = len(document.page_content)
            spans_pred.append((start, start + length))
        # Compute precision and recall
        p, r = precision_recall(spans_true, spans_pred)
        # Update accumulators
        precision += p
        recall += r
        count += 1
    return precision / count, recall / count

precision, recall = evaluate_rag(similarities, topk=4)
print(f"precision: {precision}, recall: {recall}")

precision: 0.13880909727731075, recall: 0.2661742755955439
