## Load Dataset

In [1]:
from lib.load import extract_data, load_benchmark_corpus


# Prepare and load the dataset; both helpers come from lib.load.
# NOTE(review): presumably extract_data() unpacks the raw dataset files on disk — confirm in lib.load.
extract_data()
# `corpus` is used below as a mapping (its .items()/.values() are read when chunking);
# `benchmark` is a sequence of test cases, each exposing a 'query' entry.
benchmark, corpus = load_benchmark_corpus()

## Split Into Chunks

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Split on progressively finer separators (paragraphs, lines, sentence
# punctuation, words, single characters) until a chunk fits in `chunk_size`.
# `add_start_index=True` records each chunk's offset in its source text under
# the 'start_index' metadata key.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '!', '?', '.', ':', ';', ',', ' ', ''],
    chunk_size=500,
    chunk_overlap=0,
    add_start_index=True,
)

# Materialise names and texts once so they stay aligned by position and an
# empty corpus simply yields no documents instead of failing to unpack.
names, texts = list(corpus.keys()), list(corpus.values())
metadatas = [{"source_file": name} for name in names]

documents = text_splitter.create_documents(texts, metadatas=metadatas)
documents[:3]

[Document(metadata={'source_file': 'Fiverr.txt', 'start_index': 2}, page_content='At Fiverr we care about your privacy.\nWe do not sell or rent your personal information to third parties for their direct marketing purposes without your explicit consent.'),
 Document(metadata={'source_file': 'Fiverr.txt', 'start_index': 173}, page_content='We do not disclose it to others except as disclosed in this Policy or required to provide you with the services of the Site and mobile applications, meaning - to allow you to buy, sell, share the information you want to share on the Site; to contribute on the forum; pay for products; post reviews and so on; or where we have a legal obligation to do so.'),
 Document(metadata={'source_file': 'Fiverr.txt', 'start_index': 530}, page_content='We collect information that you provide us or voluntarily share with other users, and also some general technical information that is automatically gathered by our systems, such as IP address, browser information and 

## Embed Chunks

In [3]:
import gc
import torch

from sentence_transformers import SentenceTransformer
from transformers import BitsAndBytesConfig


def compute_similarities(benchmark, documents):
    """Embed all queries and chunks and return their pairwise similarities.

    Loads the Qwen3 embedding model with 8-bit quantization, encodes every
    chunk prefixed with its source file name, encodes every benchmark query
    using the model's "query" prompt, and scores all pairs with
    ``model.similarity``. The model is dropped before returning so its memory
    can be reclaimed.

    Args:
        benchmark: iterable of test cases; each must support ``test['query']``.
        documents: chunk objects exposing ``.metadata["source_file"]`` and
            ``.page_content``.

    Returns:
        The result of ``model.similarity(query_embeddings, document_embeddings)``
        (queries first, documents second).
    """
    # Load model
    model = SentenceTransformer(
        "Qwen/Qwen3-Embedding-8B",
        model_kwargs={"quantization_config": BitsAndBytesConfig(load_in_8bit=True)}
    )
    # Compute embeddings. Single quotes inside the f-string keep this valid on
    # Python versions older than 3.12, which reject reusing the outer quote.
    document_embeddings = model.encode(
        [f"{document.metadata['source_file']}: {document.page_content}" for document in documents],
        show_progress_bar=True,
    )
    query_embeddings = model.encode(
        [test['query'] for test in benchmark],
        prompt_name="query",
        show_progress_bar=True,
    )
    # Compute similarity
    similarities = model.similarity(query_embeddings, document_embeddings)
    # Cleanup: drop the only reference to the model, then reclaim memory.
    del model
    cleanup()

    return similarities

def cleanup() -> None:
    """Reclaim memory held by objects that are no longer referenced.

    The Python garbage collector runs first so unreachable objects are
    destroyed; PyTorch is then asked to release its cached CUDA memory.
    """
    gc.collect()
    torch.cuda.empty_cache()

from pathlib import Path

# Embedding the whole corpus is expensive, so the similarity matrix is cached
# on disk: reuse the cache when it exists, otherwise compute it and save it.
SIM_CACHE_PATH = Path("data/sim_cache")

if SIM_CACHE_PATH.exists():
    # NOTE: torch.load unpickles the file — only load caches produced locally.
    similarities = torch.load(SIM_CACHE_PATH)
else:
    similarities = compute_similarities(benchmark, documents)
    SIM_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
    torch.save(similarities, SIM_CACHE_PATH)

In [4]:
import random

# Work on a fixed random subset of 20 benchmark queries; the seed makes the
# subset reproducible.
# NOTE: this rebinds `benchmark` and `similarities`, so re-running the cell
# samples from the already-reduced data rather than the full benchmark.
random.seed(1996)
idxs = random.sample(range(len(benchmark)), 20)

# Keep the similarity rows aligned with the sampled queries.
similarities = similarities[idxs]
benchmark = [benchmark[position] for position in idxs]

In [5]:
from lib.metrics import evaluate_rag_reranked

# Quick check on the first sampled query only: order chunks by embedding
# similarity (highest first) and evaluate at cutoff 4. The helper returns a
# (precision, recall) pair.
evaluate_rag_reranked(benchmark[0:1], documents, torch.argsort(similarities, descending=True), 4)

(0.5006165228113441, 0.1924170616113744)

In [6]:
# For every row of `similarities` (one row per sampled query), the chunk
# indices ordered from most to least similar.
ranks = torch.argsort(similarities, descending=True)

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


def format_prompts(query: str, instruction: str, documents: list[str]) -> list[str]:
    """Build one reranker prompt per document.

    Args:
        query: the question the documents are judged against.
        instruction: optional extra guidance appended after the query.
            ``None`` and the empty string both mean "no instruction".
        documents: candidate document texts.

    Returns:
        One prompt per document, in the same order as ``documents``.
    """
    # Treat None like "" so the literal text "None" never ends up in the prompt.
    suffix = f" {instruction}" if instruction else ""
    return [
        "Check whether a given document contains information helpful to answer the query.\n"
        f"<Document> {doc}\n<Query> {query}{suffix} ??"
        for doc in documents
    ]


model_path = "ContextualAI/ctxl-rerank-v2-instruct-multilingual-2b"

# Use CUDA with bfloat16 when a GPU is available; otherwise CPU with float32.
if torch.cuda.is_available():
    device, dtype = "cuda", torch.bfloat16
else:
    device, dtype = "cpu", torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
# Batched inputs need a pad token; fall back to EOS when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the left so that position -1 is the real last token of every prompt.
tokenizer.padding_side = "left"

# Load the 8-bit quantized reranker once; infer_w_hf reuses this module-level
# model and tokenizer for every query.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=dtype,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
model.eval()

def infer_w_hf(model_path: str, query: str, instruction: str, documents: list[str]):
    """Score documents against a query with the already-loaded reranker.

    Args:
        model_path: unused; kept so existing callers keep working. The
            module-level ``model`` and ``tokenizer`` are used instead.
        query: the question to rank documents for.
        instruction: optional extra guidance, forwarded to ``format_prompts``.
        documents: candidate document texts.

    Returns:
        A list of ``(score, index, document)`` tuples sorted by descending
        score, where ``index`` is the document's position in ``documents``.
        An empty list when ``documents`` is empty.
    """
    if len(documents) == 0:
        # Nothing to score; avoid handing an empty batch to the tokenizer.
        return []

    prompts = format_prompts(query, instruction, documents)
    enc = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    # Place the inputs wherever the model actually lives instead of guessing
    # from CUDA availability.
    device = model.device
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)

    with torch.no_grad():
        out = model(input_ids=input_ids, attention_mask=attention_mask)

    # Logits at the final position of every prompt.
    next_logits = out.logits[:, -1, :]  # [batch, vocab]

    # The score is the logit of vocabulary id 0, rounded through bfloat16
    # before conversion to Python floats.
    scores = next_logits[:, 0].to(torch.bfloat16).float().tolist()

    # Sort by score (descending); sorted() is stable, so ties keep input order.
    results = sorted(
        ((score, position, documents[position]) for position, score in enumerate(scores)),
        key=lambda item: item[0],
        reverse=True,
    )
    return results

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [35]:
from tqdm.notebook import tqdm


model_path = "ContextualAI/ctxl-rerank-v2-instruct-multilingual-2b"

# For each query, rerank only the 32 chunks the embedding model ranked highest.
# Each candidate is prefixed with its source file name.
results = []
for test, doc_idxs in tqdm(
    zip(benchmark, ranks),
    total=min(len(benchmark), len(ranks)),
):
    candidates = [documents[int(doc_idx)] for doc_idx in doc_idxs[:32]]
    result = infer_w_hf(
        model_path,
        query=test['query'],
        instruction='',
        documents=[
            f"{candidate.metadata['source_file']}: {candidate.page_content}"
            for candidate in candidates
        ],
    )
    results.append(result)

  0%|          | 0/20 [00:00<?, ?it/s]

In [36]:
# Translate reranker output back to global chunk indices in `documents`.
# Each result entry is (score, relative_idx, content), where `relative_idx` is
# the position within the candidate list, i.e. within the head of that query's
# row in `ranks`. Indexing the row directly avoids repeating the candidate
# count used when the candidates were selected.
reranks = [
    [int(ranks[query_idx, relative_idx]) for _score, relative_idx, _content in result]
    for query_idx, result in enumerate(results)
]

In [45]:
from lib.metrics import evaluate_rag, evaluate_rag_reranked

# Report precision/recall at several cutoffs, first for the raw embedding
# ranking and then for the reranked candidates, using identical formatting.
for heading, evaluate, ranking in (
    ("Baseline evaluation", evaluate_rag, similarities),
    ("\nReranked evaluation", evaluate_rag_reranked, reranks),
):
    print(heading)
    for k in (1, 2, 4, 8, 16, 32):
        precision, recall = evaluate(benchmark, documents, ranking, k)
        print(f"precision @ {k:<2}: {precision:7.4f}, recall @ {k:<2}: {recall:7.4f}")

Baseline evaluation
precision @ 1 :  0.3624, recall @ 1 :  0.1315
precision @ 2 :  0.2802, recall @ 2 :  0.2547
precision @ 4 :  0.2459, recall @ 4 :  0.3286
precision @ 8 :  0.1869, recall @ 8 :  0.4625
precision @ 16:  0.1322, recall @ 16:  0.5996
precision @ 32:  0.0933, recall @ 32:  0.7629

Reranked evaluation
precision @ 1 :  0.4750, recall @ 1 :  0.2364
precision @ 2 :  0.4327, recall @ 2 :  0.3167
precision @ 4 :  0.3335, recall @ 4 :  0.4449
precision @ 8 :  0.2444, recall @ 8 :  0.5929
precision @ 16:  0.1630, recall @ 16:  0.6969
precision @ 32:  0.0933, recall @ 32:  0.7629
