## Load Dataset

In [1]:
import os
import zipfile

ZIP_PATH = './data/LegalBench-RAG.zip'
DATA_PATH = './data/LegalBench-RAG/'

if not os.listdir(DATA_PATH):
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        for name in zip_ref.namelist():
            if name.endswith('/'):
                continue
            zip_ref.extract(name, path=DATA_PATH)

In [37]:
import json
import pprint


BENCHMARK_PRIVACY_QA_PATH = './data/LegalBench-RAG/benchmarks/privacy_qa.json'
CORPUS_PATH = './data/LegalBench-RAG/corpus/privacy_qa/'


with open(BENCHMARK_PRIVACY_QA_PATH) as f:
    benchmark_privacy_qa = json.load(f)['tests']
print("Benchmark Test Example:")
print(json.dumps(benchmark_privacy_qa[0], indent=2))
print()

corpus = {}
for document in os.listdir(CORPUS_PATH):
    with open(os.path.join(CORPUS_PATH, document)) as f:
        corpus[document] = f.read()
print("Corpus Example (Fiverr.txt / first 1000 characters):")
print(corpus['Fiverr.txt'][:1000])

Benchmark Test Example:
{
  "query": "Consider \"Fiverr\"'s privacy policy; who can see which tasks i hire workers for?",
  "snippets": [
    {
      "file_path": "privacy_qa/Fiverr.txt",
      "span": [
        3873,
        4312
      ],
      "answer": "  In addition, we collect information while you access, browse, view or otherwise use the Site.\nIn other words, when you access the Site we are aware of your usage of the Site, and may gather, collect and record the information relating to such usage, including geo-location information, IP address, device and connection information, browser information and web-log information, and all communications recorded by Users through the Site.\n"
    }
  ]
}

Corpus Example (Fiverr.txt / first 1000 characters):
  At Fiverr we care about your privacy.
We do not sell or rent your personal information to third parties for their direct marketing purposes without your explicit consent.
We do not disclose it to others except as disclosed in this P

## Split Into Chunks

In [29]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '!', '?', '.', ':', ';', ',', ' ', ''],
    chunk_size=500,
    chunk_overlap=0,
    add_start_index=True,
)

names, texts = zip(*corpus.items())
metadatas = [
    {"source_file": name}
    for idx, name in enumerate(names)
]
documents = text_splitter.create_documents(texts, metadatas=metadatas)
documents[:3]

[Document(metadata={'source_file': 'Fiverr.txt', 'start_index': 2}, page_content='At Fiverr we care about your privacy.\nWe do not sell or rent your personal information to third parties for their direct marketing purposes without your explicit consent.'),
 Document(metadata={'source_file': 'Fiverr.txt', 'start_index': 173}, page_content='We do not disclose it to others except as disclosed in this Policy or required to provide you with the services of the Site and mobile applications, meaning - to allow you to buy, sell, share the information you want to share on the Site; to contribute on the forum; pay for products; post reviews and so on; or where we have a legal obligation to do so.'),
 Document(metadata={'source_file': 'Fiverr.txt', 'start_index': 530}, page_content='We collect information that you provide us or voluntarily share with other users, and also some general technical information that is automatically gathered by our systems, such as IP address, browser information and 

## Embed Chunks

In [4]:
from sentence_transformers import SentenceTransformer
from transformers import BitsAndBytesConfig

model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-8B",
    model_kwargs={
        "quantization_config": BitsAndBytesConfig(load_in_8bit=True)
    }
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [30]:
document_embeddings = model.encode(
    [f"{document.metadata['source_file']}: {document.page_content}" for document in documents],
    show_progress_bar=True,
)

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

In [8]:
benchmark = [
    test for test in benchmark_privacy_qa
    # if test['snippets'][0]['file_path'] == 'privacy_qa/Fiverr.txt'
]

query_embeddings = model.encode([test['query'] for test in benchmark], prompt_name="query")

In [31]:
similarities = model.similarity(query_embeddings, document_embeddings)
similarities.shape

torch.Size([194, 461])

In [23]:
def precision_recall(spans_true, spans_pred):
    true = as_indices(spans_true)
    pred = as_indices(spans_pred)

    tp = len(true & pred)
    fp = len(pred - true)
    fn = len(true - pred)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    return precision, recall
    

def as_indices(spans):
    indices = set()
    for start, end in spans:
        indices.update(range(start, end))
    return indices

In [32]:
import torch


TOPK = 4

precision = recall = 0
count = 0

indices = torch.argsort(similarities, descending=True)[:, :TOPK]
for test_idx, document_idxs in enumerate(indices):
    test = benchmark[test_idx]

    spans_true = []
    for snippet in benchmark[test_idx]["snippets"]:
        spans_true.append(snippet["span"])

    spans_pred = []
    for idx in document_idxs:
        document = documents[idx]
        start = document.metadata["start_index"]
        length = len(document.page_content)
        spans_pred.append((start, start + length))

    p, r = precision_recall(spans_true, sp ans_pred)
    
    precision += p
    recall += r
    count += 1

print(f"precision: {precision / count}, recall: {recall / count}")

precision: 0.17581403203871157, recall: 0.3731475664837389


In [35]:
pprint.pprint(benchmark[0])

{'query': 'Consider "Fiverr"\'s privacy policy; who can see which tasks i hire '
          'workers for?',
 'snippets': [{'answer': '  In addition, we collect information while you '
                         'access, browse, view or otherwise use the Site.\n'
                         'In other words, when you access the Site we are '
                         'aware of your usage of the Site, and may gather, '
                         'collect and record the information relating to such '
                         'usage, including geo-location information, IP '
                         'address, device and connection information, browser '
                         'information and web-log information, and all '
                         'communications recorded by Users through the Site.\n',
               'file_path': 'privacy_qa/Fiverr.txt',
               'span': [3873, 4312]}]}


In [37]:
for idx in indices[0]:
    print(documents[idx])

page_content='If for any reason you have a problem with deleting your personal information please contact Fiverr's Customer Support and we will make reasonable efforts to delete any such information pursuant to any applicable privacy laws.
  You can review and change your personal information by logging into the Site and visiting your account profile page.' metadata={'start_index': 23852}
page_content='By accessing and/or using the Site and its related sites, applications, services, goods and/or registering for a Fiverr account, you agree to the terms and conditions of this Policy, including to our collection, use, disclosure, processing and retention of your personal information.
You can also learn how to limit sharing of information in this Policy.' metadata={'start_index': 3003}
page_content='We will process any requests in line with any local laws and our policies and procedures.
If you do not have an active Fiverr account, please contact us at privacy@fiverr.com.
  If you have any