Install required packages

In [1]:
! pip install --quiet pyterrier[java]
! pip uninstall --quiet -y ir_datasets
! pip install --no-cache-dir --quiet git+https://github.com/JackMcKechnie/ir_datasets.git@sara

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m866.1/866.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.8/304.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.3/208.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.0/149.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [2]:
import requests
import zipfile
import io
import ir_datasets
import pyterrier as pt
import os
import pandas as pd
from tqdm import tqdm
from ir_measures import *

Get run files

In [3]:
url = "https://zenodo.org/records/18732768/files/runs.zip?download=1"

# Download and unpack the archive only when ./runs is not already present, so
# re-running the cell (or the whole notebook) does not fetch the same zip again.
if not os.path.isdir("./runs"):
    # A timeout keeps the cell from hanging forever if the server stops responding.
    response = requests.get(url, timeout=60)
    response.raise_for_status()  # stop if download fails

    # unzip and extract straight from memory
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        z.extractall("./")

    print("Downloaded run files and unzipped to ./runs")
else:
    print("Using existing run files in ./runs")

# load all runs
# os.listdir returns entries in arbitrary order; sorting makes the order of
# `names`/`runs` (and of anything built from them) reproducible.
# Entries whose name contains "checkpoints" are skipped.
files = sorted(f for f in os.listdir("./runs") if "checkpoints" not in f)

names = []
runs = []

for file in tqdm(files, desc="Loading runs"):
    names.append(file)
    runs.append(pt.io.read_results(f"./runs/{file}"))

Downloaded run files and unzipped to ./runs


Loading runs: 100%|██████████| 59/59 [00:14<00:00,  4.14it/s]


Run experiment!

In [4]:
# Load the "sara" collection through ir_datasets.
irds = ir_datasets.load("sara")

# Request each iterator once up front; the returned iterators are discarded here.
irds.docs_iter()
irds.qrels_iter()
irds.queries_iter()

# Build the qrels frame in one chain:
#   * relevance grade 2 is collapsed to 1, every other grade is kept as-is;
#   * columns are renamed to qid / docno / label.
qrels = (
    pd.DataFrame(irds.qrels_iter())
    .assign(
        relevance=lambda frame: frame["relevance"].apply(
            lambda grade: 1 if grade == 2 else grade
        )
    )
    .rename(columns={"query_id": "qid", "doc_id": "docno", "relevance": "label"})
)

# PyTerrier view of the same collection; the topics are read from it later.
dataset = pt.datasets.get_dataset("irds:sara")

[INFO] [starting] building docstore
[INFO] If you have a local copy of https://zenodo.org/records/18609870/files/sara_combined_docs.zip?download=1, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/e806b1d5ce35c94cec2899e190db7dd7
[INFO] [starting] https://zenodo.org/records/18609870/files/sara_combined_docs.zip?download=1
docs_iter:   0%|                                      | 0/1702 [00:00<?, ?doc/s]
https://zenodo.org/records/18609870/files/sara_combined_docs.zip?download=1: 0.0%| 0.00/51.8M [00:00<?, ?B/s][A
https://zenodo.org/records/18609870/files/sara_combined_docs.zip?download=1: 0.0%| 16.4k/51.8M [00:00<05:27, 158kB/s][A
https://zenodo.org/records/18609870/files/sara_combined_docs.zip?download=1: 0.1%| 49.2k/51.8M [00:00<03:42, 233kB/s][A
https://zenodo.org/records/18609870/files/sara_combined_docs.zip?download=1: 0.2%| 123k/51.8M [00:00<02:14, 385kB/s] [A
https://zenodo.org/records/18609870/files/sara_combined_docs.zip?download=1: 0.5%| 2

In [5]:
# Evaluate every loaded run against the qrels on the dataset's topics.
experiment_topics = dataset.get_topics()
experiment_measures = [nDCG@10, MRR, P@10, R@100]

exp = pt.Experiment(
    runs,
    experiment_topics,
    qrels,
    eval_metrics=experiment_measures,
    names=names,  # label each system with the file name it was loaded from
    round=4,
    verbose=True,
)

[INFO] If you have a local copy of https://raw.githubusercontent.com/JackMcKechnie/SARA-A-Collection-of-Sensitivity-Aware-Relevance-Assessments/main/repeated_queries.tsv, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/fc0247928a0b93bb344068fa238a5e3f
[INFO] [starting] https://raw.githubusercontent.com/JackMcKechnie/SARA-A-Collection-of-Sensitivity-Aware-Relevance-Assessments/main/repeated_queries.tsv
[INFO] [finished] https://raw.githubusercontent.com/JackMcKechnie/SARA-A-Collection-of-Sensitivity-Aware-Relevance-Assessments/main/repeated_queries.tsv: [00:00] [8.19kB] [12.5MB/s]


pt.Experiment:   0%|          | 0/59 [00:00<?, ?system/s]

In [6]:
# Display the results table produced by pt.Experiment (one row per run file).
exp

Unnamed: 0,name,RR,P@10,R@100,nDCG@10
0,bm25_1000_terrier_minilm.run,0.8165,0.6513,0.0664,0.6666
1,bm25_1000_terrier_monobert.run,0.7974,0.6727,0.0672,0.6785
2,tctcolbert_1000_npretriever_monot5.run,0.8492,0.7253,0.0753,0.7363
3,pl2_1000_terrier_monot5.run,0.8281,0.7073,0.0732,0.7168
4,ance_1000_npretriever_monot5.run,0.8617,0.718,0.0709,0.735
5,retromae_1000_npretriever_electra.run,0.8519,0.716,0.073,0.7243
6,pl2_1000_terrier.run,0.6923,0.5093,0.0503,0.5167
7,bm25_1000_terrier_monot5.run,0.8251,0.6887,0.0721,0.7002
8,retromae_1000_npretriever.run,0.7784,0.5567,0.0503,0.5774
9,retromae_1000_npretriever_monobert.run,0.8058,0.6907,0.069,0.6935


Let's run a search ourselves rather than relying on a run file. We can use a pre-built index hosted on Hugging Face for this:

In [7]:
index = pt.Artifact.from_hf('JackMcKechnie/sara.terrier')

# Materialise the document table once. Building it inside the pipeline stage would
# iterate over the whole collection again on every single search call.
sara_docs = pd.DataFrame(irds.docs_iter())


def _attach_doc_fields(run):
    """Left-join the document fields onto a retrieval result frame.

    Parameters
    ----------
    run : pandas.DataFrame
        Result frame with a ``docno`` column.

    Returns
    -------
    pandas.DataFrame
        ``run`` with the columns of ``sara_docs`` appended, matched on
        ``docno`` == ``doc_id``. Every row of ``run`` is kept (left join).
    """
    return run.merge(sara_docs, left_on="docno", right_on="doc_id", how="left")


# BM25 retrieval followed by the document-field join.
bm25 = index.bm25() >> pt.apply.generic(_attach_doc_fields)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


https://huggingface.co/datasets/JackMcKechnie/sara.terrier/resolve/main/artifact.tar.lz4:   0%|          | 0.0…

extracting data.direct.bf [12.8 MB]
extracting data.document.fsarrayfile [2.1 MB]
extracting data.inverted.bf [12.2 MB]
extracting data.lexicon.fsomapfile [13.1 MB]
extracting data.lexicon.fsomaphash [1017 B]
extracting data.lexicon.fsomapid [626.2 KB]
extracting data.meta-0.fsomapfile [8.0 MB]
extracting data.meta.idx [1014.2 KB]
extracting data.meta.zdata [4.1 MB]
extracting data.properties [4.2 KB]
extracting pt_meta.json [79 B]
terrier-assemblies 5.11 jar-with-dependencies not found, downloading to /root/.pyterrier...


https://repo1.maven.org/maven2/org/terrier/terrier-assemblies/5.11/terrier-assemblies-5.11-jar-with-dependenci…

Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...


https://repo1.maven.org/maven2/org/terrier/terrier-python-helper/0.0.8/terrier-python-helper-0.0.8.jar:   0%| …

Done


Java started (triggered by Retriever.__init__) and loaded: pyterrier.java.colab, pyterrier.java, pyterrier.java.24, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


In [8]:
# Text of the first result row returned for this query.
bm25.search("California energy crisis")["text"].iloc[0]

'Observations on the California Energy Crisis The White House Writers Group January 31st Gray Davis has announced that he will soon announce two energy czars to lead the state through the current crisis. ?Logical candidates include: Laurel and Hardy, Abbott and Costello, Beavis and Butthead, and Siegfried and Roy. The environmentalist Green Party is well-named. ?Thanks to its role in blocking construction of new power plants, Californians are forking over plenty of green for energy. California?s power industry is very Kennedyesque. ?Their message to consumers is, "Ask not what your energy company can do for you; ask what you can do for your energy company." Congress might get involved in California?s energy crisis. ?There?s nothing a drowning victim needs more than having an anchor thrown to him. It?s one thing for California politicians trying to solve the energy problem to go back to the drawing board, but do they have to ask taxpayers to buy them a new one? In California, lawmakers 

In [9]:
# Text of the first result row returned for this query.
bm25.search("Accounting fraud")["text"].iloc[0]

"Just to let you know. Fraud just called me about the case. they have received the paperwork and they are ready to credit the full amount to the account however instead I 'm going ot debit new account for $4504.30. And tomorrow fraud will credit new account for the same amount. This way we do not have to open the old account. In other words we trying to minimixe exposure as possible and follow procedures. again your account will be debited only temp. I wanted to share that with you in case you see it on your statement. any questions please call me yours, ameer. >>> <John.Griffith@enron.com> 03/19/01 01:49PM >>> I am sorry I haven't been by. I will come by this evening. Thanks for everything. John"