In [None]:
# Download and extract the MS MARCO passage dataset.
# `-p` creates the parent `collections/` directory when it is missing and keeps
# this cell re-runnable (plain `mkdir` errors out if the directory already exists).
mkdir -p collections/msmarco-passage

# Fetch the archive into the dataset directory (-P sets the download prefix).
wget https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz -P collections/msmarco-passage

# Unpack the archive next to the tarball; later cells read collection.tsv,
# queries.dev.tsv and qrels.dev.small.tsv from this directory.
tar xvfz collections/msmarco-passage/collectionandqueries.tar.gz -C collections/msmarco-passage

In [None]:
# Turn the MS MARCO passage collection (TSV) into the JSON-lines files that
# Anserini ingests.
python tools/scripts/msmarco/convert_collection_to_jsonl.py \
    --collection-path \
        collections/msmarco-passage/collection.tsv \
    --output-folder \
        collections/msmarco-passage/collection_jsonl

In [None]:
# Indexing: build a Lucene index with Anserini, reading the JSON-lines
# documents as a JsonCollection.
sh target/appassembler/bin/IndexCollection \
    -threads 9 \
    -collection JsonCollection \
    -generator DefaultLuceneDocumentGenerator \
    -input collections/msmarco-passage/collection_jsonl \
    -index indexes/msmarco-passage/lucene-index-msmarco \
    -storePositions \
    -storeDocvectors \
    -storeRaw

In [None]:
# Retrieval, step 1: restrict the dev queries to those that appear in the
# small dev qrels file.
python tools/scripts/msmarco/filter_queries.py \
    --qrels \
        collections/msmarco-passage/qrels.dev.small.tsv \
    --queries \
        collections/msmarco-passage/queries.dev.tsv \
    --output \
        collections/msmarco-passage/queries.dev.small.tsv

In [None]:
# Retrieval, step 2: run the filtered queries against the index.
# Ranking uses BM25 with the tuned parameters k1=0.82, b=0.68; up to 1000 hits
# per query are written in MS MARCO run format.
sh target/appassembler/bin/SearchCollection \
    -hits 1000 \
    -parallelism 4 \
    -index indexes/msmarco-passage/lucene-index-msmarco \
    -topicreader TsvInt \
    -topics collections/msmarco-passage/queries.dev.small.tsv \
    -output runs/run.msmarco-passage.dev.small.tsv \
    -format msmarco \
    -bm25 -bm25.k1 0.82 -bm25.b 0.68

In [None]:
# Score the run file against the small dev qrels with the MS MARCO passage
# evaluation script (arguments: qrels first, then the run).
python tools/scripts/msmarco/msmarco_passage_eval.py \
    collections/msmarco-passage/qrels.dev.small.tsv \
    runs/run.msmarco-passage.dev.small.tsv

In [None]:
# TREC evaluation, step 1: rewrite both the run and the qrels in TREC format.

# Run file: MS MARCO tsv -> TREC run.
python tools/scripts/msmarco/convert_msmarco_to_trec_run.py \
    --input \
        runs/run.msmarco-passage.dev.small.tsv \
    --output \
        runs/run.msmarco-passage.dev.small.trec

# Relevance judgments: MS MARCO tsv -> TREC qrels.
python tools/scripts/msmarco/convert_msmarco_to_trec_qrels.py \
    --input \
        collections/msmarco-passage/qrels.dev.small.tsv \
    --output \
        collections/msmarco-passage/qrels.dev.small.trec

In [None]:
# TREC evaluation, step 2: run trec_eval on the converted qrels and run files,
# requesting the recall.1000 and map measures.
tools/eval/trec_eval.9.0.4/trec_eval \
    -c -mrecall.1000 -mmap \
    collections/msmarco-passage/qrels.dev.small.trec \
    runs/run.msmarco-passage.dev.small.trec