In [1]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import logging
import pathlib, os

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#### Download scifact.zip dataset and unzip the dataset
dataset = "nfcorpus"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
out_dir = os.path.join("data")
data_path = util.download_and_unzip(url, out_dir)

#### Provide the data_path where scifact has been downloaded and unzipped
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

length = None
corpus = {k: v for k, v in list(corpus.items())[:length]}
queries = {k: v for k, v in list(queries.items())[:10]}
qrels = {k: v for k, v in list(qrels.items())[:length]}

  from tqdm.autonotebook import tqdm


2024-08-26 11:25:19 - Downloading nfcorpus.zip ...


data/nfcorpus.zip:   0%|          | 0.00/2.34M [00:00<?, ?iB/s]

2024-08-26 11:25:21 - Unzipping nfcorpus.zip ...
2024-08-26 11:25:21 - Loading Corpus...


  0%|          | 0/3633 [00:00<?, ?it/s]

2024-08-26 11:25:21 - Loaded 3633 TEST Documents.
2024-08-26 11:25:21 - Doc Example: {'text': 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years) 6,011 participants die

In [31]:
#### Load the SBERT model and retrieve using cosine-similarity
model = DRES(models.SentenceBERT("all-MiniLM-L6-v2"), batch_size=16)
retriever = EvaluateRetrieval(model, score_function="dot") # or "cos_sim" for cosine similarity
results = retriever.retrieve(corpus, queries)

#### Evaluate your model with NDCG@k, MAP@K, Recall@K and Precision@K  where k = [1,3,5,10,100,1000] 
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)

2024-08-24 23:09:52 - Use pytorch device_name: mps
2024-08-24 23:09:52 - Load pretrained SentenceTransformer: all-MiniLM-L6-v2




2024-08-24 23:09:54 - Encoding Queries...


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.16s/it]


2024-08-24 23:09:55 - Sorting Corpus by document length (Longest first)...
2024-08-24 23:09:55 - Scoring Function: Dot Product (dot)
2024-08-24 23:09:55 - Encoding Batch 1/1...


Batches: 100%|██████████| 228/228 [00:17<00:00, 13.36it/s]


2024-08-24 23:10:13 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-08-24 23:10:13 - 

2024-08-24 23:10:13 - NDCG@1: 0.3000
2024-08-24 23:10:13 - NDCG@3: 0.3756
2024-08-24 23:10:13 - NDCG@5: 0.3396
2024-08-24 23:10:13 - NDCG@10: 0.3579
2024-08-24 23:10:13 - NDCG@100: 0.2929
2024-08-24 23:10:13 - NDCG@1000: 0.4221
2024-08-24 23:10:13 - 

2024-08-24 23:10:13 - MAP@1: 0.0107
2024-08-24 23:10:13 - MAP@3: 0.0284
2024-08-24 23:10:13 - MAP@5: 0.0372
2024-08-24 23:10:13 - MAP@10: 0.0693
2024-08-24 23:10:13 - MAP@100: 0.1051
2024-08-24 23:10:13 - MAP@1000: 0.1223
2024-08-24 23:10:13 - 

2024-08-24 23:10:13 - Recall@1: 0.0107
2024-08-24 23:10:13 - Recall@3: 0.0332
2024-08-24 23:10:13 - Recall@5: 0.0473
2024-08-24 23:10:13 - Recall@10: 0.0989
2024-08-24 23:10:13 - Recall@100: 0.2482
2024-08-24 23:10:13 - Recall@1000: 0.5796
2024-08-24 23:10:13 - 

2024-08-24 23:10:13 - P@1: 0.5000
2024-08-24 23:10:13

In [32]:
qrels

{'PLAIN-2': {'MED-2427': 2,
  'MED-10': 2,
  'MED-2429': 2,
  'MED-2430': 2,
  'MED-2431': 2,
  'MED-14': 2,
  'MED-2432': 2,
  'MED-2428': 1,
  'MED-2440': 1,
  'MED-2434': 1,
  'MED-2435': 1,
  'MED-2436': 1,
  'MED-2437': 1,
  'MED-2438': 1,
  'MED-2439': 1,
  'MED-3597': 1,
  'MED-3598': 1,
  'MED-3599': 1,
  'MED-4556': 1,
  'MED-4559': 1,
  'MED-4560': 1,
  'MED-4828': 1,
  'MED-4829': 1,
  'MED-4830': 1},
 'PLAIN-12': {'MED-2513': 2,
  'MED-5237': 2,
  'MED-2517': 2,
  'MED-2518': 2,
  'MED-2519': 2,
  'MED-2520': 2,
  'MED-2521': 2,
  'MED-2514': 1,
  'MED-2943': 1,
  'MED-5322': 1,
  'MED-5323': 1,
  'MED-5324': 1,
  'MED-5325': 1,
  'MED-5326': 1,
  'MED-5327': 1,
  'MED-5328': 1,
  'MED-5329': 1,
  'MED-5330': 1,
  'MED-5331': 1,
  'MED-5332': 1,
  'MED-5333': 1,
  'MED-5334': 1,
  'MED-5335': 1,
  'MED-5363': 1,
  'MED-5337': 1,
  'MED-5338': 1,
  'MED-5339': 1,
  'MED-5340': 1,
  'MED-5341': 1,
  'MED-5342': 1},
 'PLAIN-23': {'MED-2644': 2,
  'MED-2646': 2,
  'MED-2651': 2

In [33]:
results

{'PLAIN-2': {'MED-5027': 0.24864770472049713,
  'MED-905': 0.2768922448158264,
  'MED-1100': 0.24872131645679474,
  'MED-3306': 0.2928038537502289,
  'MED-2583': 0.27690908312797546,
  'MED-2363': 0.260562926530838,
  'MED-1739': 0.2488238364458084,
  'MED-4929': 0.3037835359573364,
  'MED-2160': 0.29308316111564636,
  'MED-1338': 0.2844192385673523,
  'MED-4821': 0.27692779898643494,
  'MED-2781': 0.2676653265953064,
  'MED-2606': 0.26062384247779846,
  'MED-2228': 0.2522867023944855,
  'MED-2787': 0.24888519942760468,
  'MED-2808': 0.3086799085140228,
  'MED-1395': 0.3038458526134491,
  'MED-2304': 0.298367977142334,
  'MED-3386': 0.2932829260826111,
  'MED-1437': 0.28963610529899597,
  'MED-2407': 0.284761905670166,
  'MED-2065': 0.2812482416629791,
  'MED-2409': 0.277045339345932,
  'MED-2305': 0.2728288471698761,
  'MED-5326': 0.2677273452281952,
  'MED-4195': 0.2637637257575989,
  'MED-4492': 0.26065441966056824,
  'MED-3473': 0.25663718581199646,
  'MED-3606': 0.2522948086261749

In [34]:
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="train")

length = None
corpus = {k: v for k, v in list(corpus.items())[:length]}
queries = {k: v for k, v in list(queries.items())[:10]}
qrels = {k: v for k, v in list(qrels.items())[:length]}

2024-08-24 23:20:56 - Loading Corpus...


100%|██████████| 3633/3633 [00:00<00:00, 55489.26it/s]

2024-08-24 23:20:56 - Loaded 3633 TRAIN Documents.
2024-08-24 23:20:56 - Doc Example: {'text': 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years) 6,011 participants di




In [35]:
corpus

{'MED-10': {'text': 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years) 6,011 participants died, of which 3,619 (60.2%) was due to breast cancer. After adjustment for a