#### **BioASQ BM25**

This notebook preprocesses BioASQ (part of BEIR) & runs BM25 on it.

##### Setup BioASQ dataset

In [1]:
# Download the Kaggle dataset "bioasq" as ./bioasq.zip and extract it into the working directory.
# NOTE(review): presumably this archive provides allMeSH_2020/allMeSH_2020.json, which the
# preprocessing cell reads - confirm against the full unzip listing.
!kaggle datasets download -d bioasq
!unzip ./bioasq.zip

Downloading bioasq.zip to /home/repos/semanticsearch
100%|█████████████████████████████████████▉| 7.93G/7.93G [25:04<00:00, 5.78MB/s]
100%|██████████████████████████████████████| 7.93G/7.93G [25:04<00:00, 5.66MB/s]
Archive:  ./bioasq.zip
  inflating: BioASQ-TaskB-testData/BioASQ-TaskB-testData/README  
  inflating: BioASQ-TaskB-testData/BioASQ-TaskB-testData/phaseA_1b_01.json  
  inflating: BioASQ-TaskB-testData/BioASQ-TaskB-testData/phaseA_1b_02.json  
  inflating: BioASQ-TaskB-testData/BioASQ-TaskB-testData/phaseA_1b_03.json  
  inflating: BioASQ-TaskB-testData/BioASQ-TaskB-testData/phaseA_2b_01.json  
  inflating: BioASQ-TaskB-testData/BioASQ-TaskB-testData/phaseA_2b_02.json  
  inflating: BioASQ-TaskB-testData/BioASQ-TaskB-testData/phaseA_2b_03.json  
  inflating: BioASQ-TaskB-testData/BioASQ-TaskB-testData/phaseA_2b_04.json  
  inflating: BioASQ-TaskB-testData/BioASQ-TaskB-testData/phaseA_2b_05.json  
  inflating: BioASQ-TaskB-testData/BioASQ-TaskB-testData/phaseA_3b_01.json  
  i

In [2]:
# Download the Kaggle dataset "bioasqtraintest" as ./bioasqtraintest.zip and extract it.
# It unpacks BioASQ-training8b/ (training8b.json) and Task8BGoldenEnriched/ (8B*_golden.json),
# which the query/qrels cell reads.
!kaggle datasets download -d bioasqtraintest
!unzip ./bioasqtraintest.zip

Downloading bioasqtraintest.zip to /home/repos/semanticsearch
 90%|██████████████████████████████████▏   | 4.00M/4.45M [00:00<00:00, 5.79MB/s]
100%|██████████████████████████████████████| 4.45M/4.45M [00:00<00:00, 5.83MB/s]
Archive:  ./bioasqtraintest.zip
  inflating: BioASQ-training8b/README  
  inflating: BioASQ-training8b/training8b.json  
  inflating: Task8BGoldenEnriched/Task8BGoldenEnriched/8B1_golden.json  
  inflating: Task8BGoldenEnriched/Task8BGoldenEnriched/8B2_golden.json  
  inflating: Task8BGoldenEnriched/Task8BGoldenEnriched/8B3_golden.json  
  inflating: Task8BGoldenEnriched/Task8BGoldenEnriched/8B4_golden.json  
  inflating: Task8BGoldenEnriched/Task8BGoldenEnriched/8B5_golden.json  


In [16]:
### BioASQ preprocessing ###
import csv
import jsonlines
import json

# Markers that precede each field inside a raw record line of allMeSH_2020.json.
# Fields are cut out by position, so the code relies on the order
# abstractText -> pmid -> title within a line.
start_txt = '"abstractText":"'
start_pmid = '","pmid":"'
start_title = '","title":"'


with jsonlines.open('corpus.jsonl', 'w') as jsonl_out:

    with open("allMeSH_2020/allMeSH_2020.json", "r", encoding="utf8", errors="ignore") as f:
        # As the JSON file is too large for memory (19GB), iterate line by line
        for i, line in enumerate(f):

            # first line does not contain data
            if i == 0:
                continue

            # Find the indices we need
            txt_idx = line.find(start_txt)
            pmid_idx = line.find(start_pmid)
            title_idx = line.find(start_title)

            # All three markers are required: title_idx is used as the end of the pmid slice,
            # so a missing title marker would otherwise silently corrupt both pmid and title.
            assert (txt_idx != -1) and (pmid_idx != -1) and (title_idx != -1), (
                f"Could not find Txt: {txt_idx} Pmid: {pmid_idx} Title: {title_idx} (line {i})"
            )

            # NOTE(review): values are sliced verbatim from the raw line, so JSON escape
            # sequences inside them are not decoded.
            abstractText = line[txt_idx + len(start_txt) : pmid_idx]
            pmid = line[pmid_idx + len(start_pmid) : title_idx]

            title = line[title_idx + len(start_title):]
            # Preferred terminator: trailing period + end of record (the period is dropped)
            title_end = title.find('."}')
            if title_end == -1:
                # Title without a trailing period: cut at the closing '"}' of the record
                # instead of letting find()'s -1 slice off just the last character.
                title_end = title.rfind('"}')
            title = title[:title_end] if title_end != -1 else title.rstrip()

            record = {"_id": pmid, "text": abstractText, "title": title}
            jsonl_out.write(record)

    # Add manual fixes provided by BEIR authors
    # newline="" is what the csv module expects so quoted fields with embedded newlines parse correctly
    with open("manual-fixes.csv", "r", newline="") as f:
        for row in csv.reader(f):
            # csv.reader yields [] for blank lines; skip them instead of raising IndexError
            if not row:
                continue
            # Columns: ID, TITLE, TEXT
            record = {"_id": row[0], "text": row[2], "title": row[1]}
            jsonl_out.write(record)

In [8]:
### If you want to benchmark on the BioASQ train dataset [This is not the one used in BEIR] 
def use_train():
    """Build queries and qrels from the BioASQ training set instead of the test set.

    Reads ``BioASQ-training8b/training8b.json`` and writes two files into the
    working directory:

    * ``queries.jsonl`` - one ``{"_id": ..., "text": ...}`` object per question
    * ``test.tsv`` - a header line followed by one
      ``query-id<TAB>corpus-id<TAB>score`` row per (question, document) pair,
      with the score always 1
    """
    with open("BioASQ-training8b/training8b.json", "r") as training_file, \
            jsonlines.open("queries.jsonl", "w") as query_writer, \
            open("test.tsv", "w") as qrels_file:
        qrels_file.write("query-id\tcorpus-id\tscore\n")
        training_set = json.load(training_file)

        for question in training_set["questions"]:
            question_id = question["id"]
            query_writer.write({"_id": question_id, "text": question["body"]})

            for doc_ref in question["documents"]:
                # The corpus id is the last "/"-separated segment of the document reference
                corpus_id = doc_ref.split("/")[-1]
                qrels_file.write(f"{question_id}\t{corpus_id}\t1\n")
                        
                        
### Test dataset with 500 queries as used in BEIR
# os is imported here because this cell uses it before the later server cell imports it;
# without this a fresh kernel fails with NameError.
import os

test_data_path = "Task8BGoldenEnriched/Task8BGoldenEnriched"
with jsonlines.open('queries.jsonl', 'w') as queries_out:
    with open("test.tsv", "w") as record_file:
        # Write header
        record_file.write("query-id\tcorpus-id\tscore\n")
        # os.listdir() returns entries in arbitrary order; sort so the output is reproducible
        for test_json in sorted(os.listdir(test_data_path)):
            # Skip anything that is not a JSON file (e.g. stray hidden files)
            if not test_json.endswith(".json"):
                continue

            with open(os.path.join(test_data_path, test_json), "r") as content:
                queries_answers = json.load(content)

            for query in queries_answers["questions"]:
                query_line = {"_id": query["id"], "text": query["body"]}
                queries_out.write(query_line)

                for doc in query["documents"]:
                    # The corpus id is the last "/"-separated segment of the document reference
                    doc_id = doc.split("/")[-1]
                    relevance = 1

                    record_file.write(f"{query['id']}\t{doc_id}\t{relevance}\n")


In [17]:
# Collect the generated files under bioasq/ (qrels in bioasq/qrels/), the data path that is
# later passed to run_bm25. -p creates both levels and does not fail when they already exist.
!mkdir -p bioasq/qrels

!mv ./queries.jsonl bioasq/
!mv ./corpus.jsonl bioasq/
!mv ./test.tsv bioasq/qrels/test.tsv

##### Run with BM25

In [2]:
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.0.0-linux-x86_64.tar.gz -q
!tar -xzf elasticsearch-7.0.0-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-7.0.0

chown: changing ownership of 'elasticsearch-7.0.0/jdk/release': Operation not permitted
chown: changing ownership of 'elasticsearch-7.0.0/jdk/jmods/jdk.xml.dom.jmod': Operation not permitted
chown: changing ownership of 'elasticsearch-7.0.0/jdk/jmods/jdk.net.jmod': Operation not permitted
chown: changing ownership of 'elasticsearch-7.0.0/jdk/jmods/jdk.jstatd.jmod': Operation not permitted
chown: changing ownership of 'elasticsearch-7.0.0/jdk/jmods/jdk.jdwp.agent.jmod': Operation not permitted
chown: changing ownership of 'elasticsearch-7.0.0/jdk/jmods/java.security.jgss.jmod': Operation not permitted
chown: changing ownership of 'elasticsearch-7.0.0/jdk/jmods/jdk.jcmd.jmod': Operation not permitted
chown: changing ownership of 'elasticsearch-7.0.0/jdk/jmods/jdk.internal.le.jmod': Operation not permitted
chown: changing ownership of 'elasticsearch-7.0.0/jdk/jmods/java.smartcardio.jmod': Operation not permitted
chown: changing ownership of 'elasticsearch-7.0.0/jdk/jmods/java.sec

SubprocessError: Exception occurred in preexec_fn.

In [None]:
# start server
import os
from subprocess import Popen, PIPE, STDOUT
# When the notebook runs as root, the server process switches to uid 1 ("daemon") before exec.
# os.setuid() requires root: as a normal user it raises inside the child and Popen reports
# "SubprocessError: Exception occurred in preexec_fn.", so privileges are only dropped when
# we actually are root; otherwise the server simply runs as the current user.
drop_privileges = (lambda: os.setuid(1)) if os.geteuid() == 0 else None

# NOTE(review): stdout/stderr go to a pipe that this notebook never reads; if the server
# logs enough to fill the pipe buffer it can block - consider redirecting to a log file.
es_server = Popen(['elasticsearch-7.0.0/bin/elasticsearch'],
                  stdout=PIPE, stderr=STDOUT,
                  preexec_fn=drop_privileges
                 )

In [8]:
# Wait a bit for the server to come up, then test it - the response should contain
# something like "cluster_name" : "elasticsearch".
#!curl -X GET "localhost:9200/"
# Alternative way to check that Elasticsearch is reachable, without using curl:
# wget saves the response as index.html, which is printed and then removed again.
!wget "localhost:9200/"
!cat index.html
!rm index.html

--2021-12-09 14:58:43--  http://localhost:9200/
Resolving localhost (localhost)... 127.0.0.1
Connecting to localhost (localhost)|127.0.0.1|:9200... connected.
HTTP request sent, awaiting response... 200 OK
Length: 501 [application/json]
Saving to: ‘index.html’


2021-12-09 14:58:43 (63,8 MB/s) - ‘index.html’ saved [501/501]

{
  "name" : "james-new",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "jTPlAc85QhqmVhSBGc7AOQ",
  "version" : {
    "number" : "7.0.0",
    "build_flavor" : "default",
    "build_type" : "tar",
    "build_hash" : "b7e28a7",
    "build_date" : "2019-04-05T22:55:32.697037Z",
    "build_snapshot" : false,
    "lucene_version" : "8.0.0",
    "minimum_wire_compatibility_version" : "6.7.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [5]:
### TRAIN DATASET RESULTS ###

import json

from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval


def run_bm25(dataset, data_path, initialize = True, hostname = "localhost"):
    """Run BM25 retrieval over the "test" split of a dataset and return its NDCG scores.

    Args:
        dataset: Name used as the BM25 ``index_name`` and in the results file name.
        data_path: Directory handed to ``GenericDataLoader``; its "test" split is loaded.
        initialize: Forwarded unchanged to ``BM25Search``.
        hostname: Forwarded unchanged to ``BM25Search``.

    Returns:
        The first of the four values returned by ``EvaluateRetrieval.evaluate``
        (NDCG), computed at the retriever's ``k_values``.

    Side effects:
        Writes the retrieval results to ``./results_<dataset>.json``.
    """
    loader = GenericDataLoader(data_path)
    corpus, queries, qrels = loader.load(split="test")

    bm25 = BM25(index_name=dataset, hostname=hostname, initialize=initialize)
    evaluator = EvaluateRetrieval(bm25)

    # Lexical (BM25) retrieval; the format of the results is identical to qrels
    results = evaluator.retrieve(corpus, queries)

    # Persist the per-query document scores so they can be inspected later
    results_path = f"./results_{dataset}.json"
    with open(results_path, 'w') as results_file:
        json.dump(results, results_file)

    ndcg, _map, recall, precision = evaluator.evaluate(qrels, results, evaluator.k_values)
    return ndcg


# Calls for other datasets are left commented out; only BioASQ is evaluated in this cell.

### Signal 1M

#run_bm25("signal1m", "signal1m")

##### TREC-News

#run_bm25("trec-news", "trec-news")

##### TREC-Robust

#run_bm25("robust04", "robust04")

##### BioASQ

# Loads the data under ./bioasq; the returned NDCG dict is displayed as the cell output.
# NOTE(review): which queries/qrels are evaluated depends on which queries.jsonl and
# test.tsv were moved into bioasq/ beforehand (train vs. test).
run_bm25("bioasq", "bioasq")

  from tqdm.autonotebook import tqdm


  0%|          | 0/14913938 [00:00<?, ?it/s]

  0%|          | 0/14913854 [00:00<?, ?docs/s]                  
que: 100%|██████████| 26/26 [01:13<00:00,  2.81s/it]


{'NDCG@1': 0.37095,
 'NDCG@3': 0.35248,
 'NDCG@5': 0.34372,
 'NDCG@10': 0.33488,
 'NDCG@100': 0.39548,
 'NDCG@1000': 0.43395}

In [9]:
### TEST DATASET RESULTS ###

import json

from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval


def run_bm25(dataset, data_path, initialize = True, hostname = "localhost"):
    """Run BM25 retrieval over the "test" split of a dataset and return its NDCG scores.

    NOTE(review): this duplicates the definition in the TRAIN DATASET RESULTS cell.

    Args:
        dataset: Name used as the BM25 ``index_name`` and in the results file name.
        data_path: Directory handed to ``GenericDataLoader``; its "test" split is loaded.
        initialize: Forwarded unchanged to ``BM25Search``.
        hostname: Forwarded unchanged to ``BM25Search``.

    Returns:
        The first of the four values returned by ``EvaluateRetrieval.evaluate``
        (NDCG), computed at the retriever's ``k_values``.

    Side effects:
        Writes the retrieval results to ``./results_<dataset>.json``.
    """
    loader = GenericDataLoader(data_path)
    corpus, queries, qrels = loader.load(split="test")

    bm25 = BM25(index_name=dataset, hostname=hostname, initialize=initialize)
    evaluator = EvaluateRetrieval(bm25)

    # Lexical (BM25) retrieval; the format of the results is identical to qrels
    results = evaluator.retrieve(corpus, queries)

    # Persist the per-query document scores so they can be inspected later
    results_path = f"./results_{dataset}.json"
    with open(results_path, 'w') as results_file:
        json.dump(results, results_file)

    ndcg, _map, recall, precision = evaluator.evaluate(qrels, results, evaluator.k_values)
    return ndcg


# Calls for other datasets are left commented out; only BioASQ is evaluated in this cell.

### Signal 1M

#run_bm25("signal1m", "signal1m")

##### TREC-News

#run_bm25("trec-news", "trec-news")

##### TREC-Robust

#run_bm25("robust04", "robust04")

##### BioASQ

# Loads the data under ./bioasq; the returned NDCG dict is displayed as the cell output.
# NOTE(review): which queries/qrels are evaluated depends on which queries.jsonl and
# test.tsv were moved into bioasq/ beforehand (train vs. test).
run_bm25("bioasq", "bioasq")

  from tqdm.autonotebook import tqdm


  0%|          | 0/14914714 [00:00<?, ?it/s]

  0%|          | 0/14914604 [00:00<?, ?docs/s]                  
que: 100%|██████████| 4/4 [00:45<00:00, 11.39s/it]


{'NDCG@1': 0.472,
 'NDCG@3': 0.48468,
 'NDCG@5': 0.48426,
 'NDCG@10': 0.48838,
 'NDCG@100': 0.55887,
 'NDCG@1000': 0.58621}