In [1]:
# Importing necessary libraries
import requests
import json
import string

import nltk
from nltk.tokenize import word_tokenize

In [2]:
import ssl

# Disable SSL verification to avoid certificate errors when downloading nltk resources
# NOTE: this swaps the factory stored in `ssl._create_default_https_context`,
# so it affects every caller in this process that builds its HTTPS context
# through that hook (e.g. urllib), not only the NLTK downloader.
# Interpreters without the private helper are left untouched.
_create_unverified_https_context = getattr(ssl, '_create_unverified_context', None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

# Download the NLTK resources used by this notebook: the punkt tokenizer data
# ('punkt' and 'punkt_tab' -- presumably the newer table-based format; confirm)
# and the stopwords corpus.
# The calls are kept as separate statements: the value returned by the last one
# is what the cell displays as its output.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/saito/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/saito/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/saito/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Loading the BioASQ 13b questions

In [3]:
from utils import load_bioasq_questions
# Read the questions from the Phase A file.
# NOTE(review): `num_questions=None` presumably means "no limit" and
# `test=False` selects a loader mode -- confirm both against utils.
bioasq_13b_questions = load_bioasq_questions(
    '../data/BioASQ-task13bPhaseA-testset4.txt',
    num_questions=None,
    test=False,
)

# Sanity check: how many questions were loaded
print(f"Number of BioASQ 13b questions: {len(bioasq_13b_questions)}")

Number of BioASQ 13b questions: 85


## PubMed's API

## Traditional IR model

### Step 1: Extract keywords from questions

#### Ideal preprocessing - Apply word2vec

In [None]:
from utils import load_vectors_gensim

# Paths handed to the loader (a word list and a vector file, judging by the names)
types_file, vectors_file = (
    '../data/word2vecTools/types.txt',
    '../data/word2vecTools/vectors.txt',
)

# NOTE(review): 200 is presumably the embedding dimensionality -- confirm
# against utils.load_vectors_gensim.
w2v_model = load_vectors_gensim(types_file, vectors_file, 200)

In [None]:
from utils import expand_question_with_w2v, build_boolean_query

def expand_question(question):
    """
    Attach word2vec-expanded keywords and a boolean query to a question.

    The question is modified in place: `question['keywords']` receives the
    result of `expand_question_with_w2v` applied to the question body (using
    the notebook-level `w2v_model`), and `question['query']` receives the
    result of `build_boolean_query` applied to those keywords.

    Returns the same (mutated) question object.
    """
    keywords = expand_question_with_w2v(question['body'], w2v_model)
    question['keywords'] = keywords
    question['query'] = build_boolean_query(keywords)
    return question

for i, question in enumerate(bioasq_13b_questions):
    # Expand the question using word2vec (adds 'keywords' and 'query')
    question = expand_question(question)

    # Report the expansion, followed by a blank separator line
    print(
        f"Expanded Question {i+1}:",
        f"Keywords: {question['keywords']}",
        f"Query: {question['query']}",
        "",
        sep="\n",
    )

#### Preprocessing alternative - Just remove stop words and punctuation

In [None]:
# from utils import extract_keywords, build_boolean_query

# for question in bioasq_13b_questions:
#     question['keywords'] = extract_keywords(question['body'])

#     question['query'] = ' '.join(question['keywords'])

### Step 2: Consume PubMed's API to get relevant documents

In [7]:
# Get the most relevant documents for each question according to the PubMed API
# and save them in a new attribute documents_api
from utils import get_most_relevant_documents

for question in bioasq_13b_questions:
    # NOTE(review): page=0 / documents_per_page=50 presumably request the
    # first page of up to 50 hits -- confirm against utils.
    documents = get_most_relevant_documents(
        question['query'],
        page=0,
        documents_per_page=50,
    )

    # Keep the raw API hits and start with an empty ranked-result list
    question.update(documents_api=documents, documents=[])

    print(f"Documents found for question `{question['id']}`: {len(documents)}")

Documents found for question `67e6cf2618b1e36f2e0000d0`: 0
Documents found for question `680d5e47353a4a2e6b000005`: 41
Documents found for question `680f4a68353a4a2e6b000007`: 11
Documents found for question `680a083218b1e36f2e00014d`: 45
Documents found for question `67e5557c18b1e36f2e0000ac`: 6
Documents found for question `6810fef8353a4a2e6b000016`: 0
Documents found for question `6810cb23353a4a2e6b000012`: 0
Documents found for question `680bc7a718b1e36f2e000156`: 50
Documents found for question `67e56f2018b1e36f2e0000b0`: 0
Documents found for question `67fe5f0918b1e36f2e000144`: 0
Documents found for question `67fbe4d718b1e36f2e00011d`: 50
Documents found for question `680a079718b1e36f2e000147`: 0
Documents found for question `67e5749b18b1e36f2e0000b5`: 50
Documents found for question `680d5f2a353a4a2e6b000006`: 0
Documents found for question `6810f6f0353a4a2e6b000015`: 1
Documents found for question `680a237618b1e36f2e000152`: 1
Documents found for question `68110110353a4a2e6b00

### Step 3: Rank documents with "Traditional IR" model

In [8]:
from rank_bm25 import BM25Okapi

def apply_bm25_ranking(bioasq_13b_questions, top_k=10):
    """
    Apply BM25 ranking to the documents of each question in the BioASQ 13b dataset.

    For every question, the title and abstract of each entry in
    `question['documents_api']` are joined, lower-cased and tokenized, then
    scored with BM25 against the tokenized question body. The PubMed URLs of
    the `top_k` best-scoring documents are stored in `question['documents']`.

    Args:
        bioasq_13b_questions: list of question dicts providing the keys
            'id', 'type', 'body' and 'documents_api'. Each dict is mutated in
            place: its 'documents' entry is (re)built on every call.
        top_k: maximum number of ranked documents kept per question
            (defaults to 10).

    Returns:
        A list with one dict per question holding only 'id', 'type', 'body'
        and 'documents' (the ranked PubMed URLs, possibly empty).
    """
    for question in bioasq_13b_questions:
        # Start from a fresh list so that re-running the cell does not append
        # the same URLs a second time, and so a missing key cannot raise.
        question['documents'] = []

        # Keep every document paired with its own tokens. Filtering the texts
        # separately from the documents would shift the BM25 scores onto the
        # wrong documents as soon as one entry has no text.
        candidates = []
        for doc in question['documents_api']:
            # Tolerate a missing/None title or abstract instead of crashing
            text = f"{doc.get('title') or ''} {doc.get('documentAbstract') or ''}"
            if text.strip():
                candidates.append((doc, word_tokenize(text.lower())))

        # Nothing with usable text: leave 'documents' empty for this question
        if not candidates:
            print(f"No documents to rank for question `{question['id']}`.")
            continue

        # Tokenize the question (question body)
        tokenized_question = word_tokenize(question['body'].lower())

        # Score the candidate texts against the question
        bm25 = BM25Okapi([tokens for _, tokens in candidates])
        scores = bm25.get_scores(tokenized_question)

        # Highest score first; sorted() is stable, so ties keep retrieval order
        ranked_docs = sorted(
            zip((doc for doc, _ in candidates), scores),
            key=lambda pair: pair[1],
            reverse=True,
        )

        print(f"Ranked documents for question `{question['id']}`:")

        # Keep the top_k documents for each question
        for doc, score in ranked_docs[:top_k]:
            question['documents'].append(f"http://www.ncbi.nlm.nih.gov/pubmed/{doc['pmid']}")
            print(f"Document ID: {doc['pmid']}, Score: {score}")
        print("\n")

    return [
        {
            'id': question['id'],
            'type': question['type'],
            'body': question['body'],
            'documents': question['documents']
        }
        for question in bioasq_13b_questions
    ]


# Rank each question's retrieved documents with BM25; the result keeps only
# the id, type, body and ranked document URLs of every question.
traditional_ranked_questions = apply_bm25_ranking(bioasq_13b_questions)


No documents to rank for question `67e6cf2618b1e36f2e0000d0`.
Ranked documents for question `680d5e47353a4a2e6b000005`:
Document ID: 38309959, Score: 9.37924436910425
Document ID: 20854242, Score: 8.938550746548314
Document ID: 16428701, Score: 8.82679321141176
Document ID: 11586003, Score: 8.56347553465076
Document ID: 37954759, Score: 8.404561138335444
Document ID: 37562935, Score: 8.280248837120762
Document ID: 19962027, Score: 8.090772195821643
Document ID: 35927044, Score: 7.865801200416887
Document ID: 11052424, Score: 7.846529681170669
Document ID: 38508949, Score: 7.844014214077833


Ranked documents for question `680f4a68353a4a2e6b000007`:
Document ID: 11543693, Score: 6.0443011004224765
Document ID: 27089947, Score: 5.955344175656932
Document ID: 31474191, Score: 5.844531779494639
Document ID: 21340685, Score: 5.514759900032739
Document ID: 38495098, Score: 5.410128249526193
Document ID: 18020586, Score: 5.309849139395425
Document ID: 35026139, Score: 5.120845564296237
Docume

## Save the results

In [9]:
from utils import save_results_to_json

# Write the ranked questions to disk; as the last expression of the cell,
# the helper's return value is displayed as the cell output.
save_results_to_json(
    traditional_ranked_questions,
    filename='../output/results_traditional_model.json',
)



'Results saved to ../output/results_traditional_model.json'