In [None]:
# Importing necessary libraries
import requests
import json
import string

import nltk
from nltk.tokenize import word_tokenize

In [None]:
import ssl

# Disable SSL certificate verification so that downloading the NLTK resources
# below does not fail with certificate errors. If this Python build does not
# expose `ssl._create_unverified_context`, the default HTTPS context is left as is.
_create_unverified_https_context = getattr(ssl, '_create_unverified_context', None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

# Download the NLTK resources: punkt tokenizer, stopwords and punkt_tab
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)
# Kept as the final bare expression so the cell still displays its return value
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/saito/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/saito/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/saito/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Loading training data

In [7]:
from utils import load_bioasq_questions
# Load the BioASQ 13b training questions, requesting 10 of them via `num_questions`
bioasq_13b_questions = load_bioasq_questions(
    '../data/BioASQ-training13b/training13b.json',
    num_questions=10,
)

# Sanity check: report how many questions were actually loaded
print(f"Number of BioASQ 13b questions: {len(bioasq_13b_questions)}")

Number of BioASQ 13b questions: 10


## PubMed's API

## Traditional IR model

### Step 1: Extract keywords from questions

#### Remove stop words and punctuation

In [8]:
from utils import extract_keywords

# Attach the keywords extracted from each question body under a `keywords` key
for question in bioasq_13b_questions:
    body_keywords = extract_keywords(question['body'])
    question['keywords'] = body_keywords

# Show the first question before and after keyword extraction
print("Original question body:")
first_question = bioasq_13b_questions[0]
print(first_question['body'])
print("\nExtracted keywords:")
print(first_question['keywords'])

Original question body:
Is Hirschsprung disease a mendelian or a multifactorial disorder?

Extracted keywords:
['hirschsprung', 'disease', 'mendelian', 'multifactorial', 'disorder']


### Step 2: Consume PubMed's API to get relevant documents

In [9]:
# Get the most relevant documents for each question according to the PubMed API
# and save them in a new attribute documents_api
from utils import get_most_relevant_documents

for question in bioasq_13b_questions:
    # The search query is the question's keywords joined by single spaces
    keyword_query = ' '.join(question['keywords'])
    documents = get_most_relevant_documents(keyword_query)

    # Keep the retrieved documents on the question for the ranking step
    question['documents_api'] = documents

    print(f"Documents found for question `{question['id']}`: {len(documents)}")

Documents found for question `55031181e9bde69634000014`: 3
Documents found for question `55046d5ff8aee20f27000007`: 0
Documents found for question `54e25eaaae9738404b000017`: 4
Documents found for question `535d292a9a4572de6f000003`: 25
Documents found for question `55262a9787ecba3764000009`: 25
Documents found for question `51406e6223fec90375000009`: 0
Documents found for question `553fa78b1d53b76422000007`: 11
Documents found for question `5149199dd24251bc05000040`: 25
Documents found for question `52bf1db603868f1b06000011`: 4
Documents found for question `5709e4b2cf1c32585100001c`: 0


### Step 3: Rank documents with "Traditional IR" model

In [13]:
from rank_bm25 import BM25Okapi

# Number of top-ranked documents kept and printed for each question
TOP_K = 10


def _document_text(doc):
    """Return the document's title and abstract joined by a single space.

    A missing or ``None`` title/abstract is treated as an empty string, so a
    record without an abstract does not raise ``TypeError`` when the two
    fields are concatenated.
    """
    title = doc.get("title") or ""
    abstract = doc.get("documentAbstract") or ""
    return f"{title} {abstract}"


for question in bioasq_13b_questions:

    # Nothing to rank for this question: skip it before doing any tokenization
    if not question['documents_api']:
        print(f"Skipping question `{question['id']}` due to no documents found.\n")
        continue

    # ---------------- Process the documents ----------------
    # Build one text per document (title + abstract)
    full_doc = [_document_text(doc) for doc in question['documents_api']]

    # Lowercase and tokenize every document text
    tokenized_docs = [word_tokenize(doc.lower()) for doc in full_doc]

    # Lowercase and tokenize the question body, which is used as the query
    tokenized_question = word_tokenize(question['body'].lower())

    # ---------------- Score the documents ----------------
    # Create the BM25 instance from this question's documents only
    bm25 = BM25Okapi(tokenized_docs)

    # Get the scores for the query; the zip below relies on them being aligned
    # with question['documents_api']
    scores = bm25.get_scores(tokenized_question)

    # Pair each document with its score and sort by descending score
    ranked_docs = sorted(zip(question['documents_api'], scores), key=lambda x: x[1], reverse=True)

    # Keep the TOP_K best (document, score) pairs and the documents alone
    top_ranked = ranked_docs[:TOP_K]
    top_docs = [doc for doc, score in top_ranked]

    print(f"Ranked documents for question `{question['id']}`:")

    # Print the id and score of each of the top documents
    for doc, score in top_ranked:
        print(f"Document ID: {doc['pmid']}, Score: {score}")
    print("\n")


Ranked documents for question `55031181e9bde69634000014`:
Document ID: 15617541, Score: 1.0713616533282146
Document ID: 15829955, Score: 0.9393249488260605
Document ID: 12239580, Score: 0.8723052363678908


Skipping question `55046d5ff8aee20f27000007` due to no documents found.

Ranked documents for question `54e25eaaae9738404b000017`:
Document ID: 15094122, Score: 1.0153747819406589
Document ID: 11076767, Score: 1.0045996513693458
Document ID: 38284126, Score: 0.9014224564351652
Document ID: 21784067, Score: 0.6323846734404047


Ranked documents for question `535d292a9a4572de6f000003`:
Document ID: 35940442, Score: 7.196373859909771
Document ID: 33835135, Score: 4.008976980135349
Document ID: 33767452, Score: 3.8572114432991196
Document ID: 35923905, Score: 3.732640956178657
Document ID: 37543950, Score: 3.697055238237451
Document ID: 33201478, Score: 3.545103756873269
Document ID: 33300079, Score: 3.0357011242198104
Document ID: 38706580, Score: 3.0297871162329604
Document ID: 367894