In [None]:
# Importing necessary libraries
import requests
import json
import string

# Fetch the NLTK data packages requested by this notebook (punkt tokenizer
# data, the stopwords corpus and punkt_tab), then expose the word tokenizer.
import nltk

for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

from nltk.tokenize import word_tokenize

In [None]:
import ssl

# When this Python build exposes ssl._create_unverified_context, install it as
# the factory for default HTTPS contexts; otherwise leave ssl untouched.
# NOTE(review): this switches off certificate verification for default HTTPS
# contexts created afterwards in this kernel — confirm it is really required.
# NOTE(review): on a fresh "Run All" this cell executes after the
# nltk.download calls in the first cell, so it cannot affect those downloads.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

## Loading training data

In [None]:
from utils import load_bioasq_questions

# Read questions from the BioASQ 13b training file, asking the loader for 10
# of them (num_questions=10).
bioasq_training_file = '../data/BioASQ-training13b/training13b.json'
bioasq_13b_questions = load_bioasq_questions(bioasq_training_file, num_questions=10)

# Report how many questions were actually loaded
loaded_question_count = len(bioasq_13b_questions)
print(f"Number of BioASQ 13b questions: {loaded_question_count}")

## PubMed's API

## Traditional IR model

### Step 1: Extract keywords from questions

#### Remove stop words and punctuation

In [None]:
from utils import extract_keywords

# Store the keywords extracted from each question's body under the
# question's 'keywords' key.
for entry in bioasq_13b_questions:
    body_keywords = extract_keywords(entry['body'])
    entry['keywords'] = body_keywords

# Show the first question's body next to the keywords extracted from it
print("Original question body:")
first_question = bioasq_13b_questions[0]
print(first_question['body'])
print("\nExtracted keywords:")
print(first_question['keywords'])

### Step 2: Consume PubMed's API to get relevant documents

In [None]:
from utils import get_most_relevant_documents

# For every question, build a search string by joining its keywords with
# spaces, pass it to get_most_relevant_documents (per the section heading this
# is backed by PubMed's API), and store the result under 'documents_api'.
for question_entry in bioasq_13b_questions:
    search_query = ' '.join(question_entry['keywords'])
    documents = get_most_relevant_documents(search_query)
    question_entry['documents_api'] = documents

    retrieved_count = len(documents)
    print(f"Documents found for question `{question_entry['id']}`: {retrieved_count}")

### Step 3: Rank documents with "Traditional IR" model

In [None]:
from rank_bm25 import BM25Okapi

# Number of top-scoring documents kept (and printed) for each question
BM25_TOP_K = 10

for question in bioasq_13b_questions:

    candidate_docs = question['documents_api']

    # BM25Okapi divides by the corpus size while computing the average document
    # length, so an empty corpus raises ZeroDivisionError. Skip questions with
    # no retrieved documents instead of aborting the whole ranking loop.
    if not candidate_docs:
        top_docs = []
        print(f"No documents to rank for question `{question['id']}`.")
        print("\n")
        continue

    # ---------------- Process the documents ----------------
    # Concatenate the title and abstract of each retrieved document.
    # NOTE(review): assumes both 'title' and 'documentAbstract' are strings for
    # every document — TODO confirm against get_most_relevant_documents.
    full_doc = [doc["title"] + " " + doc["documentAbstract"] for doc in candidate_docs]

    # Lower-case and tokenize the documents (title + abstract)
    tokenized_docs = [word_tokenize(doc.lower()) for doc in full_doc]

    # Lower-case and tokenize the question body; it is used as the BM25 query
    tokenized_question = word_tokenize(question['body'].lower())

    # ---------------- Score the documents ----------------
    # Build a BM25 index over this question's documents only
    bm25 = BM25Okapi(tokenized_docs)

    # One score per document, in the same order as candidate_docs
    scores = bm25.get_scores(tokenized_question)

    # Pair each document with its score and sort by descending score
    ranked_docs = sorted(zip(candidate_docs, scores), key=lambda x: x[1], reverse=True)

    # Keep the top BM25_TOP_K (document, score) pairs and the documents alone
    top_ranked = ranked_docs[:BM25_TOP_K]
    top_docs = [doc for doc, score in top_ranked]

    print(f"Ranked documents for question `{question['id']}`:")

    # Print the id and score of each kept document
    for doc, score in top_ranked:
        print(f"Document ID: {doc['pmid']}, Score: {score}")
    print("\n")
