In [None]:
# Importing necessary libraries
import requests
import json
import string

# Importing NLTK for text processing (if we don't use lemmatization or word2vec)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [None]:
import ssl

# Work around certificate-verification failures when NLTK downloads its data:
# if this Python build exposes an unverified-context factory, install it as the
# default HTTPS context factory. NOTE: this turns off HTTPS certificate checks
# for everything that relies on ssl's default HTTPS context in this process.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Download the tokenizer models and the stopword list used later in the notebook
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

## Loading training data

In [None]:
# Load BioASQ data
# The context manager closes the file handle as soon as parsing finishes, and the
# explicit UTF-8 encoding keeps decoding independent of the platform's locale.
with open('data/BioASQ-training13b/training13b.json', encoding='utf-8') as training_file:
    bioasq_13b_questions = json.load(training_file)['questions']

# Extract body, type, id and the gold-standard documents from the questions,
# keeping only the four supported question types
bioasq_13b_questions = [
    {
        # Question data
        'body': question['body'],
        'type': question['type'],
        'id': question['id'],
        'target_documents': question['documents'],
    }
    for question in bioasq_13b_questions
    if question['type'] in ['yesno', 'factoid', 'summary', 'list']
]

# Keep only the first `testing_questions` questions (small value = quick test run)
testing_questions = 1
bioasq_13b_questions = bioasq_13b_questions[:testing_questions]

# Check the number of questions
print(f"Number of BioASQ 13b questions: {len(bioasq_13b_questions)}")

## PubMed's API

### Generate session ID

In [None]:
GET_SESSION_URL = "http://bioasq.org:8000/pubmed"

def get_session(timeout=30):
    """
    Request a session URL from the BioASQ PubMed service.

    The response body of a GET to ``GET_SESSION_URL`` is the session URL itself.
    Sessions can be reused for multiple requests but expire after 10 minutes,
    so they must be renewed periodically.

    Args:
        timeout (float or tuple): Timeout in seconds handed to ``requests.get``.
            Without one, a stalled server would block the notebook indefinitely.
            Default is 30.

    Returns:
        str: The session URL (e.g., http://bioasq.org:8000/2?-3a641fde%3A19687315e96%3A-7fe2)
            if the server answers with status 200, None otherwise.
    Raises:
        requests.RequestException: If the GET request fails (connection error,
            timeout, ...). The error is printed and then re-raised.
    """
    try:
        # Sending a GET request to the server
        response = requests.get(GET_SESSION_URL, timeout=timeout)

        # Checking if the request was successful
        if response.status_code == 200:
            # The response body is the session URL; `.text` is already a str
            return response.text

        print(f"Error: Received status code {response.status_code}")
        return None
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        # Bare `raise` re-raises the active exception with its original traceback
        raise

### Get list of most relevant documents

In [None]:
def get_most_relevant_documents(keywords, page=0, documents_per_page=25, timeout=30):
    """
    This function retrieves the most relevant documents from the BioASQ server based on the provided keywords.

    A new session URL is requested through get_session() on every call.

    Args:
        keywords (str): The keywords to search for in the documents.
        page (int): The page number for pagination. Default is 0.
        documents_per_page (int): The number of documents to retrieve per page. Default is 25.
        timeout (float or tuple): Timeout in seconds handed to ``requests.post``. Default is 30.

    Returns:
        list: A list of objects containing the most relevant documents, or None if no
            session URL could be obtained or the server did not answer with status 200.
            Content of the objects:
                year (string): The year of publication.
                documentAbstract (string): Abstract of the document.
                meshAnnotations (unclear - Null): MESH annotations of the document. (Meaning unclear, usually Null)
                pmid (string): The PubMed ID of the document. Useful in case you want to look for the entire document in PubMed.
                        E.g. pmid = 38939119; https://pubmed.ncbi.nlm.nih.gov/38939119/
                title (string): Title of the document.
                sections (unclear - Null): Sections of the document? (Meaning unclear, usually Null)
                fulltextAvailable (Boolean): Indicates if the full text of the document is available.
                journal (string): Journal in which the document was published?
                meshHeading (list of strings): MESH entities of the document, related to knowledge graphs?

    Raises:
        requests.RequestException: If the session request or the search request fails.
    """
    session_url = get_session()
    if not session_url:
        # get_session() returns None on a non-200 answer; posting to None would
        # fail inside requests with an unrelated-looking error.
        print("Error: Could not obtain a session URL")
        return None

    # Serialize with json.dumps so quotes, backslashes and non-ASCII characters in
    # the keywords are escaped instead of producing a malformed payload.
    payload = json.dumps({"findPubMedCitations": [keywords, page, documents_per_page]})
    request_data = f"json={payload}"

    response = requests.post(session_url, data=request_data, timeout=timeout)

    if response.status_code == 200:
        return response.json()['result']['documents']
    else:
        print(f"Error: Received status code {response.status_code}")
        return None


## Traditional IR model

### Step 1: Extract keywords from questions

#### Remove stop words and punctuation

In [None]:
def extract_keywords(text):
    """
    Turn a question into a list of lowercase keyword tokens.

    The text is lowercased and tokenized with NLTK's word_tokenize; tokens that
    are not purely alphanumeric (punctuation, hyphenated terms, ...) or that are
    English stopwords are dropped. Order and duplicates are preserved.

    Args:
        text (str): The text to extract keywords from.

    Returns:
        list of str: The remaining tokens (join them with spaces to build a query string).
    """
    lowered_tokens = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words("english"))

    def _is_keyword(token):
        # Keep alphanumeric tokens that are not stopwords
        return token.isalnum() and token not in english_stopwords

    return list(filter(_is_keyword, lowered_tokens))

# Attach the extracted keywords to every question under the `keywords` key
for question in bioasq_13b_questions:
    question.update(keywords=extract_keywords(question['body']))

# Show the first question before and after keyword extraction as a sanity check
first_question = bioasq_13b_questions[0]
print("Original question body:", first_question['body'], sep="\n")
print("\nExtracted keywords:", first_question['keywords'], sep="\n")

### Step 2: Consume PubMed's API to get relevant documents

In [None]:
# Get the most relevant documents for each question according to the PubMed API
# and save them in a new attribute documents_api
for question in bioasq_13b_questions:
    # get_most_relevant_documents() returns None when the request fails; store an
    # empty list instead so `documents_api` is always a list and len() below works.
    documents = get_most_relevant_documents(' '.join(question['keywords'])) or []
    question['documents_api'] = documents

    print(f"Documents found for question `{question['id']}`: {len(documents)}")

### Step 3: Rank documents with "Traditional IR" model

In [None]:
from rank_bm25 import BM25Okapi

# Number of best-scoring documents kept (and printed) per question
TOP_K_DOCUMENTS = 10

for question in bioasq_13b_questions:

    # ---------------- Process the documents ----------------
    # `documents_api` can be missing, None or empty when the API call failed or
    # returned nothing; BM25Okapi cannot be built from an empty corpus, so skip.
    candidate_docs = question.get('documents_api') or []
    if not candidate_docs:
        top_docs = []
        print(f"No documents to rank for question `{question['id']}`.")
        print("\n")
        continue

    # For each question, concatenate the title and abstract of each of its documents.
    # Either field may be absent or None in the API response, so fall back to "".
    full_doc = [
        (doc.get("title") or "") + " " + (doc.get("documentAbstract") or "")
        for doc in candidate_docs
    ]

    # Tokenize the full documents (title + abstract) of the question
    tokenized_docs = [word_tokenize(doc.lower()) for doc in full_doc]

    # Tokenize the question (question body)
    tokenized_question = word_tokenize(question['body'].lower())

    # ---------------- Score the documents ----------------
    # Create bm25 instance over this question's candidate documents only
    bm25 = BM25Okapi(tokenized_docs)

    # Get the scores for the query (one score per candidate document, same order)
    scores = bm25.get_scores(tokenized_question)

    # Sort documents by score, best first
    ranked_docs = sorted(zip(candidate_docs, scores), key=lambda x: x[1], reverse=True)

    # Keep the top documents
    top_ranked = ranked_docs[:TOP_K_DOCUMENTS]
    top_docs = [doc for doc, score in top_ranked]

    print(f"Ranked documents for question `{question['id']}`:")

    # Print the top documents' ids with their scores
    for doc, score in top_ranked:
        print(f"Document ID: {doc['pmid']}, Score: {score}")
    print("\n")
