#### Importing packages and loading the pre-trained BioBERT model

In [3]:
# Importing necessary libraries
import torch
from transformers import AutoTokenizer, AutoModel
import json
import nltk

# Fetch the NLTK resources this notebook downloads up front
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Select the compute device: CUDA when torch reports it as available, CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pre-trained checkpoint shared by the tokenizer and the encoder.
# BioBERT is used here because it is trained on biomedical literature.
MODEL_NAME = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load the encoder weights and place the model on the selected device in one step
model = AutoModel.from_pretrained(MODEL_NAME).to(device)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mehdin.masinovic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mehdin.masinovic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Loading and Processing the BioASQ Dataset (Similar to Baseline)

In [4]:
from utils_neural import get_embeddings

#### PubMed API Integration

#### Neural Document Retrieval

In [5]:
from utils_neural import retrieve_and_rank_documents_neural

#### Neural Snippet Extraction

In [6]:
from utils_neural import extract_and_rank_snippets_neural

#### Main Processing Pipeline

In [7]:
from utils import load_bioasq_questions
# Read questions from the BioASQ 13b training file, requesting 10 of them
bioasq_13b_questions = load_bioasq_questions(
    '../data/BioASQ-training13b/training13b.json',
    num_questions=10,
)

# Sanity check: report how many questions were actually loaded
print(f"Number of BioASQ 13b questions: {len(bioasq_13b_questions)}")

Number of BioASQ 13b questions: 10


In [8]:
# Run the neural retrieval pipeline over every loaded training question
results = []

for question in bioasq_13b_questions:
    print(f"Processing question: {question['id']}")

    # Step 1: neural document retrieval and ranking (called with max_docs=3)
    ranked_docs = retrieve_and_rank_documents_neural(question, model, tokenizer, max_docs=3)

    # Step 2: neural snippet extraction and ranking over those documents (max_snippets=3)
    ranked_snippets = extract_and_rank_snippets_neural(
        question, ranked_docs, model, tokenizer, max_snippets=3
    )

    # Step 3: assemble this question's entry; documents are turned into PubMed URLs
    # built from each document's 'pmid', snippets are stored exactly as returned
    document_urls = [f"http://www.ncbi.nlm.nih.gov/pubmed/{doc['pmid']}" for doc in ranked_docs]
    question_result = {
        'id': question['id'],
        'documents': document_urls,
        'snippets': ranked_snippets
    }
    results.append(question_result)

    print(f"Found {len(ranked_docs)} documents and {len(ranked_snippets)} snippets")

# Persist all per-question entries under a single top-level 'questions' key
with open('neural_results.json', 'w') as results_file:
    json.dump({'questions': results}, results_file, indent=2)

Processing question: 55031181e9bde69634000014
Found 3 documents and 3 snippets
Processing question: 55046d5ff8aee20f27000007
Found 0 documents and 0 snippets
Processing question: 54e25eaaae9738404b000017
Found 3 documents and 3 snippets
Processing question: 535d292a9a4572de6f000003
Found 3 documents and 3 snippets
Processing question: 55262a9787ecba3764000009
Found 3 documents and 3 snippets
Processing question: 51406e6223fec90375000009
Found 0 documents and 0 snippets
Processing question: 553fa78b1d53b76422000007
Found 3 documents and 3 snippets
Processing question: 5149199dd24251bc05000040
Found 3 documents and 3 snippets
Processing question: 52bf1db603868f1b06000011
Found 3 documents and 3 snippets
Processing question: 5709e4b2cf1c32585100001c
Found 0 documents and 0 snippets


In [9]:
from utils import load_bioasq_test_questions
# Read the questions from the BioASQ task 13b Phase A test set (batch 4) file
bioasq_13b_test_questions = load_bioasq_test_questions(
    '../data/BioASQ-task13bPhaseA-testset4.txt'
)

# Sanity check: report how many test questions were loaded
print(f"Number of BioASQ 13b test questions: {len(bioasq_13b_test_questions)}")

Number of BioASQ 13b test questions: 85


In [10]:
# Run the neural retrieval pipeline over every test question
results = []

for question in bioasq_13b_test_questions:
    print(f"Processing question: {question['id']}")

    # Step 1: retrieve a larger candidate pool (max_docs=50), then keep only
    # the first 10 entries of the ranked list
    ranked_docs = retrieve_and_rank_documents_neural(question, model, tokenizer, max_docs=50)
    top_docs = ranked_docs[:10]

    # Step 2: extract and rank snippets from the kept documents only (max_snippets=10)
    ranked_snippets = extract_and_rank_snippets_neural(
        question, top_docs, model, tokenizer, max_snippets=10
    )

    # Step 3: build the submission entry for this question.
    # Document URLs are built from each kept document's 'pmid'.
    document_urls = [f"http://www.ncbi.nlm.nih.gov/pubmed/{doc['pmid']}" for doc in top_docs]

    # Each snippet is reduced to the six fields written to the results file; its
    # document URL is rebuilt from the last '/'-separated part of snippet['document'].
    formatted_snippets = []
    for snippet in ranked_snippets:
        doc_id = snippet['document'].split('/')[-1]
        formatted_snippets.append({
            'document': f"http://www.ncbi.nlm.nih.gov/pubmed/{doc_id}",
            'text': snippet['text'],
            'offsetInBeginSection': snippet['offsetInBeginSection'],
            'offsetInEndSection': snippet['offsetInEndSection'],
            'beginSection': snippet['beginSection'],
            'endSection': snippet['endSection']
        })

    question_result = {
        'id': question['id'],
        'documents': document_urls,
        'snippets': formatted_snippets
    }
    results.append(question_result)

    print(f"Found {len(top_docs)} documents and {len(ranked_snippets)} snippets")

# Write every entry under a single top-level 'questions' key
with open('BioASQ-task13b-phaseA-testset4-neural-results.json', 'w') as results_file:
    json.dump({'questions': results}, results_file, indent=2)

Processing question: 67e6cf2618b1e36f2e0000d0
Found 0 documents and 0 snippets
Processing question: 680d5e47353a4a2e6b000005
Found 10 documents and 10 snippets
Processing question: 680f4a68353a4a2e6b000007
Found 10 documents and 10 snippets
Processing question: 680a083218b1e36f2e00014d
Found 10 documents and 10 snippets
Processing question: 67e5557c18b1e36f2e0000ac
Found 6 documents and 10 snippets
Processing question: 6810fef8353a4a2e6b000016
Found 0 documents and 0 snippets
Processing question: 6810cb23353a4a2e6b000012
Found 0 documents and 0 snippets
Processing question: 680bc7a718b1e36f2e000156
Found 10 documents and 10 snippets
Processing question: 67e56f2018b1e36f2e0000b0
Found 0 documents and 0 snippets
Processing question: 67fe5f0918b1e36f2e000144
Found 0 documents and 0 snippets
Processing question: 67fbe4d718b1e36f2e00011d
Found 10 documents and 10 snippets
Processing question: 680a079718b1e36f2e000147
Found 0 documents and 0 snippets
Processing question: 67e5749b18b1e36f2e00