In [None]:
"""
You can run this notebook either locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""
# If you're using Google Colab and not running locally, run this cell.
# !pip install wget
# !pip install git+https://github.com/NVIDIA/apex.git
# !pip install nemo_toolkit[nlp]
# !pip install unidecode
import os
import nemo
import nemo.collections.nlp as nemo_nlp
import numpy as np
import time
import errno
import json

from nemo.backends.pytorch.common.losses import CrossEntropyLossNM
from nemo.collections.nlp.nm.data_layers import BertQuestionAnsweringDataLayer
from nemo.collections.nlp.nm.trainables import TokenClassifier
from nemo.collections.nlp.callbacks.qa_squad_callback import eval_epochs_done_callback, eval_iter_callback
from nemo.utils.lr_policies import get_lr_policy
from nemo import logging

# Introduction
BioBERT has the same network architecture as the original BERT, but instead of Wikipedia and BookCorpus it is pretrained on PubMed, a large biomedical text corpus, which achieves better performance in biomedical downstream tasks, such as question answering (QA), named entity recognition (NER) and relationship extraction (RE). This model was trained for 1M steps. For more information please refer to the original paper https://academic.oup.com/bioinformatics/article/36/4/1234/5566506. For details about BERT please refer to https://ngc.nvidia.com/catalog/models/nvidia:bertbaseuncasedfornemo.

BioMegatron is an in-house model built with Megatron (https://github.com/NVIDIA/Megatron-LM) and pretrained on PubMed. It achieves better accuracy than BioBERT on downstream tasks.


In this notebook we're going to showcase how to train BioBERT/BioMegatron on a biomedical question answering (QA) dataset.

# Download model checkpoint
Download BioBERT/BioMegatron checkpoints finetuned on SQuADv1.1 from NGC: https://ngc.nvidia.com/catalog/models. Alternatively, you can download the pretrained BioBERT/BioMegatron checkpoints and do the finetuning on SQuADv1.1 locally. This will take some time. For this, follow the instructions at https://ngc.nvidia.com/catalog/models/nvidia:bertbaseuncasedsquadv1.
    Then, put the encoder weights at `./checkpoints/biobert/qa_squad/BERT.pt` or `./checkpoints/biomegatron/qa_squad/BERT.pt`, the model head weights at `./checkpoints/biobert/qa_squad/TokenClassifier.pt` or `./checkpoints/biomegatron/qa_squad/TokenClassifier.pt` and the model configuration file at `./checkpoints/biobert/qa_squad/bert_config.json` or `./checkpoints/biomegatron/qa_squad/bert_config.json`.

In [None]:
# Select the model to fine-tune: "biobert" or "biomegatron".
model_type = "biobert"

# Per-model settings; every dict below is keyed by the value of `model_type`.
# Directory holding BERT.pt, TokenClassifier.pt and bert_config.json.
base_checkpoint_path = dict(
    biobert='./checkpoints/biobert/qa_squad',
    biomegatron='./checkpoints/biomegatron/qa_squad',
)
# Pretrained model name passed to the language-model and tokenizer factories.
pretrained_model_name = dict(
    biobert='bert-base-cased',
    biomegatron='megatron-bert-uncased',
)
# Whether text is lower-cased by the tokenizer and during prediction post-processing.
do_lower_case = dict(biobert=False, biomegatron=True)
# Output directory for logs, checkpoints and prediction files.
work_dir = dict(
    biobert='output_bioasq_biobert',
    biomegatron='output_bioasq_biomegatron',
)

In [None]:
# The checkpoints are available from NGC: https://ngc.nvidia.com/catalog/models
_checkpoint_root = base_checkpoint_path[model_type]
CHECKPOINT_ENCODER = os.path.join(_checkpoint_root, 'BERT.pt')
CHECKPOINT_HEAD = os.path.join(_checkpoint_root, 'TokenClassifier.pt')
CHECKPOINT_CONFIG = os.path.join(_checkpoint_root, 'bert_config.json')

# Fail fast if any required file is missing. The files are checked in the order
# encoder, head, config, so the first missing one is the one reported.
for _required_file in (CHECKPOINT_ENCODER, CHECKPOINT_HEAD, CHECKPOINT_CONFIG):
    if os.path.exists(_required_file):
        continue
    raise OSError(errno.ENOENT, os.strerror(errno.ENOENT), _required_file)

# Download training data
You first need to download the QA dataset BioASQ 7B to `./datasets/BioASQ`. Before using the files in this repository, you must first register on the BioASQ website and download the [BioASQ Task B](http://participants-area.bioasq.org/Tasks/A/getData/) data.
You can also download part of the data using `../question_answering/get_bioasq.py`.
However, the test labels for 7B need to be downloaded from the official website.
In the following, we show an example of training and inference for 7B, which is a superset of 6B.

In [None]:
# The BioASQ data is expected under <data_dir>/<dataset>.
data_dir="./datasets"
dataset="BioASQ"
# Run the download script only when the dataset folder does not exist yet.
if not os.path.exists(f"{data_dir}/{dataset}"):
    !python ../question_answering/get_bioasq.py --data_dir=$data_dir
# List the dataset folder so its contents can be compared with the expected file list.
!ls -l $data_dir/$dataset

After the previous step, you should have a ./datasets/BioASQ folder that contains the following files:

- 6B1_golden.json
- 6B2_golden.json
- 6B3_golden.json
- 6B4_golden.json
- 6B5_golden.json
- BioASQ-6b/train/Full-Abstract/BioASQ-train-factoid-6b-full-annotated.json
- BioASQ-6b/test/BioASQ-6b/test/Full-Abstract/BioASQ-test-factoid-6b-1.json
- BioASQ-6b/test/BioASQ-6b/test/Full-Abstract/BioASQ-test-factoid-6b-2.json
- BioASQ-6b/test/BioASQ-6b/test/Full-Abstract/BioASQ-test-factoid-6b-3.json
- BioASQ-6b/test/BioASQ-6b/test/Full-Abstract/BioASQ-test-factoid-6b-4.json
- BioASQ-6b/test/BioASQ-6b/test/Full-Abstract/BioASQ-test-factoid-6b-5.json
- BioASQ-7b/train/Full-Abstract/BioASQ-train-factoid-7b-full-annotated.json
- BioASQ-7b/test/Full-Abstract/BioASQ-test-factoid-7b-1.json
- BioASQ-7b/test/Full-Abstract/BioASQ-test-factoid-7b-2.json
- BioASQ-7b/test/Full-Abstract/BioASQ-test-factoid-7b-3.json
- BioASQ-7b/test/Full-Abstract/BioASQ-test-factoid-7b-4.json
- BioASQ-7b/test/Full-Abstract/BioASQ-test-factoid-7b-5.json

The format of the data is described in the NeMo docs.

# Create Neural Modules

In [None]:
# Resolve the per-model settings selected by `model_type`.
model_checkpoint = CHECKPOINT_ENCODER  # language model encoder weights
head_checkpoint = CHECKPOINT_HEAD  # question answering head (TokenClassifier) weights
model_config = CHECKPOINT_CONFIG  # model configuration file
# `work_dir` starts out as a {model_type: directory} dict and is narrowed here to one path.
# The isinstance guard keeps this cell re-runnable: on a second run `work_dir` is already a
# string, and indexing it with `model_type` would raise a TypeError.
# After changing `model_type`, re-run the configuration cell first so the dict is restored.
if isinstance(work_dir, dict):
    work_dir = work_dir[model_type]
train_file = f"{data_dir}/{dataset}/BioASQ-7b/train/Full-Abstract/BioASQ-train-factoid-7b-full-annotated.json"

# Preprocessing and batching settings, reused by the test data layer.
doc_stride = 128
max_query_length = 64
max_seq_length = 384
batch_size = 12
version_2_with_negative = False

nf = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
    placement=nemo.core.DeviceType.GPU,
    log_dir=work_dir
)

# Encoder: language model built from the config file and restored from the encoder checkpoint.
model = nemo_nlp.nm.trainables.get_pretrained_lm_model(
        config=model_config, pretrained_model_name=pretrained_model_name[model_type], checkpoint=model_checkpoint
    )
tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer(
    tokenizer_name='nemobert',
    pretrained_model_name=pretrained_model_name[model_type],
    do_lower_case=do_lower_case[model_type]
)

# QA head: one linear layer with two outputs per token, restored from the head checkpoint.
# The two outputs are later split into start and end logits.
hidden_size = model.hidden_size
qa_head = TokenClassifier(
    hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False, name="TokenClassifier"
)
qa_head.restore_from(head_checkpoint)
task_loss = nemo_nlp.nm.losses.SpanningLoss()

# Create the training data layer; preprocessing takes a while. use_cache=True (set below)
# caches the preprocessed data for future reuse.
# Remember to delete the cache when you switch the tokenizer/model
# (BioBERT and BioMegatron use different tokenizers).
train_data_layer = BertQuestionAnsweringDataLayer(
    mode="train",
    tokenizer=tokenizer,
    version_2_with_negative=version_2_with_negative,
    data_file=train_file,
    max_query_length=max_query_length,
    max_seq_length=max_seq_length,
    doc_stride=doc_stride,
    batch_size=batch_size,
    shuffle=True,
    use_cache=True
)

# Creating Neural graph

In [None]:
# Wire the training graph: data layer -> encoder -> QA head -> loss.
train_data = train_data_layer()
hidden_states = model(
    input_ids=train_data.input_ids,
    token_type_ids=train_data.input_type_ids,
    attention_mask=train_data.input_mask,
)
qa_output = qa_head(hidden_states=hidden_states)
loss = task_loss(
    logits=qa_output,
    start_positions=train_data.start_positions,
    end_positions=train_data.end_positions,
)

# Single-GPU step count. If you're training on multiple GPUs, this should be
# len(train_data_layer) // (batch_size * batches_per_step * num_gpus)
train_steps_per_epoch = len(train_data_layer) // batch_size
logging.info(f"doing {train_steps_per_epoch} steps per epoch")

# Create Callbacks


In [None]:
def _log_train_loss(tensors):
    """Log the first evaluated tensor (the training loss) with three decimals."""
    logging.info("Loss: {:.3f}".format(tensors[0].item()))


def _train_tb_values(tensors):
    """Return the [name, value] pairs handed to the TensorBoard writer."""
    return [["loss", tensors[0]]]


# Report the training loss every 100 steps, to the log and to TensorBoard.
train_callback = nemo.core.SimpleLossLoggerCallback(
    tensors=[loss.loss],
    print_func=_log_train_loss,
    get_tb_values=_train_tb_values,
    step_freq=100,
    tb_writer=nf.tb_writer,
)

# Save a checkpoint once per epoch; step-based saving is disabled with step_freq=-1.
ckpt_callback = nemo.core.CheckpointCallback(
    folder=nf.checkpoint_dir,
    epoch_freq=1,
    step_freq=-1,
)

# Training
This may take more than an hour.

In [None]:
# Optimisation hyperparameters.
num_epochs = 5
lr = 5e-6
lr_warmup_proportion = 0
weight_decay = 0

# Learning-rate schedule spanning the whole run (all epochs).
lr_policy_fn = get_lr_policy(
    "WarmupAnnealing",
    total_steps=num_epochs * train_steps_per_epoch,
    warmup_ratio=lr_warmup_proportion,
)

# Reset the trainer before training, so re-running this cell starts from a reset trainer.
nf.reset_trainer()
nf.train(
    tensors_to_optimize=[loss.loss],
    callbacks=[train_callback, ckpt_callback],
    lr_policy=lr_policy_fn,
    optimizer="adam_w",
    optimization_params=dict(num_epochs=num_epochs, lr=lr, weight_decay=weight_decay),
)

# Inference
Do inference on test data 7b-1 7b-2 7b-3 7b-4 7b-5. Here we only show inference with 7b-4. Rerun the following cells with all 5 test sets to get all numbers.

In [None]:
# Test set selection: the BioASQ edition ("7b") and the test batch within it ("1" to "5").
# Both values are interpolated into the test file and golden file names.
test_dataset, test_dataset_idx = "7b", "4"

In [None]:
# Test file for the selected edition and batch.
test_file = f"{data_dir}/{dataset}/BioASQ-{test_dataset}/test/Full-Abstract/BioASQ-test-factoid-{test_dataset}-{test_dataset_idx}.json"
logging.info(f"using test file {test_file}")

# Same tokenizer and preprocessing limits as for training, but unshuffled and with batch size 1.
test_data_layer = BertQuestionAnsweringDataLayer(
    mode="test",
    data_file=test_file,
    tokenizer=tokenizer,
    version_2_with_negative=version_2_with_negative,
    max_seq_length=max_seq_length,
    max_query_length=max_query_length,
    doc_stride=doc_stride,
    batch_size=1,
    shuffle=False,
    use_cache=True,
)

# Creating Neural test graph: data layer -> encoder -> QA head.
test_data = test_data_layer()
test_hidden_states = model(
    input_ids=test_data.input_ids,
    token_type_ids=test_data.input_type_ids,
    attention_mask=test_data.input_mask,
)
test_qa_output = qa_head(hidden_states=test_hidden_states)
# Tensors to evaluate at inference time: the example ids and the matching QA head outputs.
test_tensors = [test_data.unique_ids, test_qa_output]

In [None]:
# Post-processing settings passed to get_predictions below.
n_best_size = 20
null_score_diff_threshold = 0
max_answer_length = 30

logging.info(f"work dir {work_dir}, checkpoint dir {nf.checkpoint_dir}")
output_prediction_file = f"{work_dir}/predictions.json"
output_nbest_file = f"{work_dir}/nbest.json"

# Evaluate the test graph, with the factory's checkpoint directory passed as checkpoint_dir.
evaluated_tensors = nf.infer(
    tensors=test_tensors,
    cache=False,
    offload_to_cpu=False,
    checkpoint_dir=nf.checkpoint_dir,
)

# Flatten the per-batch results into one list of ids and one list of logits.
unique_ids = [uid for batch in evaluated_tensors[0] for uid in batch.tolist()]
logits = [row for batch in evaluated_tensors[1] for row in batch.tolist()]

# The last axis is split in two halves: start logits and end logits.
start_logits, end_logits = np.split(np.asarray(logits), 2, axis=-1)

(all_predictions, all_nbest, scores_diff) = test_data_layer.dataset.get_predictions(
    unique_ids=unique_ids,
    start_logits=start_logits,
    end_logits=end_logits,
    n_best_size=n_best_size,
    max_answer_length=max_answer_length,
    version_2_with_negative=version_2_with_negative,
    null_score_diff_threshold=null_score_diff_threshold,
    do_lower_case=do_lower_case[model_type],
)

# Dump the n-best list first, then the final predictions, as indented JSON.
for _out_path, _payload in (
    (output_nbest_file, all_nbest),
    (output_prediction_file, all_predictions),
):
    with open(_out_path, "w") as writer:
        writer.write(json.dumps(_payload, indent=4) + "\n")


In [110]:
# Example test question: print the test file path, then the entry for one question id
# together with the 5 lines that precede the match.
!echo $test_file
!grep -B 5 "5c72b7277c78d69471000073_001" $test_file

./datasets/BioASQ/BioASQ-7b/test/Full-Abstract/BioASQ-test-factoid-7b-4.json
        {
          "context": "Construction of a natural panel of 11p11.2 deletions and further delineation of the critical region involved in Potocki-Shaffer syndrome. Potocki-Shaffer syndrome (PSS) is a contiguous gene deletion syndrome that results from haploinsufficiency of at least two genes within the short arm of chromosome 11[del(11)(p11.2p12)]. The clinical features of PSS can include developmental delay, mental retardation, multiple exostoses, parietal foramina, enlarged anterior fontanel, minor craniofacial anomalies, ophthalmologic anomalies, and genital abnormalities in males. We constructed a natural panel of 11p11.2-p13 deletions using cell lines from 10 affected individuals, fluorescence in situ hybridization (FISH), microsatellite analyses, and array-based comparative genomic hybridization (array CGH). We then compared the deletion sizes and clinical features between affected individuals. The

In [111]:
# the corresponding first 2 best answers of the n-best list prediction with probabilities.
!grep -A 20 "5c72b7277c78d69471000073_001" $data_dir/$dataset/$prefix$suffix $output_nbest_file

output_bioasq_biobert/nbest.json:    "5c72b7277c78d69471000073_001": [
output_bioasq_biobert/nbest.json-        {
output_bioasq_biobert/nbest.json-            "text": "p11.2p12",
output_bioasq_biobert/nbest.json-            "probability": 0.3970165418689914,
output_bioasq_biobert/nbest.json-            "start_logit": [
output_bioasq_biobert/nbest.json-                5.539025783538818
output_bioasq_biobert/nbest.json-            ],
output_bioasq_biobert/nbest.json-            "end_logit": [
output_bioasq_biobert/nbest.json-                6.180495738983154
output_bioasq_biobert/nbest.json-            ]
output_bioasq_biobert/nbest.json-        },
output_bioasq_biobert/nbest.json-        {
output_bioasq_biobert/nbest.json-            "text": "p11.2p12)",
output_bioasq_biobert/nbest.json-            "probability": 0.1409910996956543,
output_bioasq_biobert/nbest.json-            "start_logit": [
output_bioasq_biobert/nbest.json-                5.539025783538818
output_bioas

In [112]:
# The golden label can be found in the following file under "exact_answer".
# NOTE(review): the original comment said the golden label equals the prediction. In the saved
# outputs the golden answer is "11p11.2p12" and the top prediction is "p11.2p12", so they are
# close but not identical strings. Confirm how the evaluator scores this.
prefix=test_dataset.upper()
suffix=f"{test_dataset_idx}_golden.json"
!grep -B 7 "5c72b7277c78d69471000073" $data_dir/$dataset/$prefix$suffix

      "exact_answer": [
        [
          "11p11.2p12"
        ]
      ], 
      "concepts": [], 
      "type": "factoid", 
      "id": "5c72b7277c78d69471000073", 


# Evaluate inference output with BioASQ metrics

In [None]:
if not os.path.exists('bioasq-biobert'):
    print("clone https://github.com/dmis-lab/bioasq-biobert.git")
    !git clone https://github.com/dmis-lab/bioasq-biobert.git && cd bioasq-biobert && git fetch origin pull/12/head:fix_indentation && git checkout fix_indentation && cd ..
if not os.path.exists('Evaluation-Measures'):
    print("clone https://github.com/BioASQ/Evaluation-Measures.git")
    !git clone https://github.com/BioASQ/Evaluation-Measures.git && git checkout cd93f3b8eb290c965d18ef466ee28a0bcf451e5d

In [None]:
# Run the bioasq-biobert transform script on the n-best file. Its output directory is where
# the evaluation step looks for BioASQform_BioASQ-answer.json.
transformed_nbest_dir=f"{work_dir}/transformed_nbest"
!mkdir -p $transformed_nbest_dir
!python bioasq-biobert/biocodes/transform_n2b_factoid.py --nbest_path=$output_nbest_file --output_path=$transformed_nbest_dir

In [None]:
# The golden file is named "<EDITION><idx>_golden.json", e.g. "7B4_golden.json" for 7b batch 4.
prefix=test_dataset.upper()
suffix=f"{test_dataset_idx}_golden.json"
# Run the BioASQ Java evaluator (-phaseB) on the golden file and the transformed predictions.
! java -Xmx10G -cp Evaluation-Measures/flat/BioASQEvaluation/dist/BioASQEvaluation.jar evaluation.EvaluatorTask1b -phaseB -e 5 $data_dir/$dataset/$prefix$suffix $transformed_nbest_dir/BioASQform_BioASQ-answer.json

With the default hyperparameters, the result for the 7b-4 factoid test set will look something like this for BioBERT:

```0.0 0.45161290322580644 0.6774193548387096 0.5403225806451613 0.0 0.0 0.0 0.0 0.0 0.0```

and for BioMegatron:

```0.0 0.6470588235 0.8235294118 0.7254901961 0.0 0.0 0.0 0.0 0.0 0.0```

where the second, third and fourth numbers will be strict accuracy (SAcc), lenient accuracy (LAcc) and mean reciprocal rank (MRR) for factoid
questions respectively.

The class weighted average for 7B test factoid of all 5 tasks:

| Model | SAcc | LAcc | MRR |
| :---         |     :---:      |        :---:     |     :---: |
|BioBERT   | 0.39     | 0.6 | 0.47   |
|BioMegatron     | 0.48       | 0.64    |0.54|