In [None]:
"""
You can either run this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""
# If you're using Google Colab and not running locally, run this cell.
# !pip install wget
# !pip install git+https://github.com/NVIDIA/apex.git
# !pip install nemo_toolkit[nlp]
# !pip install unidecode
import os
import nemo
import nemo.collections.nlp as nemo_nlp
import numpy as np
import time
import errno

from nemo.backends.pytorch.common.losses import CrossEntropyLossNM
from nemo.collections.nlp.nm.data_layers import BertTokenClassificationDataLayer
from nemo.collections.nlp.nm.trainables import TokenClassifier
from nemo.collections.nlp.callbacks.token_classification_callback import eval_epochs_done_callback, eval_iter_callback
from nemo.utils.lr_policies import get_lr_policy
from nemo import logging

# Introduction
BioBERT has the same network architecture as the original BERT, but instead of Wikipedia and BookCorpus it is pretrained on PubMed, a large biomedical text corpus. This domain-specific pretraining yields better performance on biomedical downstream tasks such as question answering (QA), named entity recognition (NER) and relation extraction (RE). This model was trained for 1M steps. For more information please refer to the original paper: https://academic.oup.com/bioinformatics/article/36/4/1234/5566506. For details about BERT please refer to https://ngc.nvidia.com/catalog/models/nvidia:bertbaseuncasedfornemo.


In this notebook we're going to showcase how to train BioBERT on a biomedical named entity recognition (NER) dataset.

# Download model checkpoint
Download the BioBERT/BioMegatron checkpoints from NGC: https://ngc.nvidia.com/catalog/models. Put the encoder weights
at `./checkpoints/biobert/BERT.pt` or `./checkpoints/biomegatron/BERT.pt`, and the model configuration file at `./checkpoints/biobert/bert_config.json` or `./checkpoints/biomegatron/bert_config.json`.

In [None]:
# Choose the model to fine-tune; every lookup table below is keyed by this name.
model_type = "biobert"  # "biomegatron"

# Folder holding the downloaded encoder weights and configuration for each model.
base_checkpoint_path = dict(
    biobert='./checkpoints/biobert/',
    biomegatron='./checkpoints/biomegatron/',
)

# Pretrained model name handed to the language-model and tokenizer helpers.
pretrained_model_name = dict(
    biobert='bert-base-cased',
    biomegatron='megatron-bert-uncased',
)

# Whether the tokenizer is asked to lower-case its input for each model.
do_lower_case = dict(biobert=False, biomegatron=True)

# Per-model working directory name (looked up later with work_dir[model_type]).
work_dir = dict(
    biobert='output_ner_biobert',
    biomegatron='output_ner_biomegatron',
)

In [None]:
# The checkpoints are available from NGC: https://ngc.nvidia.com/catalog/models
checkpoint_dir = base_checkpoint_path[model_type]
CHECKPOINT_ENCODER = os.path.join(checkpoint_dir, 'BERT.pt')  # model encoder checkpoint file
CHECKPOINT_CONFIG = os.path.join(checkpoint_dir, 'bert_config.json')  # model configuration file

# Stop right away (encoder checked first) if a required file has not been put in place yet.
for required_file in (CHECKPOINT_ENCODER, CHECKPOINT_CONFIG):
    if not os.path.exists(required_file):
        raise OSError(errno.ENOENT, os.strerror(errno.ENOENT), required_file)

# Download training data
In this example we download the NCBI-disease NER dataset to `./datasets/ncbi-disease` using `token_classification/get_medical_data.py`.

In [None]:
# Root folder for downloaded data and the name of the dataset to fetch.
# Both Python variables are expanded into the shell commands below via $name.
data_dir="./datasets"
dataset="ncbi-disease"
# Create the data folder; -p makes this a no-op when it already exists.
!mkdir -p $data_dir
# Fetch the dataset with the helper script from the token_classification example folder.
!python ../token_classification/get_medical_data.py --data_dir=$data_dir --dataset=$dataset
# Run the IOB import script once per split (train, test, dev).
# NOTE(review): presumably each run writes a text_<split>.txt / labels_<split>.txt pair
# next to the .tsv file -- confirm against import_from_iob_format.py.
!python ../token_classification/import_from_iob_format.py --data_file=$data_dir/$dataset/train.tsv
!python ../token_classification/import_from_iob_format.py --data_file=$data_dir/$dataset/test.tsv
!python ../token_classification/import_from_iob_format.py --data_file=$data_dir/$dataset/dev.tsv
# List the dataset folder so the generated files can be checked by eye.
!ls -l $data_dir/$dataset

After the previous step, you should have a `./datasets/ncbi-disease` folder that contains the following files:
- labels_train.txt
- labels_dev.txt
- labels_test.txt
- text_train.txt
- text_dev.txt
- text_test.txt

The format of the data is described in the NeMo docs.

# Create Neural Modules

In [None]:
# --- Inputs: checkpoint files, data files and hyperparameters ---------------
model_checkpoint = CHECKPOINT_ENCODER  # language model encoder file
model_config = CHECKPOINT_CONFIG  # model configuration file

# Resolve the per-model entry only while work_dir is still the lookup table.
# An unguarded `work_dir = work_dir[model_type]` replaces the dict with a string,
# so running this cell a second time would raise a TypeError (str indexed by str).
if isinstance(work_dir, dict):
    work_dir = work_dir[model_type]
# NOTE(review): work_dir is resolved here but is not passed to the factory below;
# confirm whether it was meant to be supplied to it.

train_data_text_file = f"{data_dir}/{dataset}/text_train.txt"
train_data_label_file = f"{data_dir}/{dataset}/labels_train.txt"
eval_data_text_file = f"{data_dir}/{dataset}/text_dev.txt"
eval_data_label_file = f"{data_dir}/{dataset}/labels_dev.txt"
none_label = "O"
num_labels = 3  # this should be the same number as number of labels in the training data
fc_dropout = 0.1
max_seq_length = 128
batch_size = 32

# --- Factory, encoder and tokenizer -----------------------------------------
nf = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
    placement=nemo.core.DeviceType.GPU,
)
# Encoder built from the downloaded configuration and checkpoint files.
model = nemo_nlp.nm.trainables.get_pretrained_lm_model(
    config=model_config,
    pretrained_model_name=pretrained_model_name[model_type],
    checkpoint=model_checkpoint,
)
tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer(
    tokenizer_name='nemobert',
    pretrained_model_name=pretrained_model_name[model_type],
    do_lower_case=do_lower_case[model_type],
)

# --- Classification head and loss -------------------------------------------
# The classifier input width follows the encoder's hidden size.
hidden_size = model.hidden_size
classifier = TokenClassifier(
    hidden_size=hidden_size,
    num_classes=num_labels,
    dropout=fc_dropout,
    num_layers=1,
)
task_loss = CrossEntropyLossNM(logits_ndim=3)

# --- Data layers ------------------------------------------------------------
# Training layer: shuffled; no label mapping is supplied (label_ids=None).
# NOTE(review): presumably the mapping is then derived from the label file -- confirm.
train_data_layer = BertTokenClassificationDataLayer(
    tokenizer=tokenizer,
    text_file=train_data_text_file,
    label_file=train_data_label_file,
    pad_label=none_label,
    label_ids=None,
    max_seq_length=max_seq_length,
    batch_size=batch_size,
    shuffle=True,
    use_cache=True,
)
# Evaluation layer: reuses the training layer's label mapping so that both
# layers agree on label ids; not shuffled and not cached.
eval_data_layer = BertTokenClassificationDataLayer(
    tokenizer=tokenizer,
    text_file=eval_data_text_file,
    label_file=eval_data_label_file,
    pad_label=none_label,
    label_ids=train_data_layer.dataset.label_ids,
    max_seq_length=max_seq_length,
    batch_size=batch_size,
    shuffle=False,
    use_cache=False,
)

# Creating Neural graph

In [None]:
# Training branch: data layer -> encoder -> token classifier -> loss.
train_data = train_data_layer()
train_hidden_states = model(
    input_ids=train_data.input_ids,
    token_type_ids=train_data.input_type_ids,
    attention_mask=train_data.input_mask,
)
train_logits = classifier(hidden_states=train_hidden_states)
loss = task_loss(
    logits=train_logits,
    labels=train_data.labels,
    loss_mask=train_data.loss_mask,
)

# Single-GPU step count. If you're training on multiple GPUs, this should be
# len(train_data_layer) // (batch_size * batches_per_step * num_gpus)
train_steps_per_epoch = len(train_data_layer) // batch_size
logging.info(f"doing {train_steps_per_epoch} steps per epoch")

# Evaluation branch: same encoder and classifier modules, fed by the eval data layer.
eval_data = eval_data_layer()
eval_hidden_states = model(
    input_ids=eval_data.input_ids,
    token_type_ids=eval_data.input_type_ids,
    attention_mask=eval_data.input_mask,
)
eval_logits = classifier(hidden_states=eval_hidden_states)


# Create Callbacks


In [None]:

def _log_train_loss(x):
    """Log the first tracked tensor (the training loss) with three decimals."""
    logging.info("Loss: {:.3f}".format(x[0].item()))


def _train_tb_values(x):
    """Pair the first tracked tensor with the tag "loss" for the TensorBoard writer."""
    return [["loss", x[0]]]


def _on_eval_iter(x, y):
    """Forward the per-iteration arguments unchanged to eval_iter_callback."""
    return eval_iter_callback(x, y)


def _on_eval_epochs_done(x):
    """Finish an evaluation pass using the training label ids and the graphs folder."""
    return eval_epochs_done_callback(x, train_data_layer.dataset.label_ids, f'{nf.work_dir}/graphs')


# Callback that reports the training loss every 100 steps.
train_callback = nemo.core.SimpleLossLoggerCallback(
    tensors=[loss],
    print_func=_log_train_loss,
    get_tb_values=_train_tb_values,
    step_freq=100,
    tb_writer=nf.tb_writer,
)

# Callback to evaluate the model
eval_callback = nemo.core.EvaluatorCallback(
    eval_tensors=[eval_logits, eval_data.labels, eval_data.subtokens_mask],
    user_iter_callback=_on_eval_iter,
    user_epochs_done_callback=_on_eval_epochs_done,
    tb_writer=nf.tb_writer,
    eval_step=100,
)


# Training
Training could take several minutes.

In [None]:

# Optimisation hyperparameters.
num_epochs = 10
lr_warmup_proportion = 0.1
lr = 4e-5
weight_decay = 0.01

# Learning-rate policy sized to the full run: epochs times steps per epoch.
lr_policy_fn = get_lr_policy(
    "WarmupAnnealing",
    total_steps=num_epochs * train_steps_per_epoch,
    warmup_ratio=lr_warmup_proportion,
)

# Optimise the loss, with both the loss-logging and the evaluation callbacks attached.
nf.train(
    tensors_to_optimize=[loss],
    callbacks=[train_callback, eval_callback],
    lr_policy=lr_policy_fn,
    optimizer="adam_w",
    optimization_params=dict(num_epochs=num_epochs, lr=lr, weight_decay=weight_decay),
)

The result should look something like:
```
[NeMo I 2020-05-22 17:13:48 token_classification_callback:82] Accuracy: 0.9882348032875798
[NeMo I 2020-05-22 17:13:48 token_classification_callback:86] F1 weighted: 98.82
[NeMo I 2020-05-22 17:13:48 token_classification_callback:86] F1 macro: 93.74
[NeMo I 2020-05-22 17:13:48 token_classification_callback:86] F1 micro: 98.82
[NeMo I 2020-05-22 17:13:49 token_classification_callback:89] precision    recall  f1-score   support
    
    O (label id: 0)     0.9938    0.9957    0.9947     22092
    B (label id: 1)     0.8843    0.9034    0.8938       787
    I (label id: 2)     0.9505    0.8982    0.9236      1090
    
           accuracy                         0.9882     23969
          macro avg     0.9429    0.9324    0.9374     23969
       weighted avg     0.9882    0.9882    0.9882     23969
```