In [None]:
"""
You can either run this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""
# If you're using Google Colab and not running locally, run this cell.
# !pip install wget
# !pip install git+https://github.com/NVIDIA/apex.git
# !pip install nemo_toolkit[nlp]
# !pip install unidecode
import os
import nemo
import nemo.collections.nlp as nemo_nlp
import numpy as np
import time
import errno

from nemo.backends.pytorch.common.losses import CrossEntropyLossNM
from nemo.collections.nlp.data.datasets import TextClassificationDataDesc
from nemo.collections.nlp.nm.data_layers import BertTextClassificationDataLayer
from nemo.collections.nlp.nm.trainables import SequenceClassifier
from nemo.collections.nlp.callbacks.text_classification_callback import eval_epochs_done_callback, eval_iter_callback
from nemo.utils.lr_policies import get_lr_policy
from nemo import logging

# Introduction
BioBERT has the same network architecture as the original BERT, but instead of Wikipedia and BookCorpus it is pretrained on PubMed, a large biomedical text corpus. This domain-specific pretraining yields better performance on biomedical downstream tasks such as question answering (QA), named entity recognition (NER), and relation extraction (RE). This model was trained for 1M steps. For more information, please refer to the original paper: https://academic.oup.com/bioinformatics/article/36/4/1234/5566506. For details about BERT, please refer to https://ngc.nvidia.com/catalog/models/nvidia:bertbaseuncasedfornemo.


In this notebook we're going to showcase how to train BioBERT on a biomedical relation extraction (RE) dataset.

# Download model checkpoint
Download the BioBERT/BioMegatron checkpoints from NGC (https://ngc.nvidia.com/catalog/models). Put the encoder weights
at `./checkpoints/biobert/BERT.pt` or `./checkpoints/biomegatron/BERT.pt`, and the model configuration file at `./checkpoints/biobert/bert_config.json` or `./checkpoints/biomegatron/bert_config.json`.

In [None]:
# Choose the pretrained encoder to fine-tune: "biobert" or "biomegatron".
model_type = "biobert"

# Every setting below is a lookup table keyed by `model_type`.

# Directory that holds the downloaded encoder checkpoint and its configuration file.
base_checkpoint_path = {
    'biobert': './checkpoints/biobert/',
    'biomegatron': './checkpoints/biomegatron/',
}

# Name of the pretrained model handed to the model and tokenizer factories.
pretrained_model_name = {
    'biobert': 'bert-base-cased',
    'biomegatron': 'megatron-bert-uncased',
}

# Value passed to the tokenizer as `do_lower_case` (cased vs. uncased vocabulary).
do_lower_case = {
    'biobert': False,
    'biomegatron': True,
}

# Output directory name for each model.
work_dir = {
    'biobert': 'output_re_biobert',
    'biomegatron': 'output_re_biomegatron',
}

In [None]:
# The checkpoints are available from NGC: https://ngc.nvidia.com/catalog/models
_checkpoint_dir = base_checkpoint_path[model_type]
CHECKPOINT_ENCODER = os.path.join(_checkpoint_dir, 'BERT.pt')  # model encoder checkpoint file
CHECKPOINT_CONFIG = os.path.join(_checkpoint_dir, 'bert_config.json')  # model configuration file

# Stop right here with a "No such file or directory" error naming the missing path,
# rather than failing later while the model is being built.
for _required_file in (CHECKPOINT_ENCODER, CHECKPOINT_CONFIG):
    if not os.path.exists(_required_file):
        raise OSError(errno.ENOENT, os.strerror(errno.ENOENT), _required_file)

# Download training data
In this example we download the ChemProt RE dataset to `./datasets/chemprot` and process it with `text_classification/data/import_datasets.py`.

In [None]:
#download https://github.com/arwhirang/recursive_chemprot/blob/master/Demo/tree_LSTM/data/chemprot-data_treeLSTM.zip and extract it into ./datasets/chemprot
data_dir="./datasets"
dataset="chemprot"
if not os.path.exists(f"{data_dir}/{dataset}"):
    !mkdir -p $data_dir/$dataset
    !wget "https://github.com/arwhirang/recursive_chemprot/blob/master/Demo/tree_LSTM/data/chemprot-data_treeLSTM.zip?raw=true" -O data.zip
    !unzip data.zip -d $data_dir/$dataset
    !rm data.zip

!python ../text_classification/data/import_datasets.py --source_data_dir=$data_dir/$dataset --target_data_dir=$data_dir/$dataset --dataset_name=$dataset
!ls -l $data_dir/$dataset

After the previous step, you should have a ./datasets/chemprot folder that contains the following files:
- train.tsv
- test.tsv
- dev.tsv
- label_mapping.tsv

The format of the data is described in the NeMo docs.

# Create Neural Modules

In [None]:
# Inputs for this cell: checkpoint files, data files and a few hyperparameters.
model_checkpoint=CHECKPOINT_ENCODER # language model encoder file
model_config=CHECKPOINT_CONFIG # model configuration file
# `work_dir` starts out as the per-model dict from the configuration cell. Resolve it to the
# entry for `model_type` only while it is still a dict; otherwise re-running this cell would
# index into the already-resolved string and raise a TypeError.
if isinstance(work_dir, dict):
    work_dir=work_dir[model_type]
train_data_text_file=f"{data_dir}/{dataset}/train.tsv"
eval_data_text_file=f"{data_dir}/{dataset}/dev.tsv"
fc_dropout=0.1
max_seq_length=128
batch_size=32
num_output_layers=1

# Factory configured for the PyTorch backend with GPU placement.
nf = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
    placement=nemo.core.DeviceType.GPU
)
# Pretrained language model built from the configuration file and the encoder checkpoint.
model = nemo_nlp.nm.trainables.get_pretrained_lm_model(
        config=model_config, pretrained_model_name=pretrained_model_name[model_type], checkpoint=model_checkpoint
    )
# Tokenizer for the same pretrained model name, with the per-model casing setting.
tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer(
    tokenizer_name='nemobert',
    pretrained_model_name=pretrained_model_name[model_type],
    do_lower_case=do_lower_case[model_type]
)
hidden_size = model.hidden_size
# Dataset description for the train and dev splits; it supplies the number of labels below.
data_desc = TextClassificationDataDesc(data_dir=f"{data_dir}/{dataset}", modes=['train', 'dev'])
# Classification head whose input width is the encoder's hidden size.
classifier = nemo_nlp.nm.trainables.SequenceClassifier(    
    hidden_size=hidden_size,
    num_classes=data_desc.num_labels,
    dropout=fc_dropout,
    num_layers=num_output_layers,
    log_softmax=False,
)
task_loss = CrossEntropyLossNM(weight=None)
# Training data layer: shuffled, with caching enabled.
train_data_layer = BertTextClassificationDataLayer(
    tokenizer=tokenizer,
    input_file=train_data_text_file,
    max_seq_length=max_seq_length,
    batch_size=batch_size,
    shuffle=True,
    use_cache=True
)
# Evaluation data layer: read in file order, without caching.
eval_data_layer = BertTextClassificationDataLayer(
    tokenizer=tokenizer,
    input_file=eval_data_text_file,
    max_seq_length=max_seq_length,
    batch_size=batch_size,
    shuffle=False,
    use_cache=False
)


# Creating Neural graph

In [None]:
def _encode_and_classify(batch):
    """Feed one data-layer output through the encoder and then the classifier.

    Returns the encoder output and the classifier output, in that order.
    """
    hidden_states = model(
        input_ids=batch.input_ids,
        token_type_ids=batch.input_type_ids,
        attention_mask=batch.input_mask,
    )
    return hidden_states, classifier(hidden_states=hidden_states)


# Training branch: data layer -> encoder -> classifier -> loss.
train_data = train_data_layer()
train_hidden_states, train_logits = _encode_and_classify(train_data)
loss = task_loss(logits=train_logits, labels=train_data.labels)

# If you're training on multiple GPUs, this should be
# len(train_data_layer) // (batch_size * batches_per_step * num_gpus)
train_steps_per_epoch = len(train_data_layer) // batch_size
logging.info(f"doing {train_steps_per_epoch} steps per epoch")

# Evaluation branch: same encoder and classifier, no loss attached.
eval_data = eval_data_layer()
eval_hidden_states, eval_logits = _encode_and_classify(eval_data)

# Create Callbacks


In [None]:
def _log_train_loss(tensors):
    """Log the first monitored tensor (the training loss) with three decimals."""
    logging.info("Loss: {:.3f}".format(tensors[0].item()))


def _loss_tb_values(tensors):
    """Return the name/value pairs to write to TensorBoard for the training loss."""
    return [["loss", tensors[0]]]


def _on_eval_iter(tensors, global_vars):
    """Forward one evaluation batch to the text-classification iteration callback."""
    return eval_iter_callback(tensors, global_vars, eval_data_layer)


def _on_eval_done(global_vars):
    """Forward the accumulated evaluation state, with `<work_dir>/graphs` as the output folder."""
    return eval_epochs_done_callback(global_vars, f'{nf.work_dir}/graphs')


# Callback that reports the training loss every 100 steps.
train_callback = nemo.core.SimpleLossLoggerCallback(
    tensors=[loss],
    print_func=_log_train_loss,
    get_tb_values=_loss_tb_values,
    step_freq=100,
    tb_writer=nf.tb_writer,
)

# Callback to evaluate the model every 500 steps.
eval_callback = nemo.core.EvaluatorCallback(
    eval_tensors=[eval_logits, eval_data.labels],
    user_iter_callback=_on_eval_iter,
    user_epochs_done_callback=_on_eval_done,
    tb_writer=nf.tb_writer,
    eval_step=500,
)

# Training
Training could take several minutes.

In [None]:
# Optimisation hyperparameters.
num_epochs = 3
lr_warmup_proportion = 0.1
lr = 3e-5
weight_decay = 0.01

# The learning-rate schedule spans the whole run: every epoch times the steps per epoch.
_total_train_steps = num_epochs * train_steps_per_epoch
lr_policy_fn = get_lr_policy(
    "WarmupAnnealing",
    total_steps=_total_train_steps,
    warmup_ratio=lr_warmup_proportion,
)

_optimization_params = {
    "num_epochs": num_epochs,
    "lr": lr,
    "weight_decay": weight_decay,
}

# Start training on the loss, with the loss logger and the evaluator attached.
nf.train(
    tensors_to_optimize=[loss],
    callbacks=[train_callback, eval_callback],
    lr_policy=lr_policy_fn,
    optimizer="adam_w",
    optimization_params=_optimization_params,
)

The result should look something like this:
```
precision    recall  f1-score   support
    
               0     0.7328    0.8348    0.7805       115
               1     0.9402    0.9291    0.9346      7950
               2     0.8311    0.9146    0.8708       199
               3     0.6400    0.6302    0.6351       457
               4     0.8002    0.8317    0.8156      1093
               5     0.7228    0.7518    0.7370       548
    
        accuracy                         0.8949     10362
       macro avg     0.7778    0.8153    0.7956     10362
    weighted avg     0.8963    0.8949    0.8954     10362
```