In [None]:
import math
import os

import nemo
from nemo.utils.lr_policies import WarmupAnnealing

import nemo_nlp
from nemo_nlp import NemoBertTokenizer, SentencePieceTokenizer
from nemo_nlp.callbacks.ner import \
    eval_iter_callback, eval_epochs_done_callback

# --- Data ---
# Directory that "train.txt" and "dev.txt" are read from.
DATA_DIR = "PATH TO WHERE YOU PUT CoNLL-2003 data"
MAX_SEQ_LENGTH = 128

# --- Batching ---
BATCH_SIZE = 32
BATCHES_PER_STEP = 1

# --- Model ---
# Dropout handed to the token classification loss module.
CLASSIFICATION_DROPOUT = 0.1

# --- Optimization ---
OPTIMIZER = "adam"
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5
# Passed to the learning-rate policy as its warmup ratio.
LR_WARMUP_PROPORTION = 0.1

In [None]:
# Build the neural module factory that is handed to every module created
# in the cells below.
neural_factory = nemo.core.NeuralModuleFactory(
    # GPU placement. For multi-GPU training use
    # nemo.core.DeviceType.AllGpu instead.
    placement=nemo.core.DeviceType.GPU,

    # For multi-GPU training this value should come from the command line
    # (e.g. via argparse); examples/nlp/ner.py shows how.
    local_rank=None,

    # For mixed-precision training use mxprO1 or mxprO2 here. The opt levels
    # are described at https://nvidia.github.io/apex/amp.html#opt-levels
    optimization_level=nemo.core.Optimization.mxprO0,

    # Supported backend.
    backend=nemo.core.Backend.PyTorch,
)

In [None]:
# Name of the standard pretrained BERT model, shared by the tokenizer and
# the encoder so both are always built from the same checkpoint. The full
# list of available names is returned by
# nemo_nlp.huggingface.BERT.list_pretrained_models().
PRETRAINED_BERT_MODEL = "bert-base-cased"

tokenizer = NemoBertTokenizer(pretrained_model=PRETRAINED_BERT_MODEL)

bert_model = nemo_nlp.huggingface.BERT(
    factory=neural_factory,
    pretrained_model_name=PRETRAINED_BERT_MODEL)

In [None]:
# Data layer that reads the training file from DATA_DIR.
train_file = os.path.join(DATA_DIR, "train.txt")
train_data_layer = nemo_nlp.BertNERDataLayer(
    factory=neural_factory,
    tokenizer=tokenizer,
    path_to_data=train_file,
    batch_size=BATCH_SIZE,
    max_seq_length=MAX_SEQ_LENGTH)

# Tag mapping exposed by the training dataset; its length sets the number
# of labels the classifier predicts.
tag_ids = train_data_layer.dataset.tag_ids

# Module that turns the encoder's hidden states into a loss and logits.
# Its input width is taken from the BERT configuration.
encoder_hidden_size = bert_model.bert.config.hidden_size
ner_loss = nemo_nlp.TokenClassificationLoss(
    factory=neural_factory,
    d_model=encoder_hidden_size,
    num_labels=len(tag_ids),
    dropout=CLASSIFICATION_DROPOUT)

# Wire the training graph: data layer -> BERT -> classification loss.
# The fifth data layer output is not used for training.
(input_ids, input_type_ids, input_mask,
 labels, _) = train_data_layer()

hidden_states = bert_model(
    attention_mask=input_mask,
    token_type_ids=input_type_ids,
    input_ids=input_ids)

train_loss, train_logits = ner_loss(
    input_mask=input_mask,
    labels=labels,
    hidden_states=hidden_states)

In [None]:
# Data layer that reads the evaluation file ("dev.txt") from DATA_DIR, using
# the same tokenizer, sequence length and batch size as the training layer.
eval_data_layer = nemo_nlp.BertNERDataLayer(
    tokenizer=tokenizer,
    path_to_data=os.path.join(DATA_DIR, "dev.txt"),
    max_seq_length=MAX_SEQ_LENGTH,
    batch_size=BATCH_SIZE,
    factory=neural_factory)

# Every evaluation tensor carries an eval_ prefix so that running this cell
# never rebinds the training cell's input_ids / input_type_ids /
# hidden_states; the two graphs stay distinguishable by name and the cells
# remain safe to re-run in any order.
eval_input_ids, eval_input_type_ids, eval_input_mask, \
    eval_labels, eval_seq_ids = eval_data_layer()

# The evaluation graph reuses the same bert_model and ner_loss modules as
# the training graph.
eval_hidden_states = bert_model(
    input_ids=eval_input_ids,
    token_type_ids=eval_input_type_ids,
    attention_mask=eval_input_mask)

eval_loss, eval_logits = ner_loss(
    hidden_states=eval_hidden_states,
    labels=eval_labels,
    input_mask=eval_input_mask)


In [None]:
# Logs the training loss, formatted to three decimals, through print_func.
callback_train = nemo.core.SimpleLossLoggerCallback(
    tensors=[train_loss],
    print_func=lambda x: print("Loss: {:.3f}".format(x[0].item())))

train_data_size = len(train_data_layer)

# If you're training on multiple GPUs, this should be
# train_data_size // (batch_size * batches_per_step * num_gpus)
#
# Clamped to at least 1: a dataset smaller than one effective batch would
# otherwise give 0 steps per epoch, which is not a usable evaluation
# interval (eval_step below) nor a usable learning-rate schedule length.
steps_per_epoch = max(1, train_data_size // (BATCHES_PER_STEP * BATCH_SIZE))

# Evaluates once every steps_per_epoch training steps. The per-iteration
# callback receives the evaluation data layer and the tag mapping; the
# epochs-done callback receives the tag mapping and "output.txt"
# (presumably the file the evaluation report is written to -- confirm
# against nemo_nlp.callbacks.ner).
callback_eval = nemo.core.EvaluatorCallback(
    eval_tensors=[eval_logits, eval_seq_ids],
    user_iter_callback=lambda x, y: eval_iter_callback(
        x, y, eval_data_layer, tag_ids),
    user_epochs_done_callback=lambda x: eval_epochs_done_callback(
        x, tag_ids, "output.txt"),
    eval_step=steps_per_epoch)

In [None]:
# Learning-rate schedule spanning the whole run, with LR_WARMUP_PROPORTION
# passed as the warmup ratio.
total_steps = NUM_EPOCHS * steps_per_epoch
lr_policy = WarmupAnnealing(total_steps, warmup_ratio=LR_WARMUP_PROPORTION)

optimization_params = {"lr": LEARNING_RATE, "num_epochs": NUM_EPOCHS}

# Note: `optimizer` holds the trainer object returned by the factory; the
# optimizer algorithm itself is selected by the OPTIMIZER string below.
optimizer = neural_factory.get_trainer()
optimizer.train(
    tensors_to_optimize=[train_loss],
    callbacks=[callback_train, callback_eval],
    optimizer=OPTIMIZER,
    optimization_params=optimization_params,
    batches_per_step=BATCHES_PER_STEP,
    lr_policy=lr_policy)