In [None]:
# This example is for demonstration purposes.
# It shows how to pre-train BERT from scratch using your own tokenizer model.
# To build the tokenizer model, run:
# python tests/data/create_vocab.py --train_path wikitext-2/train.txt
# For more details, refer to the corresponding NLP tutorial in the NeMo documentation.

import math
import os

import nemo
from nemo.utils.lr_policies import CosineAnnealing

import nemo_nlp
from nemo_nlp import NemoBertTokenizer, SentencePieceTokenizer
from nemo_nlp.callbacks.bert_pretraining import eval_iter_callback, \
    eval_epochs_done_callback

# --- Model architecture (passed to the BERT encoder and the output heads) ---
NUM_LAYERS = 12
NUM_HEADS = 12
D_MODEL = 768          # hidden size of the encoder and of both heads
D_INNER = 3072         # intermediate (feed-forward) size of the encoder
HIDDEN_ACT = "gelu"
MAX_SEQ_LENGTH = 128   # also used as max_position_embeddings

# --- Data ---
MASK_PROBABILITY = 0.15
BATCH_SIZE = 64        # training batch size
BATCH_SIZE_EVAL = 16   # evaluation batch size

# --- Optimization ---
OPTIMIZER = "novograd"
LEARNING_RATE = 1e-2
LR_WARMUP_PROPORTION = 0.05
WEIGHT_DECAY = 0
NUM_EPOCHS = 10
BATCHES_PER_STEP = 1

# --- Output ---
# NOTE(review): not referenced by any other cell visible in this notebook.
CHECKPOINT_DIR = "bert_pretraining_checkpoints"

In [None]:
# Create the neural module factory on the PyTorch backend.
#
# Settings to revisit when changing the hardware setup:
#   local_rank         - for multi-GPU training, handle this value with
#                        something like argparse; see
#                        examples/nlp/bert_pretraining.py for an example.
#   optimization_level - for mixed-precision training, set to mxprO1 or mxprO2;
#                        see https://nvidia.github.io/apex/amp.html#opt-levels
#   placement          - for multi-GPU training, set to
#                        nemo.core.DeviceType.AllGpu
neural_factory = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
    local_rank=None,
    optimization_level=nemo.core.Optimization.mxprO0,
    placement=nemo.core.DeviceType.GPU,
)

In [None]:
# Location of the SentencePiece model built beforehand (see the instructions
# at the top of this notebook). Set the TOKENIZER_MODEL_PATH environment
# variable to point at your own file; otherwise "tokenizer.model" in the
# current working directory is used.
# NOTE(review): the default assumes the model file sits in the working
# directory -- adjust if your tokenizer build writes it elsewhere.
TOKENIZER_MODEL_PATH = os.environ.get("TOKENIZER_MODEL_PATH", "tokenizer.model")

tokenizer = SentencePieceTokenizer(model_path=TOKENIZER_MODEL_PATH)
# Register the special tokens used by the BERT pre-training setup below.
tokenizer.add_special_tokens(["[MASK]", "[CLS]", "[SEP]"])

In [None]:
# BERT encoder built from the hyperparameters in the configuration cell.
# The vocabulary size comes from the tokenizer, so the special tokens must
# already have been added to it.
bert_model = nemo_nlp.huggingface.BERT(
    vocab_size=tokenizer.vocab_size,
    hidden_size=D_MODEL,
    intermediate_size=D_INNER,
    num_hidden_layers=NUM_LAYERS,
    num_attention_heads=NUM_HEADS,
    hidden_act=HIDDEN_ACT,
    max_position_embeddings=MAX_SEQ_LENGTH,
    factory=neural_factory,
)

In [None]:
# --- Masked language modeling (MLM) head and loss ---
mlm_log_softmax = nemo_nlp.TransformerLogSoftmaxNM(
    vocab_size=tokenizer.vocab_size,
    d_model=D_MODEL,
    factory=neural_factory,
)
mlm_loss = nemo_nlp.MaskedLanguageModelingLossNM(factory=neural_factory)

# Weight tying: the MLM head's dense layer is pointed at the very same weight
# object as the encoder's word embeddings.
mlm_log_softmax.log_softmax.dense.weight = (
    bert_model.bert.embeddings.word_embeddings.weight
)

# --- Next sentence prediction (NSP) head and loss: two-class classifier ---
nsp_log_softmax = nemo_nlp.SentenceClassificationLogSoftmaxNM(
    d_model=D_MODEL,
    num_classes=2,
    factory=neural_factory,
)
nsp_loss = nemo_nlp.NextSentencePredictionLossNM(factory=neural_factory)

# Combines the two losses (MLM and NSP) into the single training loss.
bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2, factory=neural_factory)

In [None]:
# Directory containing the WikiText-2 "train.txt" and "test.txt" files.
# Set the WIKITEXT2_DIR environment variable to override it; the default is
# the relative "wikitext-2" directory, matching the path used when building
# the tokenizer (see the top of this notebook).
DATA_DIR = os.environ.get("WIKITEXT2_DIR", "wikitext-2")

# Training data: masked with MASK_PROBABILITY, batched with BATCH_SIZE.
train_data_layer = nemo_nlp.BertPretrainingDataLayer(
    tokenizer=tokenizer,
    dataset=os.path.join(DATA_DIR, "train.txt"),
    name="train",
    max_seq_length=MAX_SEQ_LENGTH,
    mask_probability=MASK_PROBABILITY,
    batch_size=BATCH_SIZE,
    factory=neural_factory)

# Evaluation data: same settings, but with the evaluation batch size.
test_data_layer = nemo_nlp.BertPretrainingDataLayer(
    tokenizer=tokenizer,
    dataset=os.path.join(DATA_DIR, "test.txt"),
    name="test",
    max_seq_length=MAX_SEQ_LENGTH,
    mask_probability=MASK_PROBABILITY,
    batch_size=BATCH_SIZE_EVAL,
    factory=neural_factory)

In [None]:
# Training graph: data layer -> BERT encoder -> MLM / NSP heads -> joint loss.
(input_ids, input_type_ids, input_mask,
 output_ids, output_mask, nsp_labels) = train_data_layer()

hidden_states = bert_model(
    input_ids=input_ids,
    token_type_ids=input_type_ids,
    attention_mask=input_mask,
)

# Masked language modeling branch.
train_mlm_log_probs = mlm_log_softmax(hidden_states=hidden_states)
train_mlm_loss = mlm_loss(
    log_probs=train_mlm_log_probs,
    output_ids=output_ids,
    output_mask=output_mask,
)

# Next sentence prediction branch.
train_nsp_log_probs = nsp_log_softmax(hidden_states=hidden_states)
train_nsp_loss = nsp_loss(log_probs=train_nsp_log_probs, labels=nsp_labels)

# Single tensor that the optimizer minimizes.
train_loss = bert_loss(loss_1=train_mlm_loss, loss_2=train_nsp_loss)

In [None]:
# Evaluation graph: reuses the encoder, heads and loss modules from training,
# fed by the test data layer. The two losses are kept separate here (no
# aggregation). Trailing underscores mark the evaluation-side tensors.
(input_ids_, input_type_ids_, input_mask_,
 output_ids_, output_mask_, nsp_labels_) = test_data_layer()

hidden_states_ = bert_model(
    input_ids=input_ids_,
    token_type_ids=input_type_ids_,
    attention_mask=input_mask_,
)

# Masked language modeling branch.
test_mlm_log_probs = mlm_log_softmax(hidden_states=hidden_states_)
test_mlm_loss = mlm_loss(
    log_probs=test_mlm_log_probs,
    output_ids=output_ids_,
    output_mask=output_mask_,
)

# Next sentence prediction branch.
test_nsp_log_probs = nsp_log_softmax(hidden_states=hidden_states_)
test_nsp_loss = nsp_loss(log_probs=test_nsp_log_probs, labels=nsp_labels_)

In [None]:
# Training-side callback: prints the aggregated training loss to 3 decimals.
callback_loss = nemo.core.SimpleLossLoggerCallback(
    tensors=[train_loss],
    print_func=lambda x: print("Loss: {:.3f}".format(x[0].item())),
)

# Number of optimizer steps per pass over the training data (truncated to an
# integer). If you're training on multiple GPUs, this should be
# train_data_size / (batch_size * batches_per_step * num_gpus).
train_data_size = len(train_data_layer)
steps_per_epoch = int(train_data_size / (BATCHES_PER_STEP * BATCH_SIZE))

# Evaluation-side callback: tracks both test losses, with eval_step set to
# one epoch's worth of training steps.
callback_test = nemo.core.EvaluatorCallback(
    eval_tensors=[test_mlm_loss, test_nsp_loss],
    user_iter_callback=eval_iter_callback,
    user_epochs_done_callback=eval_epochs_done_callback,
    eval_step=steps_per_epoch,
)

In [None]:
# Cosine-annealing learning-rate policy sized to NUM_EPOCHS * steps_per_epoch
# steps, with LR_WARMUP_PROPORTION passed as the warmup ratio.
lr_policy = CosineAnnealing(
    NUM_EPOCHS * steps_per_epoch,
    warmup_ratio=LR_WARMUP_PROPORTION,
)

# Launch training: optimize the aggregated loss, with both the loss-logging
# and the evaluation callbacks attached.
neural_factory.train(
    tensors_to_optimize=[train_loss],
    lr_policy=lr_policy,
    callbacks=[callback_loss, callback_test],
    batches_per_step=BATCHES_PER_STEP,
    optimizer=OPTIMIZER,
    optimization_params={
        "batch_size": BATCH_SIZE,
        "num_epochs": NUM_EPOCHS,
        "lr": LEARNING_RATE,
        "weight_decay": WEIGHT_DECAY,
        "betas": (0.95, 0.98),
        "grad_norm_clip": None,
    },
)