### Step 1 - Download and prepare data

In [None]:
# This example is for demonstration purposes.
# Please refer to the corresponding NLP tutorial in the NeMo documentation.
# Runs the download helper script; the following cell checks for its output
# under data/lm/wikitext-2.
! scripts/get_wkt2.sh

In [None]:
# Verify the data is there: list the dataset directory.
# Later cells read train.txt and valid.txt from this directory.
! ls -l data/lm/wikitext-2

In [None]:
# Prepare the tokenization model from the training text.
# The tokenizer cell in Step 2 loads "tokenizer.model" and notes that it was
# created during this step.
! python scripts/create_vocab.py --train_path=data/lm/wikitext-2/train.txt

### Step 2 - Import necessary packages, define hyperparameters, create tokenizer instance

In [None]:
import os
import torch
import nemo

from nemo.utils.lr_policies import CosineAnnealing

import nemo_nlp
from nemo_nlp import NemoBertTokenizer, SentencePieceTokenizer
from nemo_nlp.utils.callbacks.bert_pretraining import eval_iter_callback, \
    eval_epochs_done_callback

# --- Batching ---------------------------------------------------------------
BATCHES_PER_STEP = 1
BATCH_SIZE = 64          # batch size of the training data layer
BATCH_SIZE_EVAL = 16     # batch size of the evaluation data layer

# --- Model architecture (passed to the BERT module) --------------------------
D_MODEL = 768            # hidden_size
D_INNER = 3072           # intermediate_size
NUM_HEADS = 12           # num_attention_heads
NUM_LAYERS = 1           # num_hidden_layers
HIDDEN_ACT = "relu"      # hidden_act
MAX_SEQ_LENGTH = 128     # max_seq_length / max_position_embeddings

# --- Data --------------------------------------------------------------------
MASK_PROBABILITY = 0.15  # mask_probability of the pretraining data layers

# --- Optimization ------------------------------------------------------------
OPTIMIZER = "adam_w"
LEARNING_RATE = 1e-4
LR_WARMUP_PROPORTION = 0.05  # warmup_ratio of the learning-rate policy
# Note that for demo purposes this is set to just one epoch
NUM_EPOCHS = 1

In [None]:
# Instantiate the neural module factory with a supported backend (PyTorch).
#
# Multi-GPU training:
#   - local_rank: handle this value with something like argparse
#     (see examples/nlp/bert_pretraining.py for an example).
#   - placement: should be set to nemo.core.DeviceType.AllGpu.
#
# Mixed-precision training:
#   - optimization_level: should be set to mxprO1 or mxprO2
#     (see https://nvidia.github.io/apex/amp.html#opt-levels for more details).
neural_factory = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
    local_rank=None,
    optimization_level=nemo.core.Optimization.mxprO1,
    placement=nemo.core.DeviceType.GPU,
)

In [None]:
# Load the tokenizer.model file that was created during Step 1, then add the
# special tokens to the tokenizer.
tokenizer = SentencePieceTokenizer(
    model_path="tokenizer.model",
)
tokenizer.add_special_tokens(
    ["[MASK]", "[CLS]", "[SEP]"],
)

#### Instantiate necessary neural modules

In [None]:
# BERT module, sized by the hyperparameters from Step 2 and the tokenizer's
# vocabulary.
bert_model = nemo_nlp.huggingface.BERT(
    vocab_size=tokenizer.vocab_size,          # includes the added special tokens
    num_hidden_layers=NUM_LAYERS,
    hidden_size=D_MODEL,
    num_attention_heads=NUM_HEADS,
    intermediate_size=D_INNER,
    max_position_embeddings=MAX_SEQ_LENGTH,   # same limit as the data layers
    hidden_act=HIDDEN_ACT,
    factory=neural_factory,
)

In [None]:
# Masked Language Modeling (MLM): token classifier over the vocabulary + loss
mlm_classifier = nemo_nlp.BertTokenClassifier(
    D_MODEL,
    num_classes=tokenizer.vocab_size,
    activation=HIDDEN_ACT,
    log_softmax=True,
)
mlm_loss = nemo_nlp.MaskedLanguageModelingLossNM()

# Next Sentence Prediction (NSP): two-class sequence classifier + loss
nsp_classifier = nemo_nlp.SequenceClassifier(
    D_MODEL,
    num_classes=2,
    num_layers=2,
    activation='tanh',
    log_softmax=False,
)
nsp_loss = nemo.backends.pytorch.common.CrossEntropyLoss()

# Aggregator that takes the two losses (MLM and NSP) as inputs
bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)

In [None]:
# NOTE: `os` is imported once in the imports cell of Step 2, so it is not
# re-imported here.

# Directory populated in Step 1; holds train.txt and valid.txt.
DATA_DIR = "data/lm/wikitext-2"


def _make_pretraining_data_layer(split_file, batch_size):
    """Create a BertPretrainingDataLayer for one split of the dataset.

    The tokenizer, maximum sequence length, mask probability and neural
    factory are shared by every split; only the text file and the batch
    size differ.

    Args:
        split_file: File name inside DATA_DIR (e.g. "train.txt").
        batch_size: Batch size passed to the data layer.

    Returns:
        The constructed nemo_nlp.BertPretrainingDataLayer.
    """
    return nemo_nlp.BertPretrainingDataLayer(
        tokenizer=tokenizer,
        dataset=os.path.join(DATA_DIR, split_file),
        max_seq_length=MAX_SEQ_LENGTH,
        mask_probability=MASK_PROBABILITY,
        batch_size=batch_size,
        factory=neural_factory)


train_data_layer = _make_pretraining_data_layer("train.txt", BATCH_SIZE)
eval_data_layer = _make_pretraining_data_layer("valid.txt", BATCH_SIZE_EVAL)

### Step 3 - Describe training and evaluation DAGs

In [None]:
# Training DAG: data layer -> BERT -> MLM / NSP classifiers -> aggregated loss
(input_ids, input_type_ids, input_mask,
 output_ids, output_mask, nsp_labels) = train_data_layer()

hidden_states = bert_model(
    input_ids=input_ids,
    token_type_ids=input_type_ids,
    attention_mask=input_mask,
)

# Masked language modeling branch
mlm_logits = mlm_classifier(hidden_states=hidden_states)
t_mlm_loss = mlm_loss(
    logits=mlm_logits,
    output_ids=output_ids,
    output_mask=output_mask,
)

# Next sentence prediction branch
nsp_logits = nsp_classifier(hidden_states=hidden_states)
t_nsp_loss = nsp_loss(logits=nsp_logits, labels=nsp_labels)

# `loss` is the tensor handed to the trainer as tensors_to_optimize
loss = bert_loss(loss_1=t_mlm_loss, loss_2=t_nsp_loss)

In [None]:
# Evaluation DAG: same modules as the training DAG, fed by eval_data_layer.
# All evaluation tensors carry an `e_` prefix.
(e_input_ids, e_input_type_ids, e_input_mask,
 e_output_ids, e_output_mask, e_nsp_labels) = eval_data_layer()

e_hidden_states = bert_model(
    input_ids=e_input_ids,
    token_type_ids=e_input_type_ids,
    attention_mask=e_input_mask,
)

# Masked language modeling branch
e_mlm_logits = mlm_classifier(hidden_states=e_hidden_states)
e_mlm_loss = mlm_loss(
    logits=e_mlm_logits,
    output_ids=e_output_ids,
    output_mask=e_output_mask,
)

# Next sentence prediction branch
e_nsp_logits = nsp_classifier(hidden_states=e_hidden_states)
e_nsp_loss = nsp_loss(logits=e_nsp_logits, labels=e_nsp_labels)

# Aggregated evaluation loss
e_loss = bert_loss(loss_1=e_mlm_loss, loss_2=e_nsp_loss)

In [None]:
# Logger callback: prints the training loss with three decimals.
callback_loss = nemo.core.SimpleLossLoggerCallback(
    tensors=[loss],
    print_func=lambda values: print("Loss: {:.3f}".format(values[0].item())),
)

# Steps per epoch, derived from the size of the training data layer.
# If you're training on multiple GPUs, this should be
# train_data_size / (batch_size * batches_per_step * num_gpus)
train_data_size = len(train_data_layer)
steps_per_epoch = train_data_size // (BATCHES_PER_STEP * BATCH_SIZE)

# Evaluator callback on the two evaluation losses; eval_step is set to one
# epoch's worth of steps.
callback_test = nemo.core.EvaluatorCallback(
    eval_tensors=[e_mlm_loss, e_nsp_loss],
    user_iter_callback=eval_iter_callback,
    user_epochs_done_callback=eval_epochs_done_callback,
    eval_step=steps_per_epoch,
)

In [None]:
# Cosine-annealing learning-rate policy spanning every step of the run,
# with a warmup ratio of LR_WARMUP_PROPORTION.
total_steps = NUM_EPOCHS * steps_per_epoch
lr_policy = CosineAnnealing(total_steps, warmup_ratio=LR_WARMUP_PROPORTION)

# Start training. To train without the evaluator, pass
# callbacks=[callback_loss] instead.
neural_factory.train(
    tensors_to_optimize=[loss],
    lr_policy=lr_policy,
    callbacks=[callback_loss, callback_test],
    batches_per_step=BATCHES_PER_STEP,
    optimizer=OPTIMIZER,
    optimization_params={
        "batch_size": BATCH_SIZE,
        "num_epochs": NUM_EPOCHS,
        "lr": LEARNING_RATE,
        "betas": (0.95, 0.98),
        "grad_norm_clip": None,
    },
)