In [23]:
# Standard library
import os

# NeMo core
import nemo
# NeMo NLP collection
import nemo_nlp

In [14]:
# Create the neural module factory; a later cell asks it for the trainer
# via nf.get_trainer(...).
nf = nemo.core.NeuralModuleFactory()

In [15]:
# Directory holding the test data (the SentencePiece model and the en_de corpora).
# The original author's checkout remains the default, so existing runs behave
# exactly as before; set NEMO_TEST_DATA_DIR to point at another copy of the data.
# `or` (rather than a .get() default) also covers an empty environment variable.
# os.path.join(..., "") guarantees a trailing separator, because the cells that
# follow build file names by plain concatenation (dataset_dir + "en_de/...").
dataset_dir = os.path.join(
    os.environ.get("NEMO_TEST_DATA_DIR")
    or "/home/okuchaiev/repos/gitlab-master/nemo/tests/data/",
    "",
)
# A single tokenizer instance; later cells pass it as both the source-side and
# the target-side tokenizer.
tokenizer = nemo_nlp.data.SentencePieceTokenizer(dataset_dir + "m_common.model")

In [16]:
# Training data layer: the source side reads en_de/train.de and the target side
# reads en_de/train.en, both tokenized with the shared tokenizer.
# clean=True is used for the training set only (the evaluation layer passes False).
train_dl = nemo_nlp.data.TranslationDataLayer(
    dataset_src=dataset_dir + "en_de/train.de",
    dataset_tgt=dataset_dir + "en_de/train.en",
    tokenizer_src=tokenizer,
    tokenizer_tgt=tokenizer,
    tokens_in_batch=4096,
    clean=True,
)

Tokenizing dataset ...
Tokenizing dataset ...


In [17]:
# Evaluation data layer: same construction as the training layer, but it reads
# the en_de/test.* files and passes clean=False.
eval_dl = nemo_nlp.data.TranslationDataLayer(
    dataset_src=dataset_dir + "en_de/test.de",
    dataset_tgt=dataset_dir + "en_de/test.en",
    tokenizer_src=tokenizer,
    tokenizer_tgt=tokenizer,
    tokens_in_batch=4096,
    clean=False,
)

Tokenizing dataset ...
Tokenizing dataset ...


In [18]:
# --- Hyperparameters shared by the encoder and the decoder ---
d_model = 512
d_inner = 2048
num_layers = 6
num_heads = 8
dp = 0.1            # used for both fully-connected and embedding dropout
max_seq_len = 256

# --- Transformer encoder ---
t_encoder = nemo_nlp.TransformerEncoderNM(
    vocab_size=tokenizer.vocab_size,
    max_seq_length=max_seq_len,
    d_model=d_model,
    d_inner=d_inner,
    num_layers=num_layers,
    num_attn_heads=num_heads,
    embedding_dropout=dp,
    fully_connected_dropout=dp,
)

# --- Transformer decoder (constructed with the same settings as the encoder) ---
t_decoder = nemo_nlp.TransformerDecoderNM(
    vocab_size=tokenizer.vocab_size,
    max_seq_length=max_seq_len,
    d_model=d_model,
    d_inner=d_inner,
    num_layers=num_layers,
    num_attn_heads=num_heads,
    embedding_dropout=dp,
    fully_connected_dropout=dp,
)

# --- Output layer producing log-probabilities over the tokenizer vocabulary ---
t_log_softmax = nemo_nlp.TransformerLogSoftmaxNM(
    d_model=d_model,
    vocab_size=tokenizer.vocab_size,
)

# --- Beam-search translator wrapping the three modules above ---
# The special-token ids are hard-coded here; pad_token=0 matches the pad_id
# given to the losses below.
beam_translator = nemo_nlp.BeamSearchTranslatorNM(
    encoder=t_encoder,
    decoder=t_decoder,
    log_softmax=t_log_softmax,
    max_seq_length=max_seq_len,
    beam_size=4,
    length_penalty=0.0,
    bos_token=2,
    pad_token=0,
    eos_token=1,
)

# --- Losses: label smoothing 0.1 for training, none for evaluation ---
t_loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(
    pad_id=0,
    smoothing=0.1,
)
t_loss_eval = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(
    pad_id=0,
    smoothing=0.0,
)



In [22]:
# Instantiate a BERT module sized to the tokenizer vocabulary.
# NOTE(review): `bert` is not referenced by any other cell visible in this
# notebook, and this cell's execution count (22) is out of sequence with its
# neighbours -- presumably a leftover experiment; confirm before removing.
bert = nemo_nlp.huggingface.BERT(vocab_size=tokenizer.vocab_size)

In [19]:
# Weight tying: both the output projection of the log-softmax layer and the
# decoder's word embedding are tied to the encoder's word-embedding weight.
# NOTE(review): the trailing 0 in each tuple is the "transform" slot of
# name2name_and_transform -- its exact meaning should be confirmed in NeMo.

# 1) log-softmax projection <- encoder word embedding
t_log_softmax.tie_weights_with(
    t_encoder,
    name2name_and_transform={
        "log_softmax.dense.weight": ("embeddings.word_embedding.weight", 0),
    },
    weight_names=["log_softmax.dense.weight"],
)

# 2) decoder word embedding <- encoder word embedding
t_decoder.tie_weights_with(
    t_encoder,
    name2name_and_transform={
        "embeddings.word_embedding.weight": ("embeddings.word_embedding.weight", 0),
    },
    weight_names=["embeddings.word_embedding.weight"],
)


In [20]:
# Training graph:
#   data layer -> encoder -> decoder -> log-softmax -> label-smoothed loss
src, src_mask, tgt, tgt_mask, labels, sent_ids = train_dl()

src_hiddens = t_encoder(
    input_ids=src,
    input_mask_src=src_mask,
)
tgt_hiddens = t_decoder(
    input_ids_tgt=tgt,
    hidden_states_src=src_hiddens,
    input_mask_src=src_mask,
    input_mask_tgt=tgt_mask,
)
log_softmax = t_log_softmax(hidden_states=tgt_hiddens)

# train_loss is the tensor handed to the trainer as the optimization target.
train_loss = t_loss(
    log_probs=log_softmax,
    target_ids=labels,
)


In [9]:
# Evaluation graph: the same modules as in training, fed from eval_dl.
# Trailing underscores distinguish these tensors from the training ones.
src_, src_mask_, tgt_, tgt_mask_, labels_, sent_ids_ = eval_dl()

src_hiddens_ = t_encoder(
    input_ids=src_,
    input_mask_src=src_mask_,
)
tgt_hiddens_ = t_decoder(
    input_ids_tgt=tgt_,
    hidden_states_src=src_hiddens_,
    input_mask_src=src_mask_,
    input_mask_tgt=tgt_mask_,
)
log_softmax_ = t_log_softmax(hidden_states=tgt_hiddens_)

# Evaluation loss uses the loss module configured without label smoothing.
eval_loss = t_loss_eval(
    log_probs=log_softmax_,
    target_ids=labels_,
)

# Beam search runs on the encoder output to produce the translations.
beam_trans = beam_translator(
    hidden_states_src=src_hiddens_,
    input_mask_src=src_mask_,
)


In [10]:
# Training callback: every 100 steps, log the first tensor's scalar value as text.
callback = nemo.core.SimpleLossLoggerCallback(
    step_frequency=100,
    tensor_list2string=lambda tensors: str(tensors[0].item()),
)

# Evaluation callback: computes the evaluation loss (no label smoothing) and the
# SacreBLEU score between beam-search outputs and the reference translations.
from nemo_nlp.callbacks.translation import (
    eval_iter_callback,
    eval_epochs_done_callback,
)

callback_dev = nemo.core.EvaluatorCallback(
    eval_tensors=[tgt_, eval_loss, beam_trans, sent_ids_],
    # The per-iteration hook additionally needs the tokenizer, hence the lambda.
    user_iter_callback=lambda x, y: eval_iter_callback(x, y, tokenizer),
    # The end-of-evaluation hook takes one argument, so it is passed directly.
    user_epochs_done_callback=eval_epochs_done_callback,
    eval_step=50,
)


In [11]:
from nemo.utils.lr_policies import CosineAnnealing
# Learning-rate policy handed to optimizer.train(...) in the training cell.
# It is built with 10000 as its first positional argument and 500 warm-up steps.
# NOTE(review): 10000 is presumably the total number of training steps --
# confirm against the CosineAnnealing signature and the num_epochs setting.
lr_policy = CosineAnnealing(10000, warmup_steps=500)

In [12]:
# Obtain a NovoGrad trainer from the factory, then start training.
optimizer = nf.get_trainer(
    params={
        "optimizer_kind": "novograd",
        "optimization_params": {
            "num_epochs": 40,
            "lr": 0.01,
            "weight_decay": 0.0,
        },
    },
)

# Optimize the training loss; the two callbacks handle loss logging and
# periodic evaluation. No extra tensors are evaluated by the trainer itself.
optimizer.train(
    tensors_to_optimize=[train_loss],
    tensors_to_evaluate=[],
    lr_policy=lr_policy,
    callbacks=[callback, callback_dev],
)



Starting .....
Starting epoch 0
Step: 0
Train Loss: 10.435689926147461
Step time: 0.37077879905700684 seconds
Doing Evaluation .................................
Ground truth: "The greatest enemies of the press freedom are not evil and wicked politicians, but bad journalists depending on profit, blackmail and extortion" he said.

Translation:  ChinaChinaChinaChina BronChinaChina BronChinaChinaChina BronChina Bron Bron BronChina Bron Bron Bron Bron Bron Bron BronChina Bron Bron Bron Bron Bron Bron Bron Bron Bron Bronstruction Bron unreststruction Bron unreststruction Bron unrest unreststruction Bron

Ground truth: A bus full of white people, who sing songs in a black language - this degree of recognition brings not only morale and joy, but some grim-faced border soldiers even shed a few tears.

Translation:  Support Support Bron Support Bron Support Bron Support Bron analy analy analy analy analy analy analy analy analy analy analy analy analy analy analy analy analy analy analy analy Su

KeyboardInterrupt: 