In [1]:
import math
import numpy as np
import os

import nemo
from nemo.utils.lr_policies import WarmupAnnealing

import nemo_nlp
from nemo_nlp import NemoBertTokenizer
from nemo_nlp.utils.callbacks.token_classification import \
    eval_iter_callback, eval_epochs_done_callback

# Filesystem locations -- replace both placeholders before running the notebook.
DATA_DIR = "PATH_TO_WHERE_THE_DATA_IS"
WORK_DIR = "PATH_TO_WHERE_TO_STORE_CHECKPOINTS_AND_LOGS"
# Name of the pretrained BERT model used for both the tokenizer and the encoder.
PRETRAINED_BERT_MODEL = "bert-base-cased"

# model parameters
BATCHES_PER_STEP = 1
BATCH_SIZE = 32
CLASSIFICATION_DROPOUT = 0.1
MAX_SEQ_LENGTH = 128
NUM_EPOCHS = 3
LEARNING_RATE = 0.00005
LR_WARMUP_PROPORTION = 0.1
OPTIMIZER = "adam"
STEP_FREQ = 200  # determines how often loss will be printed

# Label passed as `pad_label` to the data layers and compared against the
# predicted label during inference. It was referenced but never defined, which
# raised a NameError on a fresh kernel.
# NOTE(review): "O" is assumed to match the none/pad label used by the NeMo
# token classification tooling -- confirm against the labels that
# get_tatoeba_data.py writes.
NONE_LABEL = "O"

# Download and preprocess the data

In [9]:
# create DATA_DIR and download Tatoeba sentences
! mkdir $DATA_DIR
! wget -nc -O $DATA_DIR/sentences.csv https://downloads.tatoeba.org/exports/sentences.csv
# extract English sentences
! grep -P "\teng\t" $DATA_DIR/sentences.csv > $DATA_DIR/english_sentences.csv
! rm $DATA_DIR/sentences.csv

mkdir: cannot create directory ‘PATH_TO_WHERE_THE_DATA_IS’: File exists
--2019-12-10 14:19:27--  https://downloads.tatoeba.org/exports/sentences.csv
Resolving downloads.tatoeba.org (downloads.tatoeba.org)... 94.130.77.194
Connecting to downloads.tatoeba.org (downloads.tatoeba.org)|94.130.77.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 440346898 (420M) [application/octet-stream]
Saving to: ‘PATH_TO_WHERE_THE_DATA_IS/sentences.csv’


2019-12-10 14:19:52 (17.3 MB/s) - ‘PATH_TO_WHERE_THE_DATA_IS/sentences.csv’ saved [440346898/440346898]



In [None]:
# To preprocess the data run NeMo/scripts/get_tatoeba_data.py
# This can take a few minutes
# The relative path below assumes the notebook's working directory is two
# levels below the NeMo repository root -- adjust it if your layout differs.
# NOTE(review): later cells read text_train.txt / labels_train.txt and
# text_dev.txt / labels_dev.txt from DATA_DIR; presumably this script writes
# them -- confirm.

! python ../../scripts/get_tatoeba_data.py --num_lines_to_combine 5 --num_samples 15000 --data_dir $DATA_DIR

In [None]:
# Create the neural module factory used by every later cell (training,
# checkpointing, inference and logging all go through `nf`).
nf = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,

    # Single-GPU placement. For multi-GPU training this should be
    # nemo.core.DeviceType.AllGpu instead.
    placement=nemo.core.DeviceType.GPU,

    # For multi-GPU training, supply this value through something like
    # argparse; examples/nlp/token_classification.py shows how.
    local_rank=None,

    # For mixed precision training, switch this to the mxprO1 or mxprO2 level.
    # The levels are described at https://nvidia.github.io/apex/amp.html#opt-levels
    optimization_level="O0",

    # Directory in which results are stored.
    log_dir=WORK_DIR,
)

In [None]:
# Standard BERT checkpoints are loaded by name; both the encoder and its
# tokenizer are built from PRETRAINED_BERT_MODEL. The available names are
# listed by nemo_nlp.huggingface.BERT.list_pretrained_models().
bert_model = nemo_nlp.huggingface.BERT(
    pretrained_model_name=PRETRAINED_BERT_MODEL,
)
tokenizer = NemoBertTokenizer(
    pretrained_model=PRETRAINED_BERT_MODEL,
)

In [None]:
# Describe training DAG

# Data layer over text_train.txt / labels_train.txt located in DATA_DIR.
train_data_layer = nemo_nlp.BertTokenClassificationDataLayer(
    tokenizer=tokenizer,
    text_file=os.path.join(DATA_DIR, 'text_train.txt'),
    label_file=os.path.join(DATA_DIR, 'labels_train.txt'),
    max_seq_length=MAX_SEQ_LENGTH,
    batch_size=BATCH_SIZE,
    pad_label=NONE_LABEL,
)

# The label mapping comes from the training dataset; the evaluation data layer
# and the inference post-processing reuse it.
label_ids = train_data_layer.dataset.label_ids
num_classes = len(label_ids)
hidden_size = bert_model.local_parameters["hidden_size"]

# Classification head and loss, both sized from the encoder and the label set.
classifier = nemo_nlp.TokenClassifier(
    hidden_size=hidden_size,
    num_classes=num_classes,
    dropout=CLASSIFICATION_DROPOUT,
)
task_loss = nemo_nlp.TokenClassificationLoss(
    d_model=hidden_size,
    num_classes=num_classes,
    dropout=CLASSIFICATION_DROPOUT,
)

# Wire the graph: data layer -> BERT -> classifier -> loss.
# The fifth data-layer output is not needed for training and is discarded.
(input_ids, input_type_ids, input_mask,
 loss_mask, _, labels) = train_data_layer()

hidden_states = bert_model(
    input_ids=input_ids,
    token_type_ids=input_type_ids,
    attention_mask=input_mask,
)
logits = classifier(hidden_states=hidden_states)
loss = task_loss(logits=logits, labels=labels, loss_mask=loss_mask)

In [None]:
# Describe evaluation DAG

# Same data layer type as for training, but reading the dev files and reusing
# the label mapping taken from the training dataset.
eval_data_layer = nemo_nlp.BertTokenClassificationDataLayer(
    tokenizer=tokenizer,
    text_file=os.path.join(DATA_DIR, 'text_dev.txt'),
    label_file=os.path.join(DATA_DIR, 'labels_dev.txt'),
    max_seq_length=MAX_SEQ_LENGTH,
    batch_size=BATCH_SIZE,
    pad_label=NONE_LABEL,
    label_ids=label_ids,
)

# The fourth output (the loss mask in the training cell) is not used for
# evaluation and is discarded.
(eval_input_ids, eval_input_type_ids, eval_input_mask,
 _, eval_subtokens_mask, eval_labels) = eval_data_layer()

# The evaluation graph goes through the same bert_model and classifier
# instances as the training graph.
hidden_states = bert_model(input_ids=eval_input_ids,
                           token_type_ids=eval_input_type_ids,
                           attention_mask=eval_input_mask)

eval_logits = classifier(hidden_states=hidden_states)

# Create callbacks

In [None]:
# Callback to report the training loss; STEP_FREQ controls how often it prints.
callback_train = nemo.core.SimpleLossLoggerCallback(
    tensors=[loss],
    step_freq=STEP_FREQ,
    print_func=lambda x: print("Loss: {:.3f}".format(x[0].item())),
)

train_data_size = len(train_data_layer)

# On multiple GPUs the divisor should also include the GPU count:
# train_data_size / (batch_size * batches_per_step * num_gpus)
steps_per_epoch = int(train_data_size / (BATCH_SIZE * BATCHES_PER_STEP))

# Callback to evaluate the model; eval_step is set to steps_per_epoch.
callback_eval = nemo.core.EvaluatorCallback(
    eval_tensors=[eval_logits, eval_labels, eval_subtokens_mask],
    user_iter_callback=eval_iter_callback,
    user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, label_ids),
    eval_step=steps_per_epoch,
)

# Callback to store checkpoints in the factory's checkpoint directory.
ckpt_callback = nemo.core.CheckpointCallback(
    epoch_freq=1,
    folder=nf.checkpoint_dir,
)

# Training

In [None]:
# Learning-rate policy over NUM_EPOCHS * steps_per_epoch steps, with
# LR_WARMUP_PROPORTION passed as the warmup ratio.
total_steps = NUM_EPOCHS * steps_per_epoch
lr_policy = WarmupAnnealing(total_steps, warmup_ratio=LR_WARMUP_PROPORTION)

optimization_params = {
    "num_epochs": NUM_EPOCHS,
    "lr": LEARNING_RATE,
}

# Start training with the loss, evaluation and checkpoint callbacks attached.
nf.train(
    tensors_to_optimize=[loss],
    callbacks=[callback_train, callback_eval, ckpt_callback],
    lr_policy=lr_policy,
    batches_per_step=BATCHES_PER_STEP,
    optimizer=OPTIMIZER,
    optimization_params=optimization_params,
)

# Inference

In [None]:
# Define the list of queries for inference.
# They are written lower-case and without punctuation; the inference cells
# below add capitalization and punctuation according to the predicted labels.
queries = [
    'we bought four shirts from the nvidia gear store in santa clara',
    'tom sam and i are going to travel do you want to join',
    'nvidia is a company',
    'can i help you',
    'we bought four shirts one mug and ten thousand titan rtx graphics cards the more you buy the more you save',
]

In [None]:
# Inference DAG: the raw queries go through the same bert_model and classifier
# instances that were trained above.
infer_data_layer = nemo_nlp.BertTokenClassificationInferDataLayer(queries=queries,
                                                                  tokenizer=tokenizer,
                                                                  max_seq_length=MAX_SEQ_LENGTH,
                                                                  batch_size=1)
# The fourth output is not needed for inference and is discarded.
input_ids, input_type_ids, input_mask, _, subtokens_mask = infer_data_layer()

hidden_states = bert_model(input_ids=input_ids,
                           token_type_ids=input_type_ids,
                           attention_mask=input_mask)
logits = classifier(hidden_states=hidden_states)

# Restore weights from the directory the checkpoint callback writes to
# (nf.checkpoint_dir) rather than rebuilding that path by hand from WORK_DIR,
# so the save and load locations cannot drift apart.
evaluated_tensors = nf.infer(tensors=[logits, subtokens_mask],
                             checkpoint_dir=nf.checkpoint_dir)

In [None]:
# helper functions
def concatenate(lists):
    """Join a sequence of per-batch results into one NumPy array.

    Each element is first moved to host memory with ``.cpu()``; the results
    are then concatenated with ``np.concatenate`` along its default axis (0).

    Args:
        lists: iterable of objects exposing ``.cpu()`` (presumably torch
            tensors, one per batch -- verify against ``nf.infer`` output).

    Returns:
        The array produced by ``np.concatenate`` over all elements.
    """
    host_chunks = []
    for chunk in lists:
        host_chunks.append(chunk.cpu())
    return np.concatenate(host_chunks)

def get_preds(logits):
    """Return the index of the largest value along axis 1 of ``logits``.

    Note: nothing in the visible cells calls this helper; the inference code
    computes its predictions with ``np.argmax(logits, axis=2)`` directly.

    Args:
        logits: array-like with at least two dimensions.

    Returns:
        Array of argmax indices taken over axis 1.
    """
    best_index = np.argmax(logits, axis=1)
    return best_index

# Reverse the label -> id mapping so predicted ids can be turned back into labels.
ids_to_labels = {label_id: label for label, label_id in label_ids.items()}

# Merge the per-batch inference outputs into one array per tensor.
logits, subtokens_mask = (concatenate(batches) for batches in evaluated_tensors)

# Most likely class id for every position (argmax over the last of three axes).
preds = logits.argmax(axis=2)

for query_idx, query in enumerate(queries):
    nf.logger.info(f'Query: {query}')

    words = query.strip().split()
    # Keep only the positions selected by the sub-token mask; the length check
    # below requires exactly one prediction per whitespace-separated word.
    pred = preds[query_idx][subtokens_mask[query_idx] > 0.5]
    if len(words) != len(pred):
        raise ValueError('Pred and words must be of the same length')

    restored_words = []
    for word, label_id in zip(words, pred):
        label = ids_to_labels[label_id]

        if label != NONE_LABEL:
            # A 'U' anywhere in the label capitalizes the word.
            if 'U' in label:
                word = word.capitalize()
            # The first label character is appended to the word unless it is 'O'.
            if label[0] != 'O':
                word += label[0]

        restored_words.append(word)

    output = ' '.join(restored_words)
    nf.logger.info(f'Combined: {output}\n')