diff --git a/CHANGELOG.md b/CHANGELOG.md index 105d67e44600..a9285d147fec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -85,6 +85,13 @@ files, along with unit tests, examples and tutorials ([PR #375](https://github.com/NVIDIA/NeMo/pull/375)) - @titu1994 ### Changed +- Refactoring of `nemo_nlp` collections: +([PR #368](https://github.com/NVIDIA/NeMo/pull/368)) - @VahidooX, @yzhang123, @ekmb + - renaming and restructuring of files, folder, and functions in `nemo_nlp` + - losses cleaned up. LossAggregatorNM moved to nemo/backends/pytorch/common/losses + ([PR #316](https://github.com/NVIDIA/NeMo/pull/316)) - @VahidooX, @yzhang123, @ekmb + - renaming and restructuring of files, folder, and functions in `nemo_nlp` + - Updated licenses - All collections changed to use New Neural Type System. ([PR #307](https://github.com/NVIDIA/NeMo/pull/307)) - @okuchaiev - Additional Collections Repositories merged into core `nemo_toolkit` package. @@ -95,10 +102,6 @@ files, along with unit tests, examples and tutorials ([PR #286](https://github.com/NVIDIA/NeMo/pull/286)) - @stasbel - Major cleanup of Neural Module constructors (init), aiming at increasing the framework robustness: cleanup of NeuralModule initialization logic, refactor of trainer/actions (getting rid of local_params), fixes of several examples and unit tests, extraction and storing of intial parameters (init_params). ([PR #309](https://github.com/NVIDIA/NeMo/pull/309)) - @tkornuta-nvidia -- Refactoring of `nemo_nlp` collections: -([PR #316](https://github.com/NVIDIA/NeMo/pull/316)) - @VahidooX, @yzhang123, @ekmb - - renaming of files and restructuring of folder in `nemo_nlp` - - Updated licenses - Updated nemo's use of the logging library. from nemo import logging is now the reccomended way of using the nemo logger. neural_factory.logger and all other instances of logger are now deprecated and planned for removal in the next version. Please see PR 267 for complete change information. 
([PR #267](https://github.com/NVIDIA/NeMo/pull/267), [PR #283](https://github.com/NVIDIA/NeMo/pull/283), [PR #305](https://github.com/NVIDIA/NeMo/pull/305), [PR #311](https://github.com/NVIDIA/NeMo/pull/311)) - @blisc - Changed Distributed Data Parallel from Apex to Torch diff --git a/Jenkinsfile b/Jenkinsfile index 5512971ba8d1..8e6955647162 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -91,19 +91,21 @@ pipeline { parallel { stage ('Text Classification with BERT Test') { steps { - sh 'cd examples/nlp/text_classification && CUDA_VISIBLE_DEVICES=0 python text_classification_with_bert.py --num_epochs=1 --max_seq_length=50 --dataset_name=jarvis --data_dir=/home/TestData/nlp/retail/ --eval_file_prefix=eval --batch_size=10 --num_train_samples=-1 --do_lower_case --shuffle_data --work_dir=outputs' + sh 'cd examples/nlp/text_classification && CUDA_VISIBLE_DEVICES=0 python text_classification_with_bert.py --pretrained_bert_model bert-base-uncased --num_epochs=1 --max_seq_length=50 --dataset_name=jarvis --data_dir=/home/TestData/nlp/retail/ --eval_file_prefix=eval --batch_size=10 --num_train_samples=-1 --do_lower_case --shuffle_data --work_dir=outputs' sh 'rm -rf examples/nlp/text_classification/outputs' } } stage ('Dialogue State Tracking - TRADE - Multi-GPUs') { steps { + sh 'rm -rf /home/TestData/nlp/multiwoz2.1/vocab.pkl' sh 'cd examples/nlp/dialogue_state_tracking && CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 dialogue_state_tracking_trade.py --batch_size=10 --eval_batch_size=10 --num_train_samples=-1 --num_eval_samples=-1 --num_epochs=1 --dropout=0.2 --eval_file_prefix=test --shuffle_data --num_gpus=2 --lr=0.001 --grad_norm_clip=10 --work_dir=outputs --data_dir=/home/TestData/nlp/multiwoz2.1' sh 'rm -rf examples/nlp/dialogue_state_tracking/outputs' + sh 'rm -rf /home/TestData/nlp/multiwoz2.1/vocab.pkl' } } stage ('GLUE Benchmark Test') { steps { - sh 'cd examples/nlp/glue_benchmark && CUDA_VISIBLE_DEVICES=1 python glue_benchmark_with_bert.py --data_dir /home/TestData/nlp/glue_fake/MRPC --work_dir glue_output --save_step_freq -1 --num_epochs 1 --task_name mrpc --batch_size 2' + sh 'cd examples/nlp/glue_benchmark && CUDA_VISIBLE_DEVICES=1 python glue_benchmark_with_bert.py --data_dir /home/TestData/nlp/glue_fake/MRPC --pretrained_bert_model bert-base-uncased --work_dir glue_output --save_step_freq -1 --num_epochs 1 --task_name mrpc --batch_size 2' sh 'rm -rf examples/nlp/glue_benchmark/glue_output' } } @@ -122,8 +124,8 @@ pipeline { parallel { stage('Token Classification Training/Inference Test') { steps { - sh 'cd examples/nlp/token_classification && CUDA_VISIBLE_DEVICES=0 python token_classification.py --data_dir /home/TestData/nlp/token_classification_punctuation/ --batch_size 2 --num_epochs 1 --save_epoch_freq 1 --work_dir token_classification_output --pretrained_bert_model bert-base-cased' - sh 'cd examples/nlp/token_classification && DATE_F=$(ls token_classification_output/) && CUDA_VISIBLE_DEVICES=0 python token_classification_infer.py --work_dir token_classification_output/$DATE_F/checkpoints/ --labels_dict /home/TestData/nlp/token_classification_punctuation/label_ids.csv --pretrained_bert_model bert-base-cased' + sh 'cd examples/nlp/token_classification && CUDA_VISIBLE_DEVICES=0 python token_classification.py --data_dir /home/TestData/nlp/token_classification_punctuation/ --batch_size 2 --num_epochs 1 --save_epoch_freq 1 --work_dir token_classification_output --pretrained_bert_model bert-base-uncased' + sh 'cd examples/nlp/token_classification && 
DATE_F=$(ls token_classification_output/) && CUDA_VISIBLE_DEVICES=0 python token_classification_infer.py --work_dir token_classification_output/$DATE_F/checkpoints/ --labels_dict /home/TestData/nlp/token_classification_punctuation/label_ids.csv --pretrained_bert_model bert-base-uncased' sh 'rm -rf examples/nlp/token_classification/token_classification_output' } } diff --git a/docs/sources/source/nlp/bert_pretraining.rst b/docs/sources/source/nlp/bert_pretraining.rst index 389f6a307466..4b19ec9f512c 100644 --- a/docs/sources/source/nlp/bert_pretraining.rst +++ b/docs/sources/source/nlp/bert_pretraining.rst @@ -191,7 +191,7 @@ For training from raw text use nemo_nlp.BertPretrainingDataLayer, for preprocess mlm_logits = mlm_classifier(hidden_states=hidden_states) mlm_loss = mlm_loss_fn(logits=mlm_logits, - output_ids=input_data.output_ids, + labels=input_data.output_ids, output_mask=input_data.output_mask) nsp_logits = nsp_classifier(hidden_states=hidden_states) diff --git a/docs/sources/source/nlp/joint_intent_slot_filling.rst b/docs/sources/source/nlp/joint_intent_slot_filling.rst index 57b82629b0be..29f0fd954806 100644 --- a/docs/sources/source/nlp/joint_intent_slot_filling.rst +++ b/docs/sources/source/nlp/joint_intent_slot_filling.rst @@ -3,9 +3,9 @@ Tutorial In this tutorial, we are going to implement a joint intent and slot filling system with pretrained BERT model based on `BERT for Joint Intent Classification and Slot Filling `_ :cite:`nlp-slot-chen2019bert`. -All code used in this tutorial is based on ``examples/nlp/joint_intent_slot_with_bert.py``. +All code used in this tutorial is based on ``examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py``. -There are four pre-trained BERT models that we can select from using the argument `--pretrained_bert_model`. We're currently +There are a variety of pre-trained BERT models that we can select from using the argument `--pretrained_bert_model`. We're currently using the script for loading pre-trained models from `pytorch_transformers`. See the list of available pre-trained models `here `__. 
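The joint intent/slot tutorial referenced above now builds its loss from the refactored common loss modules instead of the removed JointIntentSlotLoss. A minimal sketch of the new wiring, mirroring the updated example script below and assuming that intent_logits, slot_logits, intents, slots, loss_mask, and the intent_loss_weight hyperparameter come from the example's existing data layer, classifier, and argument parser:

from nemo.backends.pytorch.common.losses import CrossEntropyLossNM, LossAggregatorNM

# Intent loss works on (batch, num_intents) logits; slot loss on (batch, time, num_slots) logits.
intent_loss_fn = CrossEntropyLossNM(logits_dim=2)
slot_loss_fn = CrossEntropyLossNM(logits_dim=3)

# Weighted sum of the two losses, trading intent accuracy against slot accuracy.
total_loss_fn = LossAggregatorNM(num_inputs=2, weights=[intent_loss_weight, 1.0 - intent_loss_weight])

intent_loss = intent_loss_fn(logits=intent_logits, labels=intents)
slot_loss = slot_loss_fn(logits=slot_logits, labels=slots, loss_mask=loss_mask)
total_loss = total_loss_fn(loss_1=intent_loss, loss_2=slot_loss)

The aggregated total_loss then replaces the single loss tensor in the training callbacks and in the pipeline's return value.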
diff --git a/examples/nlp/asr_postprocessor/asr_postprocessor.py b/examples/nlp/asr_postprocessor/asr_postprocessor.py index 187529ddd2e4..28744c9bcc7a 100644 --- a/examples/nlp/asr_postprocessor/asr_postprocessor.py +++ b/examples/nlp/asr_postprocessor/asr_postprocessor.py @@ -113,7 +113,7 @@ args.d_model, num_classes=vocab_size, num_layers=1, log_softmax=True ) -loss_fn = nemo_nlp.nm.losses.PaddedSmoothedCrossEntropyLossNM(pad_id=tokenizer.pad_id, label_smoothing=0.1) +loss_fn = nemo_nlp.nm.losses.SmoothedCrossEntropyLoss(pad_id=tokenizer.pad_id, label_smoothing=0.1) beam_search = nemo_nlp.nm.trainables.BeamSearchTranslatorNM( decoder=decoder, @@ -174,7 +174,7 @@ def create_pipeline(dataset, tokens_in_batch, clean=False, training=True): input_ids_tgt=tgt, hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask ) log_softmax = t_log_softmax(hidden_states=tgt_hiddens) - loss = loss_fn(logits=log_softmax, target_ids=labels) + loss = loss_fn(logits=log_softmax, labels=labels) beam_results = None if not training: beam_results = beam_search(hidden_states_src=src_hiddens, input_mask_src=src_mask) diff --git a/examples/nlp/dialogue_state_tracking/dialogue_state_tracking_trade.py b/examples/nlp/dialogue_state_tracking/dialogue_state_tracking_trade.py index 9fc7a6ca7f29..b3e9b75195a8 100644 --- a/examples/nlp/dialogue_state_tracking/dialogue_state_tracking_trade.py +++ b/examples/nlp/dialogue_state_tracking/dialogue_state_tracking_trade.py @@ -25,12 +25,14 @@ import numpy as np +import nemo.backends.pytorch as nemo_backend +import nemo.backends.pytorch.common.losses import nemo.collections.nlp as nemo_nlp import nemo.core as nemo_core from nemo import logging from nemo.backends.pytorch.common import EncoderRNN from nemo.collections.nlp.callbacks.state_tracking_trade_callback import eval_epochs_done_callback, eval_iter_callback -from nemo.collections.nlp.data.datasets.state_tracking_trade_dataset import MultiWOZDataDesc +from nemo.collections.nlp.data.datasets.multiwoz_dataset import MultiWOZDataDesc from nemo.utils.lr_policies import get_lr_policy parser = argparse.ArgumentParser(description='Dialog state tracking with TRADE model on MultiWOZ dataset') @@ -97,9 +99,9 @@ teacher_forcing=args.teacher_forcing, ) -gate_loss_fn = nemo_nlp.nm.losses.CrossEntropyLoss3D(num_classes=len(data_desc.gating_dict)) -ptr_loss_fn = nemo_nlp.nm.losses.TRADEMaskedCrossEntropy() -total_loss_fn = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2) +gate_loss_fn = nemo_backend.losses.CrossEntropyLossNM(logits_dim=3) +ptr_loss_fn = nemo_nlp.nm.losses.MaskedXEntropyLoss() +total_loss_fn = nemo.backends.pytorch.common.losses.LossAggregatorNM(num_inputs=2) def create_pipeline(num_samples, batch_size, num_gpus, input_dropout, data_prefix, is_training): @@ -142,7 +144,7 @@ def create_pipeline(num_samples, batch_size, num_gpus, input_dropout, data_prefi ) gate_loss = gate_loss_fn(logits=gate_outputs, labels=gate_labels) - ptr_loss = ptr_loss_fn(logits=point_outputs, targets=tgt_ids, loss_mask=tgt_lens) + ptr_loss = ptr_loss_fn(logits=point_outputs, labels=tgt_ids, length_mask=tgt_lens) total_loss = total_loss_fn(loss_1=gate_loss, loss_2=ptr_loss) if is_training: diff --git a/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py b/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py index 7b90c132a506..15b119f060cb 100644 --- a/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py +++ b/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py @@ -68,7 +68,7 @@ import nemo.collections.nlp as 
nemo_nlp import nemo.core as nemo_core from nemo import logging -from nemo.backends.pytorch.common import CrossEntropyLoss, MSELoss +from nemo.backends.pytorch.common import CrossEntropyLossNM, MSELoss from nemo.collections.nlp.callbacks.glue_benchmark_callback import eval_epochs_done_callback, eval_iter_callback from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import output_modes, processors @@ -231,7 +231,7 @@ else: model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) model.restore_from(args.bert_checkpoint) - logging.info(f"model resotred from {args.bert_checkpoint}") + logging.info(f"model restored from {args.bert_checkpoint}") hidden_size = model.hidden_size @@ -241,7 +241,7 @@ glue_loss = MSELoss() else: pooler = SequenceClassifier(hidden_size=hidden_size, num_classes=num_labels, log_softmax=False) - glue_loss = CrossEntropyLoss() + glue_loss = CrossEntropyLossNM() def create_pipeline( @@ -260,8 +260,6 @@ def create_pipeline( processor=processor, evaluate=evaluate, batch_size=batch_size, - # num_workers=0, - # local_rank=local_rank, tokenizer=tokenizer, data_dir=args.data_dir, max_seq_length=max_seq_length, diff --git a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py index 196a0e492055..81fdfad719a3 100644 --- a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py +++ b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py @@ -23,7 +23,7 @@ import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm from nemo import logging -from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc # Parsing arguments parser = argparse.ArgumentParser(description='Joint-intent BERT') diff --git a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py index 84ab723c94a8..3c37c04a6685 100644 --- a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py +++ b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py @@ -21,8 +21,8 @@ import nemo.collections.nlp as nemo_nlp import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm -from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc -from nemo.collections.nlp.utils.common_nlp_utils import read_intent_slot_outputs +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.inference_utils import read_intent_slot_outputs # Parsing arguments parser = argparse.ArgumentParser(description='Joint-intent BERT') diff --git a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py index 0cbdb08f72cc..579d9ccc340d 100644 --- a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py +++ b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py @@ -21,16 +21,18 @@ import numpy as np from transformers import BertTokenizer +import nemo import nemo.collections.nlp as nemo_nlp -import 
nemo.collections.nlp.nm.data_layers.joint_intent_slot_datalayer -import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm from nemo import logging +from nemo.backends.pytorch.common.losses import CrossEntropyLossNM, LossAggregatorNM from nemo.collections.nlp.callbacks.joint_intent_slot_callback import eval_epochs_done_callback, eval_iter_callback -from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc +from nemo.collections.nlp.nm.data_layers import BertJointIntentSlotDataLayer +from nemo.core import CheckpointCallback, SimpleLossLoggerCallback from nemo.utils.lr_policies import get_lr_policy # Parsing arguments -parser = argparse.ArgumentParser(description='Joint intent slot filling system with pretrained BERT') +parser = argparse.ArgumentParser(description='Joint intent detection and slot filling with pre-trained BERT') parser.add_argument("--local_rank", default=None, type=int) parser.add_argument("--batch_size", default=128, type=int) parser.add_argument("--max_seq_length", default=50, type=int) @@ -87,12 +89,10 @@ nemo_nlp.huggingface.BERT.list_pretrained_models() """ if args.bert_checkpoint and args.bert_config: - pretrained_bert_model = nemo.collections.nlp.nm.trainables.huggingface.BERT(config_filename=args.bert_config) + pretrained_bert_model = nemo_nlp.nm.trainables.huggingface.BERT(config_filename=args.bert_config) pretrained_bert_model.restore_from(args.bert_checkpoint) else: - pretrained_bert_model = nemo.collections.nlp.nm.trainables.huggingface.BERT( - pretrained_model_name=args.pretrained_bert_model - ) + pretrained_bert_model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) hidden_size = pretrained_bert_model.hidden_size @@ -101,31 +101,29 @@ ) # Create sentence classification loss on top -classifier = nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm.JointIntentSlotClassifier( +classifier = nemo_nlp.nm.trainables.JointIntentSlotClassifier( hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots, dropout=args.fc_dropout ) if args.class_balancing == 'weighted_loss': - # Using weighted loss will enable weighted loss for both intents and slots - # Use the intent_loss_weight hyperparameter to adjust intent loss to - # prevent overfitting or underfitting. 
- loss_fn = nemo_nlp.nm.losses.JointIntentSlotLoss( - num_slots=data_desc.num_slots, - slot_classes_loss_weights=data_desc.slot_weights, - intent_classes_loss_weights=data_desc.intent_weights, - intent_loss_weight=args.intent_loss_weight, - ) + # To tackle imbalanced classes, you may use weighted loss + intent_loss_fn = CrossEntropyLossNM(logits_dim=2, weight=data_desc.intent_weights) + slot_loss_fn = CrossEntropyLossNM(logits_dim=3, weight=data_desc.intent_weights) + else: - loss_fn = nemo_nlp.nm.losses.JointIntentSlotLoss(num_slots=data_desc.num_slots) + intent_loss_fn = CrossEntropyLossNM(logits_dim=2) + slot_loss_fn = CrossEntropyLossNM(logits_dim=3) + +total_loss_fn = LossAggregatorNM(num_inputs=2, weights=[args.intent_loss_weight, 1.0 - args.intent_loss_weight]) -def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mode='train'): +def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, mode='train'): logging.info(f"Loading {mode} data...") data_file = f'{data_desc.data_dir}/{mode}.tsv' slot_file = f'{data_desc.data_dir}/{mode}_slots.tsv' shuffle = args.shuffle_data if mode == 'train' else False - data_layer = nemo.collections.nlp.nm.data_layers.joint_intent_slot_datalayer.BertJointIntentSlotDataLayer( + data_layer = BertJointIntentSlotDataLayer( input_file=data_file, slot_file=slot_file, pad_label=data_desc.pad_label, @@ -155,35 +153,27 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mod intent_logits, slot_logits = classifier(hidden_states=hidden_states) - loss = loss_fn( - intent_logits=intent_logits, slot_logits=slot_logits, loss_mask=loss_mask, intents=intents, slots=slots - ) + intent_loss = intent_loss_fn(logits=intent_logits, labels=intents) + slot_loss = slot_loss_fn(logits=slot_logits, labels=slots, loss_mask=loss_mask) + total_loss = total_loss_fn(loss_1=intent_loss, loss_2=slot_loss) if mode == 'train': - tensors_to_evaluate = [loss, intent_logits, slot_logits] + tensors_to_evaluate = [total_loss, intent_logits, slot_logits] else: tensors_to_evaluate = [intent_logits, slot_logits, intents, slots, subtokens_mask] - return tensors_to_evaluate, loss, steps_per_epoch, data_layer + return tensors_to_evaluate, total_loss, steps_per_epoch, data_layer train_tensors, train_loss, steps_per_epoch, _ = create_pipeline( - args.num_train_samples, - batch_size=args.batch_size, - num_gpus=args.num_gpus, - local_rank=args.local_rank, - mode=args.train_file_prefix, + args.num_train_samples, batch_size=args.batch_size, num_gpus=args.num_gpus, mode=args.train_file_prefix, ) eval_tensors, _, _, data_layer = create_pipeline( - args.num_eval_samples, - batch_size=args.batch_size, - num_gpus=args.num_gpus, - local_rank=args.local_rank, - mode=args.eval_file_prefix, + args.num_eval_samples, batch_size=args.batch_size, num_gpus=args.num_gpus, mode=args.eval_file_prefix, ) # Create callbacks for train and eval modes -train_callback = nemo.core.SimpleLossLoggerCallback( +train_callback = SimpleLossLoggerCallback( tensors=train_tensors, print_func=lambda x: str(np.round(x[0].item(), 3)), tb_writer=nf.tb_writer, @@ -200,7 +190,7 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mod ) # Create callback to save checkpoints -ckpt_callback = nemo.core.CheckpointCallback( +ckpt_callback = CheckpointCallback( folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) diff --git a/examples/nlp/language_modeling/BERTPretrainingTutorial.ipynb 
b/examples/nlp/language_modeling/BERTPretrainingTutorial.ipynb index b46a87a77079..157fd50bb208 100644 --- a/examples/nlp/language_modeling/BERTPretrainingTutorial.ipynb +++ b/examples/nlp/language_modeling/BERTPretrainingTutorial.ipynb @@ -7,6 +7,15 @@ "### Step 1 Download and prepare data" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR = 'PATH_TO_THE_DATA_DIR'" + ] + }, { "cell_type": "code", "execution_count": null, @@ -15,7 +24,7 @@ "source": [ "# This example is for demonstration purposes\n", "# Please refer to the corresponding NLP tutorial on NeMo documentation\n", - "! ../scripts/get_wkt2.sh" + "! ../scripts/get_wkt2.sh $DATA_DIR" ] }, { @@ -25,7 +34,7 @@ "outputs": [], "source": [ "# verify data is there \n", - "! ls -l data/lm/wikitext-2" + "! ls -l $DATA_DIR/wikitext-2" ] }, { @@ -35,7 +44,7 @@ "outputs": [], "source": [ "# Prepare tokenization model\n", - "! python ../scripts/create_vocab.py --train_path=data/lm/wikitext-2/train.txt" + "! python ../scripts/create_vocab.py --train_path=$DATA_DIR/wikitext-2/train.txt" ] }, { @@ -155,7 +164,7 @@ " num_classes=tokenizer.vocab_size,\n", " activation=HIDDEN_ACT,\n", " log_softmax=True)\n", - "mlm_loss = nemo_nlp.nm.losses.MaskedLanguageModelingLossNM()\n", + "mlm_loss = nemo_nlp.nm.losses.SmoothedCrossEntropyLoss()\n", "\n", "# Next Sentence Prediciton Loss\n", "nsp_classifier = nemo_nlp.nm.trainables.SequenceClassifier(D_MODEL,\n", @@ -163,7 +172,7 @@ " num_layers=2,\n", " activation='tanh',\n", " log_softmax=False)\n", - "nsp_loss = nemo.backends.pytorch.common.CrossEntropyLoss()\n", + "nsp_loss = nemo.backends.pytorch.common.CrossEntropyLossNM()\n", "\n", "bert_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2)" ] @@ -174,10 +183,9 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", "train_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n", " tokenizer=tokenizer,\n", - " dataset=os.path.join(\"data/lm/wikitext-2\", \"train.txt\"),\n", + " dataset=os.path.join(DATA_DIR, \"wikitext-2\", \"train.txt\"),\n", " max_seq_length=MAX_SEQ_LENGTH,\n", " mask_probability=MASK_PROBABILITY,\n", " batch_size=BATCH_SIZE\n", @@ -185,7 +193,7 @@ "\n", "eval_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n", " tokenizer=tokenizer,\n", - " dataset=os.path.join(\"data/lm/wikitext-2\", \"valid.txt\"),\n", + " dataset=os.path.join(DATA_DIR, \"wikitext-2\", \"valid.txt\"),\n", " max_seq_length=MAX_SEQ_LENGTH,\n", " mask_probability=MASK_PROBABILITY,\n", " batch_size=BATCH_SIZE_EVAL\n", @@ -213,7 +221,7 @@ " attention_mask=input_data.input_mask)\n", "\n", "mlm_logits = mlm_classifier(hidden_states=hidden_states)\n", - "t_mlm_loss = mlm_loss(logits=mlm_logits, output_ids=input_data.output_ids, output_mask=input_data.output_mask)\n", + "t_mlm_loss = mlm_loss(logits=mlm_logits, labels=input_data.output_ids, output_mask=input_data.output_mask)\n", "\n", "nsp_logits = nsp_classifier(hidden_states=hidden_states)\n", "t_nsp_loss = nsp_loss(logits=nsp_logits, labels=input_data.labels)\n", @@ -235,7 +243,7 @@ " attention_mask=input_data_eval.input_mask)\n", "\n", "e_mlm_logits = mlm_classifier(hidden_states=e_hidden_states)\n", - "e_mlm_loss = mlm_loss(logits=e_mlm_logits, output_ids=input_data_eval.output_ids, output_mask=input_data_eval.output_mask)\n", + "e_mlm_loss = mlm_loss(logits=e_mlm_logits, labels=input_data_eval.output_ids, output_mask=input_data_eval.output_mask)\n", "\n", "e_nsp_logits = 
nsp_classifier(hidden_states=e_hidden_states)\n", "e_nsp_loss = nsp_loss(logits=e_nsp_logits, labels=input_data_eval.labels)\n", @@ -270,17 +278,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[NeMo I 2020-02-12 12:08:08 callbacks:196] Step: 300\n", - "Loss: 6.991\n", - "[NeMo I 2020-02-12 12:08:08 callbacks:211] Step time: 0.13242316246032715 seconds\n" - ] - } - ], + "outputs": [], "source": [ "lr_policy = CosineAnnealing(NUM_EPOCHS * steps_per_epoch,\n", " warmup_ratio=LR_WARMUP_PROPORTION)\n", @@ -299,6 +297,13 @@ " })" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -323,7 +328,16 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.4" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [] + } } }, "nbformat": 4, diff --git a/examples/nlp/language_modeling/bert_pretraining.py b/examples/nlp/language_modeling/bert_pretraining.py index 27bf08d1b3d1..9dd9b341eba8 100644 --- a/examples/nlp/language_modeling/bert_pretraining.py +++ b/examples/nlp/language_modeling/bert_pretraining.py @@ -86,6 +86,7 @@ from transformers import BertConfig import nemo.backends.pytorch.common as nemo_common +import nemo.backends.pytorch.common.losses import nemo.collections.nlp as nemo_nlp import nemo.core as nemo_core from nemo import logging @@ -211,14 +212,14 @@ mlm_classifier = nemo_nlp.nm.trainables.token_classification_nm.BertTokenClassifier( args.hidden_size, num_classes=args.vocab_size, activation=args.hidden_act, log_softmax=True ) -mlm_loss_fn = nemo_nlp.nm.losses.MaskedLanguageModelingLossNM() +mlm_loss_fn = nemo_nlp.nm.losses.SmoothedCrossEntropyLoss() if not args.only_mlm_loss: nsp_classifier = nemo_nlp.nm.trainables.sequence_classification_nm.SequenceClassifier( args.hidden_size, num_classes=2, num_layers=2, activation='tanh', log_softmax=False ) - nsp_loss_fn = nemo_common.CrossEntropyLoss() + nsp_loss_fn = nemo_common.CrossEntropyLossNM() - bert_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2) + bert_loss = nemo.backends.pytorch.common.losses.LossAggregatorNM(num_inputs=2) # tie weights of MLM softmax layer and embedding layer of the encoder if mlm_classifier.mlp.last_linear_layer.weight.shape != bert_model.bert.embeddings.word_embeddings.weight.shape: @@ -256,7 +257,7 @@ def create_pipeline(data_file, batch_size, preprocessed_data=False, batches_per_ input_ids=input_data.input_ids, token_type_ids=input_data.input_type_ids, attention_mask=input_data.input_mask ) mlm_logits = mlm_classifier(hidden_states=hidden_states) - mlm_loss = mlm_loss_fn(logits=mlm_logits, output_ids=input_data.output_ids, output_mask=input_data.output_mask) + mlm_loss = mlm_loss_fn(logits=mlm_logits, labels=input_data.output_ids, output_mask=input_data.output_mask) if not args.only_mlm_loss: nsp_logits = nsp_classifier(hidden_states=hidden_states) nsp_loss = nsp_loss_fn(logits=nsp_logits, labels=input_data.labels) diff --git a/examples/nlp/language_modeling/language_modeling_transformer.py b/examples/nlp/language_modeling/language_modeling_transformer.py index 2572b90af785..86299277c30e 100644 --- a/examples/nlp/language_modeling/language_modeling_transformer.py +++ b/examples/nlp/language_modeling/language_modeling_transformer.py @@ -109,9 +109,7 @@ args.d_model, 
num_classes=vocab_size, num_layers=1, log_softmax=True ) -loss = nemo_nlp.nm.losses.PaddedSmoothedCrossEntropyLossNM( - pad_id=tokenizer.pad_id, label_smoothing=args.label_smoothing -) +loss = nemo_nlp.nm.losses.SmoothedCrossEntropyLoss(pad_id=tokenizer.pad_id, label_smoothing=args.label_smoothing) # tie weight of embedding and log_softmax layers # log_softmax.mlp.last_linear_layer.weight = encoder.embedding_layer.token_embedding.weight @@ -133,7 +131,7 @@ def create_pipeline( src, src_mask, labels = data_layer() src_hiddens = encoder(input_ids=src, input_mask_src=src_mask) logits = log_softmax(hidden_states=src_hiddens) - return loss(logits=logits, target_ids=labels) + return loss(logits=logits, labels=labels) train_loss = create_pipeline( diff --git a/examples/nlp/neural_machine_translation/machine_translation_tutorial.py b/examples/nlp/neural_machine_translation/machine_translation_tutorial.py index ae05afa88e32..c2ecd19df986 100644 --- a/examples/nlp/neural_machine_translation/machine_translation_tutorial.py +++ b/examples/nlp/neural_machine_translation/machine_translation_tutorial.py @@ -161,7 +161,7 @@ eos_token=tgt_tokenizer.eos_id, ) -loss_fn = nemo_nlp.nm.losses.PaddedSmoothedCrossEntropyLossNM( +loss_fn = nemo_nlp.nm.losses.SmoothedCrossEntropyLoss( pad_id=tgt_tokenizer.pad_id, label_smoothing=args.label_smoothing ) @@ -202,7 +202,7 @@ def create_pipeline(dataset_src, dataset_tgt, tokens_in_batch, clean=False, trai input_ids_tgt=tgt, hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask ) logits = log_softmax(hidden_states=tgt_hiddens) - loss = loss_fn(logits=logits, target_ids=labels) + loss = loss_fn(logits=logits, labels=labels) beam_results = None if not training: beam_results = beam_search(hidden_states_src=src_hiddens, input_mask_src=src_mask) diff --git a/examples/nlp/question_answering/question_answering_squad.py b/examples/nlp/question_answering/question_answering_squad.py index 5997614888e7..1bd718883807 100755 --- a/examples/nlp/question_answering/question_answering_squad.py +++ b/examples/nlp/question_answering/question_answering_squad.py @@ -368,7 +368,7 @@ def create_pipeline( qa_head = nemo_nlp.nm.trainables.TokenClassifier( hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False ) - squad_loss = nemo_nlp.nm.losses.QuestionAnsweringLoss() + squad_loss = nemo_nlp.nm.losses.SpanningLoss() if args.bert_checkpoint is not None: model.restore_from(args.bert_checkpoint) diff --git a/examples/nlp/scripts/get_wkt2.sh b/examples/nlp/scripts/get_wkt2.sh index 206160bf8cd8..75efd08722e4 100755 --- a/examples/nlp/scripts/get_wkt2.sh +++ b/examples/nlp/scripts/get_wkt2.sh @@ -4,12 +4,12 @@ This file is adapted from https://github.com/salesforce/awd-lstm-lm/blob/master/getdata.sh Copyright by the AWD LSTM authors. """ - +DATA_DIR=$1 echo "- Downloading WikiText-2" -wget --continue -P data/lm/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip -unzip -q data/lm/wikitext-2-v1.zip -d data/lm -cd data/lm/wikitext-2 +wget --continue -P $DATA_DIR https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip +unzip -q $DATA_DIR/wikitext-2-v1.zip -d $DATA_DIR +cd $DATA_DIR/wikitext-2 mv wiki.train.tokens train.txt sed -i -e "s//[UNK]/g" train.txt mv wiki.valid.tokens valid.txt @@ -18,3 +18,5 @@ mv wiki.test.tokens test.txt sed -i -e "s//[UNK]/g" test.txt cd .. 
rm wikitext-2-v1.zip + +echo "- WikiText-2 saved at $DATA_DIR/wikitext-2" diff --git a/examples/nlp/text_classification/text_classification_with_bert.py b/examples/nlp/text_classification/text_classification_with_bert.py index 62048e5b4945..202e8c9f3039 100644 --- a/examples/nlp/text_classification/text_classification_with_bert.py +++ b/examples/nlp/text_classification/text_classification_with_bert.py @@ -24,7 +24,7 @@ import nemo.collections.nlp.nm.trainables.common.sequence_classification_nm from nemo import logging from nemo.collections.nlp.callbacks.text_classification_callback import eval_epochs_done_callback, eval_iter_callback -from nemo.collections.nlp.data.datasets.text_classification_dataset import SentenceClassificationDataDesc +from nemo.collections.nlp.data.datasets.text_classification_dataset import TextClassificationDataDesc from nemo.utils.lr_policies import get_lr_policy # Parsing arguments @@ -93,7 +93,7 @@ hidden_size = pretrained_bert_model.hidden_size tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model) -data_desc = SentenceClassificationDataDesc(args.dataset_name, args.data_dir, args.do_lower_case) +data_desc = TextClassificationDataDesc(args.dataset_name, args.data_dir, args.do_lower_case) # Create sentence classification loss on top classifier = nemo.collections.nlp.nm.trainables.common.sequence_classification_nm.SequenceClassifier( @@ -102,9 +102,9 @@ if args.class_balancing == 'weighted_loss': # You may need to increase the number of epochs for convergence. - loss_fn = nemo.backends.pytorch.common.CrossEntropyLoss(weight=data_desc.class_weights) + loss_fn = nemo.backends.pytorch.common.CrossEntropyLossNM(weight=data_desc.class_weights) else: - loss_fn = nemo.backends.pytorch.common.CrossEntropyLoss() + loss_fn = nemo.backends.pytorch.common.CrossEntropyLossNM() def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mode='train'): diff --git a/examples/nlp/token_classification/NERWithBERT.ipynb b/examples/nlp/token_classification/NERWithBERT.ipynb index c3a38da0e49a..85389348cb2e 100644 --- a/examples/nlp/token_classification/NERWithBERT.ipynb +++ b/examples/nlp/token_classification/NERWithBERT.ipynb @@ -16,7 +16,7 @@ "from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer\n", "from nemo.collections.nlp.callbacks.token_classification_callback import \\\n", " eval_iter_callback, eval_epochs_done_callback\n", - "from nemo.collections.nlp.nm.losses import TokenClassificationLoss\n", + "from nemo.backends.pytorch.common.losses import CrossEntropyLoss\n", "from nemo.collections.nlp.nm.trainables import TokenClassifier" ] }, @@ -106,7 +106,7 @@ " num_classes=num_classes,\n", " dropout=CLASSIFICATION_DROPOUT)\n", "\n", - "ner_loss = TokenClassificationLoss(num_classes=len(label_ids))\n", + "ner_loss = CrossEntropyLossNM()\n", "\n", "input_ids, input_type_ids, input_mask, loss_mask, _, labels = train_data_layer()\n", "\n", @@ -219,8 +219,17 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } } }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/examples/nlp/token_classification/PunctuationWithBERT.ipynb b/examples/nlp/token_classification/PunctuationWithBERT.ipynb index e4905b1d6277..2ca6b93a0b74 100644 --- a/examples/nlp/token_classification/PunctuationWithBERT.ipynb +++ 
b/examples/nlp/token_classification/PunctuationWithBERT.ipynb @@ -17,9 +17,10 @@ "import nemo.collections.nlp as nemo_nlp\n", "from nemo.collections.nlp.data import NemoBertTokenizer\n", "from nemo.collections.nlp.nm.trainables import TokenClassifier\n", - "from nemo.collections.nlp.nm.losses import TokenClassificationLoss, LossAggregatorNM\n", + "from nemo.collections.nlp.nm.losses import LossAggregatorNM\n", + "from nemo.backends.pytorch.common.losses import CrossEntropyLoss\n", "from nemo.collections.nlp.callbacks.punctuation_capitalization_callback import eval_iter_callback, eval_epochs_done_callback\n", - "from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights\n", + "from nemo.collections.nlp.data import calc_class_weights\n", "\n", "DATA_DIR = \"PATH_TO_WHERE_THE_DATA_IS\"\n", "WORK_DIR = \"PATH_TO_WHERE_TO_STORE_CHECKPOINTS_AND_LOGS\"\n", @@ -167,10 +168,8 @@ "class_weights = calc_class_weights(punct_label_freqs)\n", "\n", "# define loss\n", - "punct_loss = TokenClassificationLoss(\n", - " num_classes=len(punct_label_ids),\n", - " class_weights=class_weights)\n", - "capit_loss = TokenClassificationLoss(num_classes=len(capit_label_ids))\n", + "punct_loss = CrossEntropyLossNM(logits_dim=3, weight=class_weights)\n", + "capit_loss = CrossEntropyLossNM(logits_dim=3)\n", "task_loss = LossAggregatorNM(num_inputs=2)" ] }, @@ -480,8 +479,17 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } } }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/examples/nlp/token_classification/punctuation_capitalization.py b/examples/nlp/token_classification/punctuation_capitalization.py index b74eeff89663..d3295bdeb4e8 100644 --- a/examples/nlp/token_classification/punctuation_capitalization.py +++ b/examples/nlp/token_classification/punctuation_capitalization.py @@ -19,15 +19,16 @@ import os import nemo.collections.nlp as nemo_nlp -import nemo.collections.nlp.utils.common_nlp_utils +import nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing +import nemo.collections.nlp.utils.data_utils from nemo import logging +from nemo.backends.pytorch.common.losses import CrossEntropyLossNM from nemo.collections.nlp.callbacks.punctuation_capitalization_callback import ( eval_epochs_done_callback, eval_iter_callback, ) from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer from nemo.collections.nlp.nm.data_layers import PunctuationCapitalizationDataLayer -from nemo.collections.nlp.nm.losses.token_classification_loss import TokenClassificationLoss from nemo.collections.nlp.nm.trainables import TokenClassifier from nemo.utils.lr_policies import get_lr_policy @@ -214,7 +215,9 @@ def create_pipeline( if args.use_weighted_loss_punct: logging.info(f"Using weighted loss for punctuation task") punct_label_freqs = data_layer.dataset.punct_label_frequencies - class_weights = nemo.collections.nlp.utils.common_nlp_utils.calc_class_weights(punct_label_freqs) + class_weights = nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing.calc_class_weights( + punct_label_freqs + ) # Initialize punctuation loss punct_classifier = punct_classifier( @@ -225,15 +228,15 @@ def create_pipeline( name='Punctuation', ) - punct_loss = TokenClassificationLoss(num_classes=len(punct_label_ids), class_weights=class_weights) + punct_loss = CrossEntropyLossNM(logits_dim=3, weight=class_weights) # 
Initialize capitalization loss capit_classifier = capit_classifier( hidden_size=hidden_size, num_classes=len(capit_label_ids), dropout=dropout, name='Capitalization' ) - capit_loss = TokenClassificationLoss(num_classes=len(capit_label_ids)) + capit_loss = CrossEntropyLossNM(logits_dim=3) - task_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2) + task_loss = nemo.backends.pytorch.common.losses.LossAggregatorNM(num_inputs=2) hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) diff --git a/examples/nlp/token_classification/punctuation_capitalization_infer.py b/examples/nlp/token_classification/punctuation_capitalization_infer.py index 9c2f8bede33c..b4b7cc2e1aab 100644 --- a/examples/nlp/token_classification/punctuation_capitalization_infer.py +++ b/examples/nlp/token_classification/punctuation_capitalization_infer.py @@ -24,7 +24,7 @@ from nemo import logging from nemo.collections.nlp.data import NemoBertTokenizer from nemo.collections.nlp.nm.data_layers import BertTokenClassificationInferDataLayer -from nemo.collections.nlp.utils.common_nlp_utils import get_vocab +from nemo.collections.nlp.utils.data_utils import get_vocab # Parsing arguments parser = argparse.ArgumentParser(description='Punctuation and capitalization detection inference') diff --git a/examples/nlp/token_classification/token_classification.py b/examples/nlp/token_classification/token_classification.py index 7254929863f1..b8d16bd87d59 100644 --- a/examples/nlp/token_classification/token_classification.py +++ b/examples/nlp/token_classification/token_classification.py @@ -19,12 +19,13 @@ import os import nemo.collections.nlp as nemo_nlp -import nemo.collections.nlp.utils.common_nlp_utils +import nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing +import nemo.collections.nlp.utils.data_utils from nemo import logging +from nemo.backends.pytorch.common.losses import CrossEntropyLossNM from nemo.collections.nlp.callbacks.token_classification_callback import eval_epochs_done_callback, eval_iter_callback from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer from nemo.collections.nlp.nm.data_layers import BertTokenClassificationDataLayer -from nemo.collections.nlp.nm.losses import TokenClassificationLoss from nemo.collections.nlp.nm.trainables import TokenClassifier from nemo.utils.lr_policies import get_lr_policy @@ -197,7 +198,9 @@ def create_pipeline( if args.use_weighted_loss: logging.info(f"Using weighted loss") label_freqs = data_layer.dataset.label_frequencies - class_weights = nemo.collections.nlp.utils.common_nlp_utils.calc_class_weights(label_freqs) + class_weights = nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing.calc_class_weights( + label_freqs + ) logging.info(f"class_weights: {class_weights}") @@ -205,7 +208,7 @@ def create_pipeline( hidden_size=hidden_size, num_classes=len(label_ids), dropout=dropout, num_layers=num_layers ) - task_loss = TokenClassificationLoss(num_classes=len(label_ids), class_weights=class_weights) + task_loss = CrossEntropyLossNM(logits_dim=3, weight=class_weights) hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) logits = classifier(hidden_states=hidden_states) diff --git a/examples/nlp/token_classification/token_classification_infer.py b/examples/nlp/token_classification/token_classification_infer.py index f1d7d1bf5cdf..cc63fcb7f3c6 100644 --- a/examples/nlp/token_classification/token_classification_infer.py +++ 
b/examples/nlp/token_classification/token_classification_infer.py @@ -24,7 +24,7 @@ from nemo import logging from nemo.collections.nlp.data import NemoBertTokenizer from nemo.collections.nlp.nm.trainables import TokenClassifier -from nemo.collections.nlp.utils.common_nlp_utils import get_vocab +from nemo.collections.nlp.utils.data_utils import get_vocab # Parsing arguments parser = argparse.ArgumentParser(description='NER with pretrained BERT') diff --git a/nemo/backends/pytorch/common/losses.py b/nemo/backends/pytorch/common/losses.py index dd0d70082f9f..02b0cd78cf67 100644 --- a/nemo/backends/pytorch/common/losses.py +++ b/nemo/backends/pytorch/common/losses.py @@ -2,12 +2,10 @@ from torch import nn from nemo.backends.pytorch.nm import LossNM -from nemo.core.neural_types import LabelsType, LogitsType, LossType, NeuralType, RegressionValuesType +from nemo.core.neural_types import LabelsType, LogitsType, LossType, MaskType, NeuralType, RegressionValuesType from nemo.utils.decorators import add_port_docs -__all__ = ['SequenceLoss', 'CrossEntropyLoss', 'MSELoss'] - -EPS = 1e-5 +__all__ = ['SequenceLoss', 'CrossEntropyLossNM', 'MSELoss', 'LossAggregatorNM'] class SequenceLoss(LossNM): @@ -29,6 +27,8 @@ class SequenceLoss(LossNM): ctc_blank_id (int): ID of blank symbols to pass to mask when calculating ctc loss. Defaults to None. + eps (float): small number to prevent division by zero in loss calculation + Defaults to 1e-5. """ @@ -47,7 +47,14 @@ def output_ports(self): return {"loss": NeuralType(elements_type=LossType())} def __init__( - self, pad_id=0, smoothing_coef=0.0, sample_wise=False, aux_ctc=False, ctc_initial_coef=0.1, ctc_blank_id=None + self, + pad_id=0, + smoothing_coef=0.0, + sample_wise=False, + aux_ctc=False, + ctc_initial_coef=0.1, + ctc_blank_id=None, + eps=1e-5, ): assert (not aux_ctc) or (ctc_blank_id is not None), "Should be a blank id if using CTC loss" @@ -58,6 +65,7 @@ def __init__( self.sample_wise = sample_wise self.aux_ctc = aux_ctc self.ctc_coef = ctc_initial_coef + self.eps = eps if aux_ctc: self.ctc = nn.CTCLoss(blank=ctc_blank_id, reduction='none', zero_infinity=True) @@ -85,7 +93,7 @@ def _ce_loss(self, log_probs, targets, pad_mask): if self.sample_wise: loss /= target_log_probs.size(0) else: - loss /= pad_mask.sum() + EPS + loss /= pad_mask.sum() + self.eps return loss def _ctc_loss(self, log_probs, targets, pad_mask): @@ -95,10 +103,14 @@ def _ctc_loss(self, log_probs, targets, pad_mask): return loss -class CrossEntropyLoss(LossNM): +class CrossEntropyLossNM(LossNM): """ CrossEntropyLoss - + Args: + logits_dim (int): dimension size of the logits tensor + weight (list): list of rescaling weight given to each class + reduce (bool): controls if reduction would be done over the batch + reduction (str): type of the reduction over the batch """ @property @@ -107,8 +119,9 @@ def input_ports(self): """Returns definitions of module input ports. 
""" return { - "logits": NeuralType(axes=('B', 'D'), elements_type=LogitsType()), - "labels": NeuralType(axes=tuple('B'), elements_type=LabelsType()), + "logits": NeuralType(['B'] + ['ANY'] * (self._logits_dim - 1), LogitsType()), + "labels": NeuralType(['B'] + ['ANY'] * (self._logits_dim - 2), LabelsType()), + "loss_mask": NeuralType(['B'] + ['ANY'] * (self._logits_dim - 2), MaskType(), optional=True), } @property @@ -121,14 +134,30 @@ def output_ports(self): """ return {"loss": NeuralType(elements_type=LossType())} - def __init__(self, reduction='mean', weight=None): + def __init__(self, logits_dim=2, weight=None, reduce=True, reduction='mean'): super().__init__() + if weight: weight = torch.FloatTensor(weight).to(self._device) - self._criterion = nn.CrossEntropyLoss(weight=weight, reduction=reduction) + self._criterion = nn.CrossEntropyLoss(weight=weight, reduce=reduce, reduction=reduction) + self._logits_dim = logits_dim + + def _loss_function(self, logits, labels, loss_mask=None): + """ + Args: + logits (float): output of the classifier + labels (long): ground truth labels + loss_mask (bool/float/int): tensor to specify the masking + """ + logits_flatten = torch.flatten(logits, start_dim=0, end_dim=-2) + labels_flatten = torch.flatten(labels, start_dim=0, end_dim=-1) + + if loss_mask is not None: + loss_mask_flatten = torch.flatten(loss_mask, start_dim=0, end_dim=-1) + logits_flatten = logits_flatten[loss_mask_flatten] + labels_flatten = labels_flatten[loss_mask_flatten] - def _loss_function(self, logits, labels): - loss = self._criterion(logits, labels) + loss = self._criterion(logits_flatten, labels_flatten) return loss @@ -166,3 +195,52 @@ def __init__(self, reduction='mean'): def _loss_function(self, preds, labels): loss = self._criterion(preds, labels) return loss + + +class LossAggregatorNM(LossNM): + """ + Neural module which combines sums several losses into one. + + Args: + num_inputs (int): number of input losses + weights (list of floats): a list of coefficient for merging losses + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + """ + input_ports = {} + for i in range(self._num_losses): + input_ports["loss_" + str(i + 1)] = NeuralType(elements_type=LossType()) + + return input_ports + + @property + def output_ports(self): + """Returns definitions of module output ports. + + loss: + NeuralType(None) + """ + return {"loss": NeuralType(elements_type=LossType())} + + def __init__(self, num_inputs=2, weights=None): + # Store number of inputs/losses. + self._num_losses = num_inputs + if weights is not None and len(weights) != num_inputs: + raise ValueError("Length of weights should be equal to the number of inputs (num_inputs)") + + self._weights = weights + LossNM.__init__(self) + + def _loss_function(self, **kwargs): + values = [kwargs[x] for x in sorted(kwargs.keys())] + loss = torch.zeros_like(values[0]) + for loss_idx, loss_value in enumerate(values): + if self._weights is not None: + loss = loss.add(loss_value, alpha=self._weights[loss_idx]) + else: + loss = loss.add(loss_value) + return loss diff --git a/nemo/backends/pytorch/common/rnn.py b/nemo/backends/pytorch/common/rnn.py index eb9eb7e1e246..c1c62ac08c35 100644 --- a/nemo/backends/pytorch/common/rnn.py +++ b/nemo/backends/pytorch/common/rnn.py @@ -71,11 +71,7 @@ def input_ports(self): """Returns definitions of module input ports. 
""" return { - # 'targets': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), 'targets': NeuralType(('B', 'T'), LabelsType()), - # 'encoder_outputs': NeuralType( - # {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}, optional=True, - # ), 'encoder_outputs': NeuralType(('B', 'T', 'D'), ChannelType(), True), } @@ -85,11 +81,7 @@ def output_ports(self): """Returns definitions of module output ports. """ return { - # 'log_probs': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), 'log_probs': NeuralType(('B', 'T', 'D'), LogprobsType()), - # 'attention_weights': NeuralType( - # {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(TimeTag),}, optional=True, - # ), 'attention_weights': NeuralType(('B', 'T', 'T'), ChannelType(), True), } @@ -211,8 +203,6 @@ def input_ports(self): """Returns definitions of module input ports. """ return { - # 'inputs': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # 'input_lens': NeuralType({0: AxisType(BatchTag),}, optional=True), 'inputs': NeuralType(('B', 'T'), ChannelType()), 'input_lens': NeuralType(tuple('B'), LengthsType()), } @@ -223,8 +213,6 @@ def output_ports(self): """Returns definitions of module output ports. """ return { - # 'outputs': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # 'hidden': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), 'outputs': NeuralType(('B', 'T', 'D'), ChannelType()), 'hidden': NeuralType(('B', 'T', 'D'), ChannelType()), } diff --git a/nemo/collections/asr/__init__.py b/nemo/collections/asr/__init__.py index c8f86eb6acc1..29e1df9df347 100644 --- a/nemo/collections/asr/__init__.py +++ b/nemo/collections/asr/__init__.py @@ -19,7 +19,7 @@ from .jasper import JasperDecoderForClassification, JasperDecoderForCTC, JasperEncoder from .las.misc import JasperRNNConnector from .losses import CTCLossNM -from nemo.backends.pytorch.common.losses import CrossEntropyLoss as CrossEntropyLossNM +from nemo.backends.pytorch.common.losses import CrossEntropyLossNM from nemo.core import Backend __all__ = [ diff --git a/nemo/collections/nlp/data/datasets/__init__.py b/nemo/collections/nlp/data/datasets/__init__.py index 8e598e5655d3..4fc8431f5e33 100644 --- a/nemo/collections/nlp/data/datasets/__init__.py +++ b/nemo/collections/nlp/data/datasets/__init__.py @@ -14,9 +14,8 @@ # limitations under the License. 
# ============================================================================= -from nemo.collections.nlp.data.datasets import datasets_utils -from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import GLUEDataset -from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import ( +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset import GLUEDataset +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.joint_intent_slot_dataset import ( BertJointIntentSlotDataset, BertJointIntentSlotInferDataset, ) @@ -26,12 +25,12 @@ ) from nemo.collections.nlp.data.datasets.lm_transformer_dataset import LanguageModelingDataset from nemo.collections.nlp.data.datasets.machine_translation_dataset import TranslationDataset +from nemo.collections.nlp.data.datasets.multiwoz_dataset import * from nemo.collections.nlp.data.datasets.punctuation_capitalization_dataset import ( BertPunctuationCapitalizationDataset, BertPunctuationCapitalizationInferDataset, ) -from nemo.collections.nlp.data.datasets.qa_squad_dataset import SquadDataset -from nemo.collections.nlp.data.datasets.state_tracking_trade_dataset import * +from nemo.collections.nlp.data.datasets.qa_squad_dataset.qa_squad_dataset import SquadDataset from nemo.collections.nlp.data.datasets.text_classification_dataset import BertTextClassificationDataset from nemo.collections.nlp.data.datasets.token_classification_dataset import ( BertTokenClassificationDataset, diff --git a/nemo/collections/nlp/data/datasets/datasets_utils.py b/nemo/collections/nlp/data/datasets/datasets_utils.py deleted file mode 100644 index 8f4e0640245d..000000000000 --- a/nemo/collections/nlp/data/datasets/datasets_utils.py +++ /dev/null @@ -1,990 +0,0 @@ -# ============================================================================= -# Copyright 2020 NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================= - -import csv -import glob -import json -import os -import random -import re -import shutil -import string -import subprocess -from collections import Counter - -import numpy as np -from tqdm import tqdm - -from nemo import logging -from nemo.collections.nlp.utils.callback_utils import list2str -from nemo.collections.nlp.utils.common_nlp_utils import ( - get_vocab, - ids2text, - if_exist, - write_vocab, - write_vocab_in_order, -) - -__all__ = [ - 'get_label_stats', - 'process_sst_2', - 'process_imdb', - 'process_thucnews', - 'process_nlu', - 'process_twitter_airline', - 'process_atis', - 'process_jarvis_datasets', - 'process_mturk', - 'process_intent_slot_mturk', - 'get_intents_mturk', - 'get_slot_labels', - 'merge', - 'get_intent_query_files_dialogflow', - 'get_intents_slots_dialogflow', - 'get_slots_dialogflow', - 'partition_data', - 'write_files', - 'process_dialogflow', - 'write_data', - 'create_dataset', - 'read_csv', - 'process_snips', - 'get_dataset', - 'partition', - 'map_entities', - 'get_entities', - 'get_data', - 'reverse_dict', - 'get_intent_labels', - 'download_wkt2', - 'normalize_answer', - 'get_tokens', -] - -DATABASE_EXISTS_TMP = '{} dataset has already been processed and stored at {}' -MODE_EXISTS_TMP = '{} mode of {} dataset has already been processed and stored at {}' - - -def get_label_stats(labels, outfile='stats.tsv'): - labels = Counter(labels) - total = sum(labels.values()) - out = open(outfile, 'w') - i = 0 - label_frequencies = labels.most_common() - for k, v in label_frequencies: - out.write(f'{k}\t{v / total}\n') - if i < 3: - logging.info(f'{i} item: {k}, {v} out of {total}, {v / total}.') - i += 1 - return total, label_frequencies - - -def process_sst_2(data_dir): - if not os.path.exists(data_dir): - link = 'https://gluebenchmark.com/tasks' - raise ValueError(f'Data not found at {data_dir}. ' f'Please download SST-2 from {link}.') - logging.info('Keep in mind that SST-2 is only available in lower case.') - return data_dir - - -def process_imdb(data_dir, uncased, modes=['train', 'test']): - if not os.path.exists(data_dir): - link = 'www.kaggle.com/iarunava/imdb-movie-reviews-dataset' - raise ValueError(f'Data not found at {data_dir}. ' f'Please download IMDB from {link}.') - - outfold = f'{data_dir}/nemo-processed' - - if uncased: - outfold = f'{outfold}_uncased' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('IMDB', outfold)) - return outfold - logging.info(f'Processing IMDB dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - for sent in ['neg', 'pos']: - if sent == 'neg': - label = 0 - else: - label = 1 - files = glob.glob(f'{data_dir}/{mode}/{sent}/*.txt') - for file in files: - with open(file, 'r') as f: - review = f.read().strip() - if uncased: - review = review.lower() - review = review.replace("
", "") - outfiles[mode].write(f'{review}\t{label}\n') - for mode in modes: - outfiles[mode].close() - - return outfold - - -def process_thucnews(data_dir): - modes = ['train', 'test'] - train_size = 0.8 - if not os.path.exists(data_dir): - link = 'thuctc.thunlp.org/' - raise ValueError(f'Data not found at {data_dir}. ' f'Please download THUCNews from {link}.') - - outfold = f'{data_dir}/nemo-processed-thucnews' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('THUCNews', outfold)) - return outfold - logging.info(f'Processing THUCNews dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'a+', encoding='utf-8') - outfiles[mode].write('sentence\tlabel\n') - categories = ['体育', '娱乐', '家居', '彩票', '房产', '教育', '时尚', '时政', '星座', '游戏', '社会', '科技', '股票', '财经'] - for category in categories: - label = categories.index(category) - category_files = glob.glob(f'{data_dir}/{category}/*.txt') - test_num = int(len(category_files) * (1 - train_size)) - test_files = category_files[:test_num] - train_files = category_files[test_num:] - for mode in modes: - logging.info(f'Processing {mode} data of the category {category}') - if mode == 'test': - files = test_files - else: - files = train_files - for file in tqdm(files): - with open(file, 'r', encoding='utf-8') as f: - news = f.read().strip().replace('\r', '') - news = news.replace('\n', '').replace('\t', ' ') - outfiles[mode].write(f'{news}\t{label}\n') - for mode in modes: - outfiles[mode].close() - - return outfold - - -def process_nlu(filename, uncased, modes=['train', 'test'], dataset_name='nlu-ubuntu'): - """ Dataset has to be of: - - ubuntu - - chat - - web - """ - - if not os.path.exists(filename): - link = 'https://github.com/sebischair/NLU-Evaluation-Corpora' - raise ValueError(f'Data not found at {filename}. 
' f'Please download IMDB from {link}.') - - if dataset_name == 'nlu-ubuntu': - INTENT = {'makeupdate': 1, 'setupprinter': 2, 'shutdowncomputer': 3, 'softwarerecommendation': 4, 'none': 0} - elif dataset_name == 'nlu-chat': - INTENT = {'departuretime': 0, 'findconnection': 1} - elif dataset_name == 'nlu-web': - INTENT = { - 'changepassword': 1, - 'deleteaccount': 2, - 'downloadvideo': 3, - 'exportdata': 4, - 'filterspam': 5, - 'findalternative': 6, - 'syncaccounts': 7, - 'none': 0, - } - else: - raise ValueError(f'{dataset_name}: Invalid dataset name') - - infold = filename[: filename.rfind('/')] - outfold = f'{infold}/{dataset_name}-nemo-processed' - - if uncased: - outfold = f'{outfold}_uncased' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format(dataset_name.upper(), outfold)) - return outfold - logging.info(f'Processing data and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - - with open(filename, 'r') as f: - data = json.load(f) - - for obj in data['sentences']: - sentence = obj['text'].strip() - if uncased: - sentence = sentence.lower() - intent = obj['intent'].lower().replace(' ', '') - label = INTENT[intent] - txt = f'{sentence}\t{label}\n' - if obj['training']: - outfiles['train'].write(txt) - else: - outfiles['test'].write(txt) - for mode in modes: - outfiles[mode].close() - return outfold - - -def process_twitter_airline(filename, uncased, modes=['train', 'test']): - """ Dataset from Kaggle: - https://www.kaggle.com/crowdflower/twitter-airline-sentiment - """ - pass - - -def process_atis(infold, uncased, modes=['train', 'test'], dev_split=0): - """ MSFT's dataset, processed by Kaggle - https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk - """ - outfold = f'{infold}/nemo-processed' - vocab = get_vocab(f'{infold}/atis.dict.vocab.csv') - - if uncased: - outfold = f'{outfold}-uncased' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold)) - return outfold - logging.info(f'Processing ATIS dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') - - queries = open(f'{infold}/atis.{mode}.query.csv', 'r').readlines() - intents = open(f'{infold}/atis.{mode}.intent.csv', 'r').readlines() - slots = open(f'{infold}/atis.{mode}.slots.csv', 'r').readlines() - - for i, query in enumerate(queries): - sentence = ids2text(query.strip().split()[1:-1], vocab) - outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n') - slot = ' '.join(slots[i].strip().split()[1:-1]) - outfiles[mode + '_slots'].write(slot + '\n') - - shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv') - shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv') - for mode in modes: - outfiles[mode].close() - - return outfold - - -def process_jarvis_datasets(infold, uncased, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False): - """ process and convert Jarvis datasets into NeMo's BIO format - """ - outfold = f'{infold}/{dataset_name}-nemo-processed' - infold = f'{infold}/' - - if uncased: - outfold = f'{outfold}-uncased' - - if 
if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']): - logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold)) - return outfold - - logging.info(f'Processing {dataset_name} dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - intents_list = {} - slots_list = {} - slots_list_all = {} - - outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w') - outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w') - - outfiles['dict_slots'].write('O\n') - slots_list["O"] = 0 - slots_list_all["O"] = 0 - - for mode in modes: - if if_exist(outfold, [f'{mode}.tsv']): - logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode)) - continue - - if not if_exist(infold, [f'{mode}.tsv']): - logging.info(f'{mode} mode of {dataset_name}' f' is skipped as it was not found.') - continue - - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') - - queries = open(f'{infold}/{mode}.tsv', 'r').readlines() - - for i, query in enumerate(queries): - line_splits = query.strip().split("\t") - if len(line_splits) == 3: - intent_str, slot_tags_str, sentence = line_splits - else: - intent_str, sentence = line_splits - slot_tags_str = "" - - if intent_str not in intents_list: - intents_list[intent_str] = len(intents_list) - outfiles['dict_intents'].write(f'{intent_str}\n') - - if ignore_prev_intent: - start_token = 2 - else: - start_token = 1 - sentence_cld = " ".join(sentence.strip().split()[start_token:-1]) - outfiles[mode].write(f'{sentence_cld}\t' f'{str(intents_list[intent_str])}\n') - - slot_tags_list = [] - if slot_tags_str.strip(): - slot_tags = slot_tags_str.strip().split(",") - for st in slot_tags: - if not st.strip(): - continue - [start_i, end_i, slot_name] = st.strip().split(":") - slot_tags_list.append([int(start_i), int(end_i), slot_name]) - if slot_name not in slots_list: - slots_list[slot_name] = len(slots_list) - slots_list_all[f'B-{slot_name}'] = len(slots_list_all) - slots_list_all[f'I-{slot_name}'] = len(slots_list_all) - outfiles['dict_slots'].write(f'B-{slot_name}\n') - outfiles['dict_slots'].write(f'I-{slot_name}\n') - - slot_tags_list.sort(key=lambda x: x[0]) - slots = [] - processed_index = 0 - for tag_start, tag_end, tag_str in slot_tags_list: - if tag_start > processed_index: - words_list = sentence[processed_index:tag_start].strip().split() - slots.extend([str(slots_list_all['O'])] * len(words_list)) - words_list = sentence[tag_start:tag_end].strip().split() - slots.append(str(slots_list_all[f'B-{tag_str}'])) - slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1)) - processed_index = tag_end - - if processed_index < len(sentence): - words_list = sentence[processed_index:].strip().split() - slots.extend([str(slots_list_all['O'])] * len(words_list)) - - slots = slots[1:-1] - slot = ' '.join(slots) - outfiles[mode + '_slots'].write(slot + '\n') - - outfiles[mode + '_slots'].close() - outfiles[mode].close() - - outfiles['dict_slots'].close() - outfiles['dict_intents'].close() - - return outfold - - -def process_mturk(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): - if not os.path.exists(data_dir): - link = 'www.mturk.com' - raise ValueError( - f'Data not found at {data_dir}. ' f'Export your mturk data from' f'{link} and unzip at {data_dir}.' 
- ) - - outfold = f'{data_dir}/nemo-processed' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('mturk', outfold)) - return outfold - - logging.info(f'Processing dataset from mturk and storing at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - classification_data_file = f'{data_dir}/classification.csv' - annotation_data_file = f'{data_dir}/annotation.manifest' - - if not os.path.exists(classification_data_file): - raise FileNotFoundError(f'File not found ' f'at {classification_data_file}') - - if not os.path.exists(annotation_data_file): - raise FileNotFoundError(f'File not found at {annotation_data_file}') - - utterances = [] - utterances = read_csv(classification_data_file) - - # This function assumes that the intent classification data has been - # reviewed and cleaned and only one label per utterance is present. - agreed_all, intent_names = get_intents_mturk(utterances, outfold) - - with open(annotation_data_file, 'r') as f: - slot_annotations = f.readlines() - - # This function assumes that the preprocess step would have made - # the task_name of all the annotations generic - task_name = 'retail-combined' - - # It is assumed that every utterances will have corresponding - # slot annotation information - if len(slot_annotations) < len(agreed_all): - raise ValueError(f'Every utterance must have corresponding' f'slot annotation information') - - slot_labels, intent_queries, slot_tags = process_intent_slot_mturk( - slot_annotations, agreed_all, intent_names, task_name - ) - - assert len(slot_tags) == len(intent_queries) - - dev_split = 0.1 - - train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) - - write_files(train_queries, f'{outfold}/train.tsv') - write_files(train_slots, f'{outfold}/train_slots.tsv') - - write_files(test_queries, f'{outfold}/test.tsv') - write_files(test_slots, f'{outfold}/test_slots.tsv') - - write_files(slot_labels, f'{outfold}/dict.slots.csv') - write_files(intent_names, f'{outfold}/dict.intents.csv') - - return outfold - - -def process_intent_slot_mturk(slot_annotations, agreed_all, intent_names, task_name): - slot_tags = [] - inorder_utterances = [] - all_labels = get_slot_labels(slot_annotations, task_name) - logging.info(f'agreed_all - {len(agreed_all)}') - logging.info(f'Slot annotations - {len(slot_annotations)}') - - for annotation in slot_annotations[0:]: - an = json.loads(annotation) - utterance = an['source'] - if len(utterance) > 2 and utterance.startswith('"') and utterance.endswith('"'): - utterance = utterance[1:-1] - - if utterance in agreed_all: - entities = {} - annotated_entities = an[task_name]['annotations']['entities'] - for i, each_anno in enumerate(annotated_entities): - entities[int(each_anno['startOffset'])] = i - - lastptr = 0 - slotlist = [] - # sorting annotations by the start offset - for i in sorted(entities.keys()): - annotated_entities = an[task_name]['annotations']['entities'] - tags = annotated_entities[entities.get(i)] - untagged_words = utterance[lastptr : tags['startOffset']] - for _ in untagged_words.split(): - slotlist.append(all_labels.get('O')) - anno_words = utterance[tags['startOffset'] : tags['endOffset']] - # tagging with the IOB format. 
- for j, _ in enumerate(anno_words.split()): - if j == 0: - b_slot = 'B-' + tags['label'] - slotlist.append(all_labels.get(b_slot)) - else: - i_slot = 'I-' + tags['label'] - slotlist.append(all_labels.get(i_slot)) - lastptr = tags['endOffset'] - - untagged_words = utterance[lastptr : len(utterance)] - for _ in untagged_words.split(): - slotlist.append(all_labels.get('O')) - - slotstr = ' '.join(slotlist) - slotstr = f'{slotstr.strip()}\n' - - slot_tags.append(slotstr) - intent_num = intent_names.get(agreed_all.get(utterance)) - query_text = f'{utterance.strip()}\t{intent_num}\n' - inorder_utterances.append(query_text) - # else: - # logging.warning(utterance) - - logging.info(f'inorder utterances - {len(inorder_utterances)}') - - return all_labels, inorder_utterances, slot_tags - - -def get_intents_mturk(utterances, outfold): - intent_names = {} - intent_count = 0 - - agreed_all = {} - - logging.info('Printing all intent_labels') - intent_dict = f'{outfold}/dict.intents.csv' - if os.path.exists(intent_dict): - with open(intent_dict, 'r') as f: - for intent_name in f.readlines(): - intent_names[intent_name.strip()] = intent_count - intent_count += 1 - logging.info(intent_names) - - for i, utterance in enumerate(utterances[1:]): - - if utterance[1] not in agreed_all: - agreed_all[utterance[0]] = utterance[1] - - if utterance[1] not in intent_names: - intent_names[utterance[1]] = intent_count - intent_count += 1 - - logging.info(f'Total number of utterance samples: {len(agreed_all)}') - - return agreed_all, intent_names - - -def get_slot_labels(slot_annotations, task_name): - slot_labels = json.loads(slot_annotations[0]) - - all_labels = {} - count = 0 - # Generating labels with the IOB format. - for label in slot_labels[task_name]['annotations']['labels']: - b_slot = 'B-' + label['label'] - i_slot = 'I-' + label['label'] - all_labels[b_slot] = str(count) - count += 1 - all_labels[i_slot] = str(count) - count += 1 - all_labels['O'] = str(count) - - return all_labels - - -def merge(data_dir, subdirs, dataset_name, modes=['train', 'test']): - outfold = f'{data_dir}/{dataset_name}' - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold)) - slots = get_vocab(f'{outfold}/dict.slots.csv') - none_slot = 0 - for key in slots: - if slots[key] == 'O': - none_slot = key - break - return outfold, int(none_slot) - - os.makedirs(outfold, exist_ok=True) - - data_files, slot_files = {}, {} - for mode in modes: - data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w') - data_files[mode].write('sentence\tlabel\n') - slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w') - - intents, slots = {}, {} - intent_shift, slot_shift = 0, 0 - none_intent, none_slot = -1, -1 - - for subdir in subdirs: - curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv') - curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv') - - for key in curr_intents: - if intent_shift > 0 and curr_intents[key] == 'O': - continue - if curr_intents[key] == 'O' and intent_shift == 0: - none_intent = int(key) - intents[int(key) + intent_shift] = curr_intents[key] - - for key in curr_slots: - if slot_shift > 0 and curr_slots[key] == 'O': - continue - if slot_shift == 0 and curr_slots[key] == 'O': - none_slot = int(key) - slots[int(key) + slot_shift] = curr_slots[key] - - for mode in modes: - with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f: - for line in f.readlines()[1:]: - text, label = line.strip().split('\t') - label = int(label) - if curr_intents[label] 
== 'O': - label = none_intent - else: - label = label + intent_shift - data_files[mode].write(f'{text}\t{label}\n') - - with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f: - for line in f.readlines(): - labels = [int(label) for label in line.strip().split()] - shifted_labels = [] - for label in labels: - if curr_slots[label] == 'O': - shifted_labels.append(none_slot) - else: - shifted_labels.append(label + slot_shift) - slot_files[mode].write(list2str(shifted_labels) + '\n') - - intent_shift += len(curr_intents) - slot_shift += len(curr_slots) - - write_vocab_in_order(intents, f'{outfold}/dict.intents.csv') - write_vocab_in_order(slots, f'{outfold}/dict.slots.csv') - return outfold, none_slot - - -def get_intent_query_files_dialogflow(path): - fileslist = [] - for root, _, files in os.walk(path): - for file in files: - if '_usersays_en.json' in file: - fileslist.append(os.path.join(root, file)) - return fileslist - - -def get_intents_slots_dialogflow(files, slot_labels): - intent_names = [] - intent_queries = [] - slot_tags = [] - - for index, file in enumerate(files): - intent_names.append(os.path.basename(file).split('_usersays')[0]) - - with open(file) as json_file: - intent_data = json.load(json_file) - for query in intent_data: - query_text = "" - slots = "" - for segment in query['data']: - query_text = ''.join([query_text, segment['text']]) - if 'alias' in segment: - for _ in segment['text'].split(): - slots = ' '.join([slots, slot_labels.get(segment['alias'])]) - else: - for _ in segment['text'].split(): - slots = ' '.join([slots, slot_labels.get('O')]) - query_text = f'{query_text.strip()}\t{index}\n' - intent_queries.append(query_text) - slots = f'{slots.strip()}\n' - slot_tags.append(slots) - return intent_queries, intent_names, slot_tags - - -def get_slots_dialogflow(files): - slot_labels = {} - count = 0 - for file in files: - intent_head_file = ''.join([file.split('_usersays')[0], '.json']) - with open(intent_head_file) as json_file: - intent_meta_data = json.load(json_file) - for params in intent_meta_data['responses'][0]['parameters']: - if params['name'] not in slot_labels: - slot_labels[params['name']] = str(count) - count += 1 - slot_labels['O'] = str(count) - return slot_labels - - -def partition_data(intent_queries, slot_tags, split=0.1): - n = len(intent_queries) - n_dev = int(n * split) - dev_idx = set(random.sample(range(n), n_dev)) - dev_intents, dev_slots, train_intents, train_slots = [], [], [], [] - - dev_intents.append('sentence\tlabel\n') - train_intents.append('sentence\tlabel\n') - - for i, item in enumerate(intent_queries): - if i in dev_idx: - dev_intents.append(item) - dev_slots.append(slot_tags[i]) - else: - train_intents.append(item) - train_slots.append(slot_tags[i]) - return train_intents, train_slots, dev_intents, dev_slots - - -def write_files(data, outfile): - with open(outfile, 'w') as f: - for item in data: - item = f'{item.strip()}\n' - f.write(item) - - -def process_dialogflow(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): - if not os.path.exists(data_dir): - link = 'www.dialogflow.com' - raise ValueError( - f'Data not found at {data_dir}. ' f'Export your dialogflow data from' f'{link} and unzip at {data_dir}.' - ) - - outfold = f'{data_dir}/dialogflow/nemo-processed' - - '''TO DO - check for nemo-processed directory - already exists. If exists, skip the entire creation steps below. 
''' - - os.makedirs(outfold, exist_ok=True) - - files = get_intent_query_files_dialogflow(data_dir) - - slot_labels = get_slots_dialogflow(files) - - intent_queries, intent_names, slot_tags = get_intents_slots_dialogflow(files, slot_labels) - - train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) - - write_files(train_queries, f'{outfold}/train.tsv') - write_files(train_slots, f'{outfold}/train_slots.tsv') - - write_files(test_queries, f'{outfold}/test.tsv') - write_files(test_slots, f'{outfold}/test_slots.tsv') - - write_files(slot_labels, f'{outfold}/dict.slots.csv') - write_files(intent_names, f'{outfold}/dict.intents.csv') - - return outfold - - -def write_data(data, slot_dict, intent_dict, outfold, mode, uncased): - intent_file = open(f'{outfold}/{mode}.tsv', 'w') - intent_file.write('sentence\tlabel\n') - slot_file = open(f'{outfold}/{mode}_slots.tsv', 'w') - for tokens, slots, intent in data: - text = ' '.join(tokens) - if uncased: - text = text.lower() - intent_file.write(f'{text}\t{intent_dict[intent]}\n') - slots = [str(slot_dict[slot]) for slot in slots] - slot_file.write(' '.join(slots) + '\n') - intent_file.close() - slot_file.close() - - -def create_dataset(train, dev, slots, intents, uncased, outfold): - os.makedirs(outfold, exist_ok=True) - if 'O' in slots: - slots.remove('O') - slots = sorted(list(slots)) + ['O'] - intents = sorted(list(intents)) - slots = write_vocab(slots, f'{outfold}/dict.slots.csv') - intents = write_vocab(intents, f'{outfold}/dict.intents.csv') - write_data(train, slots, intents, outfold, 'train', uncased) - write_data(dev, slots, intents, outfold, 'test', uncased) - - -def read_csv(file_path): - rows = [] - with open(file_path, 'r') as csvfile: - read_csv = csv.reader(csvfile, delimiter=',') - for row in read_csv: - rows.append(row) - return rows - - -def process_snips(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): - if not os.path.exists(data_dir): - link = 'www.github.com/snipsco/spoken-language' - '-understanding-research-datasets' - raise ValueError(f'Data not found at {data_dir}. 
' f'Resquest to download the SNIPS dataset from {link}.') - - outfold = f'{data_dir}/nemo-processed' - - if uncased: - outfold = f'{outfold}-uncased' - - exist = True - for dataset in ['light', 'speak', 'all']: - if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('SNIPS-' + dataset.upper(), outfold)) - else: - exist = False - if exist: - return outfold - - logging.info(f'Processing SNIPS dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - speak_dir = 'smart-speaker-en-close-field' - light_dir = 'smart-lights-en-close-field' - - light_files = [f'{data_dir}/{light_dir}/dataset.json'] - speak_files = [f'{data_dir}/{speak_dir}/training_dataset.json'] - speak_files.append(f'{data_dir}/{speak_dir}/test_dataset.json') - - light_train, light_dev, light_slots, light_intents = get_dataset(light_files, dev_split) - speak_train, speak_dev, speak_slots, speak_intents = get_dataset(speak_files) - - create_dataset(light_train, light_dev, light_slots, light_intents, uncased, f'{outfold}/light') - create_dataset(speak_train, speak_dev, speak_slots, speak_intents, uncased, f'{outfold}/speak') - create_dataset( - light_train + speak_train, - light_dev + speak_dev, - light_slots | speak_slots, - light_intents | speak_intents, - uncased, - f'{outfold}/all', - ) - - return outfold - - -def get_dataset(files, dev_split=0.1): - entity2value, value2entity = get_entities(files) - data, slots, intents = get_data(files, entity2value, value2entity) - if len(data) == 1: - train, dev = partition(data[0], split=dev_split) - else: - train, dev = data[0], data[1] - return train, dev, slots, intents - - -def partition(data, split=0.1): - n = len(data) - n_dev = int(n * split) - dev_idx = set(random.sample(range(n), n_dev)) - dev, train = [], [] - - for i, item in enumerate(data): - if i in dev_idx: - dev.append(item) - else: - train.append(item) - return train, dev - - -def map_entities(entity2value, entities): - for key in entities: - if 'data' in entities[key]: - if key not in entity2value: - entity2value[key] = set([]) - - values = [] - for value in entities[key]['data']: - values.append(value['value']) - values.extend(value['synonyms']) - entity2value[key] = entity2value[key] | set(values) - - return entity2value - - -def get_entities(files): - entity2value = {} - for file in files: - with open(file, 'r') as json_file: - data = json.load(json_file) - entity2value = map_entities(entity2value, data['entities']) - - value2entity = reverse_dict(entity2value) - return entity2value, value2entity - - -def get_data(files, entity2value, value2entity): - all_data, all_slots, all_intents = [], set(['O']), set() - for file in files: - file_data = [] - with open(file, 'r') as json_file: - data = json.load(json_file) - for intent in data['intents']: - all_intents.add(intent) - utterances = data['intents'][intent]['utterances'] - for utterance in utterances: - tokens, slots = [], [] - for frag in utterance['data']: - frag_tokens = frag['text'].strip().split() - tokens.extend(frag_tokens) - if 'slot_name' not in frag: - slot = 'O' - else: - slot = frag['slot_name'] - all_slots.add(slot) - slots.extend([slot] * len(frag_tokens)) - file_data.append((tokens, slots, intent)) - all_data.append(file_data) - return all_data, all_slots, all_intents - - -def reverse_dict(entity2value): - value2entity = {} - for entity in entity2value: - for value in entity2value[entity]: - value2entity[value] = entity - return value2entity - - -def 
get_intent_labels(intent_file): - labels = {} - label = 0 - with open(intent_file, 'r') as f: - for line in f: - intent = line.strip() - labels[intent] = label - label += 1 - return labels - - -def download_wkt2(data_dir): - if os.path.exists(data_dir): - return - os.makedirs('data/lm', exist_ok=True) - logging.warning(f'Data not found at {data_dir}. ' f'Downloading wikitext-2 to data/lm') - data_dir = 'data/lm/wikitext-2' - subprocess.call('../scripts/get_wkt2.sh') - return data_dir - - -def normalize_answer(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - - def remove_articles(text): - return re.sub(r'\b(a|an|the)\b', ' ', text) - - def white_space_fix(text): - return ' '.join(text.split()) - - def remove_punc(text): - exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) - - def lower(text): - return text.lower() - - return white_space_fix(remove_articles(remove_punc(lower(s)))) - - -def get_tokens(s): - if not s: - return [] - return normalize_answer(s).split() - - -def get_stats(lengths): - lengths = np.asarray(lengths) - logging.info( - f'Min: {np.min(lengths)} | \ - Max: {np.max(lengths)} | \ - Mean: {np.mean(lengths)} | \ - Median: {np.median(lengths)}' - ) - logging.info(f'75 percentile: {np.percentile(lengths, 75)}') - logging.info(f'99 percentile: {np.percentile(lengths, 99)}') diff --git a/nemo/collections/nlp/utils/loss_utils.py b/nemo/collections/nlp/data/datasets/datasets_utils/__init__.py similarity index 57% rename from nemo/collections/nlp/utils/loss_utils.py rename to nemo/collections/nlp/data/datasets/datasets_utils/__init__.py index a4d3da6ef10f..01b4fb116b57 100644 --- a/nemo/collections/nlp/utils/loss_utils.py +++ b/nemo/collections/nlp/data/datasets/datasets_utils/__init__.py @@ -14,29 +14,7 @@ # limitations under the License. # ============================================================================= -import math - -__all__ = ['_compute_softmax'] - - -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import * +from nemo.collections.nlp.data.datasets.datasets_utils.datasets_processing import * +from nemo.collections.nlp.data.datasets.datasets_utils.dialogflow_utils import * +from nemo.collections.nlp.data.datasets.datasets_utils.mturk_utils import * diff --git a/nemo/collections/nlp/data/datasets/datasets_utils/data_preprocessing.py b/nemo/collections/nlp/data/datasets/datasets_utils/data_preprocessing.py new file mode 100644 index 000000000000..7bc816f2bbd3 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/datasets_utils/data_preprocessing.py @@ -0,0 +1,332 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import csv +import json +import os +import pickle +import random +import re +import string +from collections import Counter + +import numpy as np + +from nemo import logging + +__all__ = [ + 'get_label_stats', + 'partition_data', + 'write_files', + 'write_data', + 'create_dataset', + 'read_csv', + 'get_dataset', + 'partition', + 'map_entities', + 'get_entities', + 'get_data', + 'reverse_dict', + 'get_intent_labels', + 'get_stats', + 'DATABASE_EXISTS_TMP', + 'MODE_EXISTS_TMP', + 'is_whitespace', + 'write_vocab', + 'if_exist', + 'remove_punctuation_from_sentence', + 'dataset_to_ids', + 'calc_class_weights', +] + +DATABASE_EXISTS_TMP = '{} dataset has already been processed and stored at {}' +MODE_EXISTS_TMP = '{} mode of {} dataset has already been processed and stored at {}' + + +def get_label_stats(labels, outfile='stats.tsv'): + ''' + + Args: + labels: list of all labels + outfile: path to the file where to save label stats + + Returns: + total (int): total number of labels + label_frequencies (list of tuples): each tuple represent (label, label frequency) + ''' + labels = Counter(labels) + total = sum(labels.values()) + out = open(outfile, 'w') + i = 0 + label_frequencies = labels.most_common() + for k, v in label_frequencies: + out.write(f'{k}\t{v / total}\n') + if i < 3: + logging.info(f'{i} item: {k}, {v} out of {total}, {v / total}.') + i += 1 + return total, label_frequencies + + +def partition_data(intent_queries, slot_tags, split=0.1): + n = len(intent_queries) + n_dev = int(n * split) + dev_idx = set(random.sample(range(n), n_dev)) + dev_intents, dev_slots, train_intents, train_slots = [], [], [], [] + + dev_intents.append('sentence\tlabel\n') + train_intents.append('sentence\tlabel\n') + + for i, item in enumerate(intent_queries): + if i in dev_idx: + dev_intents.append(item) + dev_slots.append(slot_tags[i]) + else: + train_intents.append(item) + train_slots.append(slot_tags[i]) + return train_intents, train_slots, dev_intents, dev_slots + + +def write_files(data, outfile): + with open(outfile, 'w') as f: + for item in data: + item = f'{item.strip()}\n' + f.write(item) + + +def write_data(data, slot_dict, intent_dict, outfold, mode, uncased): + intent_file = open(f'{outfold}/{mode}.tsv', 'w') + intent_file.write('sentence\tlabel\n') + slot_file = open(f'{outfold}/{mode}_slots.tsv', 'w') + for tokens, slots, intent in data: + text = ' '.join(tokens) + if uncased: + text = text.lower() + intent_file.write(f'{text}\t{intent_dict[intent]}\n') + slots = [str(slot_dict[slot]) for slot in slots] + slot_file.write(' '.join(slots) + '\n') + intent_file.close() + slot_file.close() + + +def create_dataset(train, dev, slots, intents, uncased, outfold): + os.makedirs(outfold, exist_ok=True) + if 'O' in slots: + slots.remove('O') + slots = sorted(list(slots)) + ['O'] + intents = sorted(list(intents)) + slots = write_vocab(slots, f'{outfold}/dict.slots.csv') + intents = write_vocab(intents, f'{outfold}/dict.intents.csv') + write_data(train, slots, intents, outfold, 'train', uncased) + write_data(dev, 
slots, intents, outfold, 'test', uncased) + + +def read_csv(file_path): + rows = [] + with open(file_path, 'r') as csvfile: + read_csv = csv.reader(csvfile, delimiter=',') + for row in read_csv: + rows.append(row) + return rows + + +def get_dataset(files, dev_split=0.1): + entity2value, value2entity = get_entities(files) + data, slots, intents = get_data(files, entity2value, value2entity) + if len(data) == 1: + train, dev = partition(data[0], split=dev_split) + else: + train, dev = data[0], data[1] + return train, dev, slots, intents + + +def partition(data, split=0.1): + n = len(data) + n_dev = int(n * split) + dev_idx = set(random.sample(range(n), n_dev)) + dev, train = [], [] + + for i, item in enumerate(data): + if i in dev_idx: + dev.append(item) + else: + train.append(item) + return train, dev + + +def map_entities(entity2value, entities): + for key in entities: + if 'data' in entities[key]: + if key not in entity2value: + entity2value[key] = set([]) + + values = [] + for value in entities[key]['data']: + values.append(value['value']) + values.extend(value['synonyms']) + entity2value[key] = entity2value[key] | set(values) + + return entity2value + + +def get_entities(files): + entity2value = {} + for file in files: + with open(file, 'r') as json_file: + data = json.load(json_file) + entity2value = map_entities(entity2value, data['entities']) + + value2entity = reverse_dict(entity2value) + return entity2value, value2entity + + +def get_data(files, entity2value, value2entity): + all_data, all_slots, all_intents = [], set(['O']), set() + for file in files: + file_data = [] + with open(file, 'r') as json_file: + data = json.load(json_file) + for intent in data['intents']: + all_intents.add(intent) + utterances = data['intents'][intent]['utterances'] + for utterance in utterances: + tokens, slots = [], [] + for frag in utterance['data']: + frag_tokens = frag['text'].strip().split() + tokens.extend(frag_tokens) + if 'slot_name' not in frag: + slot = 'O' + else: + slot = frag['slot_name'] + all_slots.add(slot) + slots.extend([slot] * len(frag_tokens)) + file_data.append((tokens, slots, intent)) + all_data.append(file_data) + return all_data, all_slots, all_intents + + +def reverse_dict(entity2value): + value2entity = {} + for entity in entity2value: + for value in entity2value[entity]: + value2entity[value] = entity + return value2entity + + +def get_intent_labels(intent_file): + labels = {} + label = 0 + with open(intent_file, 'r') as f: + for line in f: + intent = line.strip() + labels[intent] = label + label += 1 + return labels + + +def get_stats(lengths): + lengths = np.asarray(lengths) + logging.info( + f'Min: {np.min(lengths)} | \ + Max: {np.max(lengths)} | \ + Mean: {np.mean(lengths)} | \ + Median: {np.median(lengths)}' + ) + logging.info(f'75 percentile: {np.percentile(lengths, 75)}') + logging.info(f'99 percentile: {np.percentile(lengths, 99)}') + + +def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + +def write_vocab(items, outfile): + vocab = {} + idx = 0 + with open(outfile, 'w') as f: + for item in items: + f.write(item + '\n') + vocab[item] = idx + idx += 1 + return vocab + + +def if_exist(outfold, files): + if not os.path.exists(outfold): + return False + for file in files: + if not os.path.exists(f'{outfold}/{file}'): + return False + return True + + +def remove_punctuation_from_sentence(sentence): + sentence = re.sub('[' + string.punctuation + ']', '', sentence) + sentence = sentence.lower() + 
return sentence + + +def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True): + """ + Reads dataset from file line by line, tokenizes each line with tokenizer, + and returns list of lists which corresponds to ids of tokenized strings. + + Args: + dataset: path to dataset + tokenizer: tokenizer to convert text into ids + cache_ids: if True, ids are saved to disk as pickle file + with similar name (e.g., data.txt --> data.txt.pkl) + add_bos_eos: bool, whether to add and symbols (e.g., for NMT) + Returns: + ids: list of ids which correspond to tokenized strings of the dataset + """ + + cached_ids_dataset = dataset + str(".pkl") + if os.path.isfile(cached_ids_dataset): + logging.info("Loading cached tokenized dataset ...") + ids = pickle.load(open(cached_ids_dataset, "rb")) + else: + logging.info("Tokenizing dataset ...") + data = open(dataset, "rb").readlines() + ids = [] + for sentence in data: + sent_ids = tokenizer.text_to_ids(sentence.decode("utf-8")) + if add_bos_eos: + sent_ids = [tokenizer.bos_id] + sent_ids + [tokenizer.eos_id] + ids.append(sent_ids) + if cache_ids: + logging.info("Caching tokenized dataset ...") + pickle.dump(ids, open(cached_ids_dataset, "wb")) + return ids + + +def calc_class_weights(label_freq): + """ + Goal is to give more weight to the classes with less samples + so as to match the one with the higest frequency. We achieve this by + dividing the highest frequency by the freq of each label. + Example - + [12, 5, 3] -> [12/12, 12/5, 12/3] -> [1, 2.4, 4] + + Here label_freq is assumed to be sorted by the frequency. I.e. + label_freq[0] is the most frequent element. + + """ + + most_common_label_freq = label_freq[0] + weighted_slots = sorted([(index, most_common_label_freq[1] / freq) for (index, freq) in label_freq]) + return [weight for (_, weight) in weighted_slots] diff --git a/nemo/collections/nlp/data/datasets/datasets_utils/datasets_processing.py b/nemo/collections/nlp/data/datasets/datasets_utils/datasets_processing.py new file mode 100644 index 000000000000..2f90412ed200 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/datasets_utils/datasets_processing.py @@ -0,0 +1,431 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +import csv +import glob +import json +import os +import shutil + +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import ( + DATABASE_EXISTS_TMP, + MODE_EXISTS_TMP, + create_dataset, + get_dataset, + if_exist, +) +from nemo.collections.nlp.utils import get_vocab + +__all__ = [ + 'process_atis', + 'process_jarvis_datasets', + 'process_snips', + 'process_sst_2', + 'process_imdb', + 'process_nlu', + 'process_thucnews', +] + + +def ids2text(ids, vocab): + return ' '.join([vocab[int(id_)] for id_ in ids]) + + +def process_atis(infold, uncased, modes=['train', 'test'], dev_split=0): + """ MSFT's dataset, processed by Kaggle + https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk + """ + outfold = f'{infold}/nemo-processed' + vocab = get_vocab(f'{infold}/atis.dict.vocab.csv') + + if uncased: + outfold = f'{outfold}-uncased' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold)) + return outfold + logging.info(f'Processing ATIS dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') + + queries = open(f'{infold}/atis.{mode}.query.csv', 'r').readlines() + intents = open(f'{infold}/atis.{mode}.intent.csv', 'r').readlines() + slots = open(f'{infold}/atis.{mode}.slots.csv', 'r').readlines() + + for i, query in enumerate(queries): + sentence = ids2text(query.strip().split()[1:-1], vocab) + outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n') + slot = ' '.join(slots[i].strip().split()[1:-1]) + outfiles[mode + '_slots'].write(slot + '\n') + + shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv') + shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv') + for mode in modes: + outfiles[mode].close() + + return outfold + + +def process_jarvis_datasets(infold, uncased, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False): + """ process and convert Jarvis datasets into NeMo's BIO format + """ + outfold = f'{infold}/{dataset_name}-nemo-processed' + infold = f'{infold}/' + + if uncased: + outfold = f'{outfold}-uncased' + + if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']): + logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold)) + return outfold + + logging.info(f'Processing {dataset_name} dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + intents_list = {} + slots_list = {} + slots_list_all = {} + + outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w') + outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w') + + outfiles['dict_slots'].write('O\n') + slots_list["O"] = 0 + slots_list_all["O"] = 0 + + for mode in modes: + if if_exist(outfold, [f'{mode}.tsv']): + logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode)) + continue + + if not if_exist(infold, [f'{mode}.tsv']): + logging.info(f'{mode} mode of {dataset_name}' f' is skipped as it was not found.') + continue + + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') + + queries = open(f'{infold}/{mode}.tsv', 
'r').readlines() + + for i, query in enumerate(queries): + line_splits = query.strip().split("\t") + if len(line_splits) == 3: + intent_str, slot_tags_str, sentence = line_splits + else: + intent_str, sentence = line_splits + slot_tags_str = "" + + if intent_str not in intents_list: + intents_list[intent_str] = len(intents_list) + outfiles['dict_intents'].write(f'{intent_str}\n') + + if ignore_prev_intent: + start_token = 2 + else: + start_token = 1 + sentence_cld = " ".join(sentence.strip().split()[start_token:-1]) + outfiles[mode].write(f'{sentence_cld}\t' f'{str(intents_list[intent_str])}\n') + + slot_tags_list = [] + if slot_tags_str.strip(): + slot_tags = slot_tags_str.strip().split(",") + for st in slot_tags: + if not st.strip(): + continue + [start_i, end_i, slot_name] = st.strip().split(":") + slot_tags_list.append([int(start_i), int(end_i), slot_name]) + if slot_name not in slots_list: + slots_list[slot_name] = len(slots_list) + slots_list_all[f'B-{slot_name}'] = len(slots_list_all) + slots_list_all[f'I-{slot_name}'] = len(slots_list_all) + outfiles['dict_slots'].write(f'B-{slot_name}\n') + outfiles['dict_slots'].write(f'I-{slot_name}\n') + + slot_tags_list.sort(key=lambda x: x[0]) + slots = [] + processed_index = 0 + for tag_start, tag_end, tag_str in slot_tags_list: + if tag_start > processed_index: + words_list = sentence[processed_index:tag_start].strip().split() + slots.extend([str(slots_list_all['O'])] * len(words_list)) + words_list = sentence[tag_start:tag_end].strip().split() + slots.append(str(slots_list_all[f'B-{tag_str}'])) + slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1)) + processed_index = tag_end + + if processed_index < len(sentence): + words_list = sentence[processed_index:].strip().split() + slots.extend([str(slots_list_all['O'])] * len(words_list)) + + slots = slots[1:-1] + slot = ' '.join(slots) + outfiles[mode + '_slots'].write(slot + '\n') + + outfiles[mode + '_slots'].close() + outfiles[mode].close() + + outfiles['dict_slots'].close() + outfiles['dict_intents'].close() + + return outfold + + +def process_snips(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): + if not os.path.exists(data_dir): + link = 'www.github.com/snipsco/spoken-language' + '-understanding-research-datasets' + raise ValueError(f'Data not found at {data_dir}. 
' f'Resquest to download the SNIPS dataset from {link}.') + + outfold = f'{data_dir}/nemo-processed' + + if uncased: + outfold = f'{outfold}-uncased' + + exist = True + for dataset in ['light', 'speak', 'all']: + if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('SNIPS-' + dataset.upper(), outfold)) + else: + exist = False + if exist: + return outfold + + logging.info(f'Processing SNIPS dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + speak_dir = 'smart-speaker-en-close-field' + light_dir = 'smart-lights-en-close-field' + + light_files = [f'{data_dir}/{light_dir}/dataset.json'] + speak_files = [f'{data_dir}/{speak_dir}/training_dataset.json'] + speak_files.append(f'{data_dir}/{speak_dir}/test_dataset.json') + + light_train, light_dev, light_slots, light_intents = get_dataset(light_files, dev_split) + speak_train, speak_dev, speak_slots, speak_intents = get_dataset(speak_files) + + create_dataset(light_train, light_dev, light_slots, light_intents, uncased, f'{outfold}/light') + create_dataset(speak_train, speak_dev, speak_slots, speak_intents, uncased, f'{outfold}/speak') + create_dataset( + light_train + speak_train, + light_dev + speak_dev, + light_slots | speak_slots, + light_intents | speak_intents, + uncased, + f'{outfold}/all', + ) + + return outfold + + +def process_sst_2(data_dir): + if not os.path.exists(data_dir): + link = 'https://gluebenchmark.com/tasks' + raise ValueError(f'Data not found at {data_dir}. ' f'Please download SST-2 from {link}.') + logging.info('Keep in mind that SST-2 is only available in lower case.') + return data_dir + + +def process_imdb(data_dir, uncased, modes=['train', 'test']): + if not os.path.exists(data_dir): + link = 'www.kaggle.com/iarunava/imdb-movie-reviews-dataset' + raise ValueError(f'Data not found at {data_dir}. ' f'Please download IMDB from {link}.') + + outfold = f'{data_dir}/nemo-processed' + + if uncased: + outfold = f'{outfold}_uncased' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('IMDB', outfold)) + return outfold + logging.info(f'Processing IMDB dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + for sent in ['neg', 'pos']: + if sent == 'neg': + label = 0 + else: + label = 1 + files = glob.glob(f'{data_dir}/{mode}/{sent}/*.txt') + for file in files: + with open(file, 'r') as f: + review = f.read().strip() + if uncased: + review = review.lower() + review = review.replace("
", "") + outfiles[mode].write(f'{review}\t{label}\n') + for mode in modes: + outfiles[mode].close() + + return outfold + + +def process_thucnews(data_dir): + modes = ['train', 'test'] + train_size = 0.8 + if not os.path.exists(data_dir): + link = 'thuctc.thunlp.org/' + raise ValueError(f'Data not found at {data_dir}. ' f'Please download THUCNews from {link}.') + + outfold = f'{data_dir}/nemo-processed-thucnews' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('THUCNews', outfold)) + return outfold + logging.info(f'Processing THUCNews dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'a+', encoding='utf-8') + outfiles[mode].write('sentence\tlabel\n') + categories = ['体育', '娱乐', '家居', '彩票', '房产', '教育', '时尚', '时政', '星座', '游戏', '社会', '科技', '股票', '财经'] + for category in categories: + label = categories.index(category) + category_files = glob.glob(f'{data_dir}/{category}/*.txt') + test_num = int(len(category_files) * (1 - train_size)) + test_files = category_files[:test_num] + train_files = category_files[test_num:] + for mode in modes: + logging.info(f'Processing {mode} data of the category {category}') + if mode == 'test': + files = test_files + else: + files = train_files + for file in tqdm(files): + with open(file, 'r', encoding='utf-8') as f: + news = f.read().strip().replace('\r', '') + news = news.replace('\n', '').replace('\t', ' ') + outfiles[mode].write(f'{news}\t{label}\n') + for mode in modes: + outfiles[mode].close() + + return outfold + + +def process_nlu(filename, uncased, modes=['train', 'test'], dataset_name='nlu-ubuntu'): + """ Dataset has to be of: + - ubuntu + - chat + - web + """ + + if not os.path.exists(filename): + link = 'https://github.com/sebischair/NLU-Evaluation-Corpora' + raise ValueError(f'Data not found at {filename}. 
' f'Please download IMDB from {link}.') + + if dataset_name == 'nlu-ubuntu': + INTENT = {'makeupdate': 1, 'setupprinter': 2, 'shutdowncomputer': 3, 'softwarerecommendation': 4, 'none': 0} + elif dataset_name == 'nlu-chat': + INTENT = {'departuretime': 0, 'findconnection': 1} + elif dataset_name == 'nlu-web': + INTENT = { + 'changepassword': 1, + 'deleteaccount': 2, + 'downloadvideo': 3, + 'exportdata': 4, + 'filterspam': 5, + 'findalternative': 6, + 'syncaccounts': 7, + 'none': 0, + } + else: + raise ValueError(f'{dataset_name}: Invalid dataset name') + + infold = filename[: filename.rfind('/')] + outfold = f'{infold}/{dataset_name}-nemo-processed' + + if uncased: + outfold = f'{outfold}_uncased' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format(dataset_name.upper(), outfold)) + return outfold + logging.info(f'Processing data and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + + with open(filename, 'r') as f: + data = json.load(f) + + for obj in data['sentences']: + sentence = obj['text'].strip() + if uncased: + sentence = sentence.lower() + intent = obj['intent'].lower().replace(' ', '') + label = INTENT[intent] + txt = f'{sentence}\t{label}\n' + if obj['training']: + outfiles['train'].write(txt) + else: + outfiles['test'].write(txt) + for mode in modes: + outfiles[mode].close() + return outfold + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8-sig") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + # if sys.version_info[0] == 2: + # line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines diff --git a/nemo/collections/nlp/data/datasets/datasets_utils/dialogflow_utils.py b/nemo/collections/nlp/data/datasets/datasets_utils/dialogflow_utils.py new file mode 100644 index 000000000000..0ce116f67e38 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/datasets_utils/dialogflow_utils.py @@ -0,0 +1,113 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +import json +import os + +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import partition_data, write_files + +__all__ = [ + 'get_intent_query_files_dialogflow', + 'get_intents_slots_dialogflow', + 'get_slots_dialogflow', + 'process_dialogflow', +] + + +def get_intent_query_files_dialogflow(path): + fileslist = [] + for root, _, files in os.walk(path): + for file in files: + if '_usersays_en.json' in file: + fileslist.append(os.path.join(root, file)) + return fileslist + + +def get_intents_slots_dialogflow(files, slot_labels): + intent_names = [] + intent_queries = [] + slot_tags = [] + + for index, file in enumerate(files): + intent_names.append(os.path.basename(file).split('_usersays')[0]) + + with open(file) as json_file: + intent_data = json.load(json_file) + for query in intent_data: + query_text = "" + slots = "" + for segment in query['data']: + query_text = ''.join([query_text, segment['text']]) + if 'alias' in segment: + for _ in segment['text'].split(): + slots = ' '.join([slots, slot_labels.get(segment['alias'])]) + else: + for _ in segment['text'].split(): + slots = ' '.join([slots, slot_labels.get('O')]) + query_text = f'{query_text.strip()}\t{index}\n' + intent_queries.append(query_text) + slots = f'{slots.strip()}\n' + slot_tags.append(slots) + return intent_queries, intent_names, slot_tags + + +def get_slots_dialogflow(files): + slot_labels = {} + count = 0 + for file in files: + intent_head_file = ''.join([file.split('_usersays')[0], '.json']) + with open(intent_head_file) as json_file: + intent_meta_data = json.load(json_file) + for params in intent_meta_data['responses'][0]['parameters']: + if params['name'] not in slot_labels: + slot_labels[params['name']] = str(count) + count += 1 + slot_labels['O'] = str(count) + return slot_labels + + +def process_dialogflow(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): + if not os.path.exists(data_dir): + link = 'www.dialogflow.com' + raise ValueError( + f'Data not found at {data_dir}. ' f'Export your dialogflow data from' f'{link} and unzip at {data_dir}.' + ) + + outfold = f'{data_dir}/dialogflow/nemo-processed' + + '''TO DO - check for nemo-processed directory + already exists. If exists, skip the entire creation steps below. ''' + + os.makedirs(outfold, exist_ok=True) + + files = get_intent_query_files_dialogflow(data_dir) + + slot_labels = get_slots_dialogflow(files) + + intent_queries, intent_names, slot_tags = get_intents_slots_dialogflow(files, slot_labels) + + train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) + + write_files(train_queries, f'{outfold}/train.tsv') + write_files(train_slots, f'{outfold}/train_slots.tsv') + + write_files(test_queries, f'{outfold}/test.tsv') + write_files(test_slots, f'{outfold}/test_slots.tsv') + + write_files(slot_labels, f'{outfold}/dict.slots.csv') + write_files(intent_names, f'{outfold}/dict.intents.csv') + + return outfold diff --git a/nemo/collections/nlp/data/datasets/datasets_utils/mturk_utils.py b/nemo/collections/nlp/data/datasets/datasets_utils/mturk_utils.py new file mode 100644 index 000000000000..0269962ba1b5 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/datasets_utils/mturk_utils.py @@ -0,0 +1,201 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import json +import os + +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import ( + DATABASE_EXISTS_TMP, + if_exist, + partition_data, + read_csv, + write_files, +) + +__all__ = ['process_mturk', 'process_intent_slot_mturk', 'get_intents_mturk', 'get_slot_labels'] + + +def process_mturk(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): + if not os.path.exists(data_dir): + link = 'www.mturk.com' + raise ValueError( + f'Data not found at {data_dir}. ' f'Export your mturk data from' f'{link} and unzip at {data_dir}.' + ) + + outfold = f'{data_dir}/nemo-processed' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('mturk', outfold)) + return outfold + + logging.info(f'Processing dataset from mturk and storing at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + classification_data_file = f'{data_dir}/classification.csv' + annotation_data_file = f'{data_dir}/annotation.manifest' + + if not os.path.exists(classification_data_file): + raise FileNotFoundError(f'File not found ' f'at {classification_data_file}') + + if not os.path.exists(annotation_data_file): + raise FileNotFoundError(f'File not found at {annotation_data_file}') + + utterances = [] + utterances = read_csv(classification_data_file) + + # This function assumes that the intent classification data has been + # reviewed and cleaned and only one label per utterance is present. 
+ agreed_all, intent_names = get_intents_mturk(utterances, outfold) + + with open(annotation_data_file, 'r') as f: + slot_annotations = f.readlines() + + # This function assumes that the preprocess step would have made + # the task_name of all the annotations generic + task_name = 'retail-combined' + + # It is assumed that every utterances will have corresponding + # slot annotation information + if len(slot_annotations) < len(agreed_all): + raise ValueError(f'Every utterance must have corresponding' f'slot annotation information') + + slot_labels, intent_queries, slot_tags = process_intent_slot_mturk( + slot_annotations, agreed_all, intent_names, task_name + ) + + assert len(slot_tags) == len(intent_queries) + + dev_split = 0.1 + + train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) + + write_files(train_queries, f'{outfold}/train.tsv') + write_files(train_slots, f'{outfold}/train_slots.tsv') + + write_files(test_queries, f'{outfold}/test.tsv') + write_files(test_slots, f'{outfold}/test_slots.tsv') + + write_files(slot_labels, f'{outfold}/dict.slots.csv') + write_files(intent_names, f'{outfold}/dict.intents.csv') + + return outfold + + +def process_intent_slot_mturk(slot_annotations, agreed_all, intent_names, task_name): + slot_tags = [] + inorder_utterances = [] + all_labels = get_slot_labels(slot_annotations, task_name) + logging.info(f'agreed_all - {len(agreed_all)}') + logging.info(f'Slot annotations - {len(slot_annotations)}') + + for annotation in slot_annotations[0:]: + an = json.loads(annotation) + utterance = an['source'] + if len(utterance) > 2 and utterance.startswith('"') and utterance.endswith('"'): + utterance = utterance[1:-1] + + if utterance in agreed_all: + entities = {} + annotated_entities = an[task_name]['annotations']['entities'] + for i, each_anno in enumerate(annotated_entities): + entities[int(each_anno['startOffset'])] = i + + lastptr = 0 + slotlist = [] + # sorting annotations by the start offset + for i in sorted(entities.keys()): + annotated_entities = an[task_name]['annotations']['entities'] + tags = annotated_entities[entities.get(i)] + untagged_words = utterance[lastptr : tags['startOffset']] + for _ in untagged_words.split(): + slotlist.append(all_labels.get('O')) + anno_words = utterance[tags['startOffset'] : tags['endOffset']] + # tagging with the IOB format. 
+ for j, _ in enumerate(anno_words.split()): + if j == 0: + b_slot = 'B-' + tags['label'] + slotlist.append(all_labels.get(b_slot)) + else: + i_slot = 'I-' + tags['label'] + slotlist.append(all_labels.get(i_slot)) + lastptr = tags['endOffset'] + + untagged_words = utterance[lastptr : len(utterance)] + for _ in untagged_words.split(): + slotlist.append(all_labels.get('O')) + + slotstr = ' '.join(slotlist) + slotstr = f'{slotstr.strip()}\n' + + slot_tags.append(slotstr) + intent_num = intent_names.get(agreed_all.get(utterance)) + query_text = f'{utterance.strip()}\t{intent_num}\n' + inorder_utterances.append(query_text) + # else: + # logging.warning(utterance) + + logging.info(f'inorder utterances - {len(inorder_utterances)}') + + return all_labels, inorder_utterances, slot_tags + + +def get_intents_mturk(utterances, outfold): + intent_names = {} + intent_count = 0 + + agreed_all = {} + + logging.info('Printing all intent_labels') + intent_dict = f'{outfold}/dict.intents.csv' + if os.path.exists(intent_dict): + with open(intent_dict, 'r') as f: + for intent_name in f.readlines(): + intent_names[intent_name.strip()] = intent_count + intent_count += 1 + logging.info(intent_names) + + for i, utterance in enumerate(utterances[1:]): + + if utterance[1] not in agreed_all: + agreed_all[utterance[0]] = utterance[1] + + if utterance[1] not in intent_names: + intent_names[utterance[1]] = intent_count + intent_count += 1 + + logging.info(f'Total number of utterance samples: {len(agreed_all)}') + + return agreed_all, intent_names + + +def get_slot_labels(slot_annotations, task_name): + slot_labels = json.loads(slot_annotations[0]) + + all_labels = {} + count = 0 + # Generating labels with the IOB format. + for label in slot_labels[task_name]['annotations']['labels']: + b_slot = 'B-' + label['label'] + i_slot = 'I-' + label['label'] + all_labels[b_slot] = str(count) + count += 1 + all_labels[i_slot] = str(count) + count += 1 + all_labels['O'] = str(count) + + return all_labels diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py deleted file mode 100644 index 26423c3aa549..000000000000 --- a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py +++ /dev/null @@ -1,593 +0,0 @@ -""" -Copyright 2018 The Google AI Language Team Authors and -The HuggingFace Inc. team. -Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
- -Utility functions for GLUE tasks -Some transformer of this code were adapted from the HuggingFace library at -https://github.com/huggingface/transformers -""" -import csv -import os - -import numpy as np -from torch.utils.data import Dataset - -from nemo import logging - -__all__ = ['GLUEDataset'] - - -class GLUEDataset(Dataset): - def __init__(self, data_dir, tokenizer, max_seq_length, processor, output_mode, evaluate, token_params): - self.tokenizer = tokenizer - self.label_list = processor.get_labels() - self.examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) - self.features = convert_examples_to_features( - self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params - ) - - def __len__(self): - return len(self.features) - - def __getitem__(self, idx): - feature = self.features[idx] - return ( - np.array(feature.input_ids), - np.array(feature.segment_ids), - np.array(feature.input_mask, dtype=np.long), - np.array(feature.label_id), - ) - - -def convert_examples_to_features( - examples, - label_list, - max_seq_length, - tokenizer, - output_mode, - bos_token=None, - eos_token='[SEP]', - pad_token='[PAD]', - cls_token='[CLS]', - sep_token_extra=None, - cls_token_at_end=False, - cls_token_segment_id=0, - pad_token_segment_id=0, - pad_on_left=False, - mask_padding_with_zero=True, - sequence_a_segment_id=0, - sequence_b_segment_id=1, -): - """ Loads a data file into a list of `InputBatch`s - `cls_token_at_end` define the location of the CLS token: - - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - `cls_token_segment_id` define the segment id associated to the CLS - token (0 for BERT, 2 for XLNet) - The convention in BERT is: - (a) For sequence pairs: - tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] - type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 - (b) For single sequences: - tokens: [CLS] the dog is hairy . [SEP] - type_ids: 0 0 0 0 0 0 0 - Where "type_ids" are used to indicate whether this is the first - sequence or the second sequence. The embedding vectors for `type=0` - and `type=1` were learned during pre-training and are added to the - wordpiece embedding vector (and position vector). This is - not *strictly* necessarysince the [SEP] token unambiguously separates - the sequences, but it makes it easier for the model to learn - the concept of sequences. - For classification tasks, the first vector (corresponding to [CLS]) - is used as as the "sentence vector". Note that this only makes sense - because the entire model is fine-tuned. - For NMT: - (a) For sequence pairs: - tokens: is this jack ##ville ? no it is not . - type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 - (b) For single sequences: - tokens: the dog is hairy . 
- type_ids: 0 0 0 0 0 0 0 - """ - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for ex_index, example in enumerate(examples): - if ex_index % 10000 == 0: - logging.info("Writing example %d of %d" % (ex_index, len(examples))) - - tokens_a = tokenizer.text_to_tokens(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.text_to_tokens(example.text_b) - - special_tokens_count = 2 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 2 if bos_token else 0 - special_tokens_count += 1 if cls_token else 0 - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - special_tokens_count = 1 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 1 if bos_token else 0 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[: max_seq_length - special_tokens_count] - # Add special tokens to sequence_a - tokens = tokens_a - if bos_token: - tokens = [bos_token] + tokens - if eos_token: - tokens += [eos_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - # Add sequence separator between sequences - if tokens_b and sep_token_extra: - tokens += [sep_token_extra] - segment_ids += [sequence_a_segment_id] - - # Add special tokens to sequence_b - if tokens_b: - if bos_token: - tokens += [bos_token] - segment_ids += [sequence_b_segment_id] - tokens += tokens_b - segment_ids += [sequence_b_segment_id] * (len(tokens_b)) - if eos_token: - tokens += [eos_token] - segment_ids += [sequence_b_segment_id] - - # Add classification token - for BERT models - if cls_token: - if cls_token_at_end: - tokens += [cls_token] - segment_ids += [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - input_ids = tokenizer.tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. 
- padding_length = max_seq_length - len(input_ids) - pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] - if pad_on_left: - input_ids = ([pad_token_id] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - else: - input_ids = input_ids + ([pad_token_id] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) - if len(input_ids) != max_seq_length: - raise ValueError("input_ids must be of length max_seq_length") - if len(input_mask) != max_seq_length: - raise ValueError("input_mask must be of length max_seq_length") - if len(segment_ids) != max_seq_length: - raise ValueError("segment_ids must be of length max_seq_length") - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = np.float32(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - logging.info("*** Example ***") - logging.info("guid: %s" % (example.guid)) - logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) - logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) - logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) - logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) - logging.info("label: %s (id = %d)" % (example.label, label_id)) - - features.append( - InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id) - ) - return features - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length. - - This will always truncate the longer sequence one token at a time. - This makes more sense than truncating an equal percent - of tokens from each, since if one sequence is very short then each token - that's truncated likely contains more information than a longer sequence. - """ - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -""" -Utility functions for GLUE tasks -This code was adapted from the HuggingFace library at -https://github.com/huggingface/transformers -""" - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, input_ids, input_mask, segment_ids, label_id): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id - - -class InputExample(object): - """A single training/test example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. - For single sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second - sequence. Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. 
- """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding="utf-8-sig") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - # if sys.version_info[0] == 2: - # line = list(unicode(cell, 'utf-8') for cell in line) - lines.append(line) - return lines - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}') - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[3] - text_b = line[4] - label = line[0] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[8] - text_b = line[9] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliMismatchedProcessor(MnliProcessor): - """Processor for the MultiNLI Mismatched data set (GLUE version).""" - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched") - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - 
"""See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) - text_a = line[3] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class Sst2Processor(DataProcessor): - """Processor for the SST-2 data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[0] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class StsbProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return [None] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[7] - text_b = line[8] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QqpProcessor(DataProcessor): - """Processor for the QQP data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - try: - text_a = line[3] - text_b = line[4] - label = line[5] - except IndexError: - continue - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QnliProcessor(DataProcessor): - """Processor for the QNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates 
examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class RteProcessor(DataProcessor): - """Processor for the RTE data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class WnliProcessor(DataProcessor): - """Processor for the WNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mnli-mm": MnliMismatchedProcessor, - "mrpc": MrpcProcessor, - "sst-2": Sst2Processor, - "sts-b": StsbProcessor, - "qqp": QqpProcessor, - "qnli": QnliProcessor, - "rte": RteProcessor, - "wnli": WnliProcessor, -} -output_modes = { - "cola": "classification", - "mnli": "classification", - "mnli-mm": "classification", - "mrpc": "classification", - "sst-2": "classification", - "sts-b": "regression", - "qqp": "classification", - "qnli": "classification", - "rte": "classification", - "wnli": "classification", -} -GLUE_TASKS_NUM_LABELS = { - "cola": 2, - "mnli": 3, - "mrpc": 2, - "sst-2": 2, - "sts-b": 1, - "qqp": 2, - "qnli": 2, - "rte": 2, - "wnli": 2, -} diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/__init__.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/__init__.py new file mode 100644 index 000000000000..d396af9c88fb --- /dev/null +++ b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/__init__.py @@ -0,0 +1,18 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.data_processors import * +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset import * diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/data_processors.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/data_processors.py new file mode 100644 index 000000000000..48e9297dbe98 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/data_processors.py @@ -0,0 +1,341 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +import os + +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils.datasets_processing import DataProcessor + +__all__ = [ + 'ColaProcessor', + 'MnliProcessor', + 'MnliMismatchedProcessor', + 'MrpcProcessor', + 'Sst2Processor', + 'StsbProcessor', + 'QqpProcessor', + 'QnliProcessor', + 'RteProcessor', + 'WnliProcessor', +] + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}') + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[3] + text_b = line[4] + label = line[0] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[8] + text_b = line[9] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliMismatchedProcessor(MnliProcessor): + """Processor for the MultiNLI Mismatched data set (GLUE version).""" + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched") + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line[3] + label = line[1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class Sst2Processor(DataProcessor): + """Processor for the SST-2 data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + 
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[0] + label = line[1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class StsbProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return [None] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[7] + text_b = line[8] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QqpProcessor(DataProcessor): + """Processor for the QQP data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + try: + text_a = line[3] + text_b = line[4] + label = line[5] + except IndexError: + continue + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the QNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class RteProcessor(DataProcessor): + """Processor for the RTE data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return 
self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class WnliProcessor(DataProcessor): + """Processor for the WNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. + For single sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second + sequence. Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/glue_benchmark_dataset.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/glue_benchmark_dataset.py new file mode 100644 index 000000000000..60e67639dc62 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/glue_benchmark_dataset.py @@ -0,0 +1,289 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +""" +Utility functions for GLUE tasks +Some transformer of this code were adapted from the HuggingFace library at +https://github.com/huggingface/transformers +""" + +import numpy as np +from torch.utils.data import Dataset + +from nemo import logging +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.data_processors import * + +__all__ = ['GLUEDataset', 'output_modes', 'processors'] + +processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mnli-mm": MnliMismatchedProcessor, + "mrpc": MrpcProcessor, + "sst-2": Sst2Processor, + "sts-b": StsbProcessor, + "qqp": QqpProcessor, + "qnli": QnliProcessor, + "rte": RteProcessor, + "wnli": WnliProcessor, +} +output_modes = { + "cola": "classification", + "mnli": "classification", + "mnli-mm": "classification", + "mrpc": "classification", + "sst-2": "classification", + "sts-b": "regression", + "qqp": "classification", + "qnli": "classification", + "rte": "classification", + "wnli": "classification", +} +GLUE_TASKS_NUM_LABELS = { + "cola": 2, + "mnli": 3, + "mrpc": 2, + "sst-2": 2, + "sts-b": 1, + "qqp": 2, + "qnli": 2, + "rte": 2, + "wnli": 2, +} + + +class GLUEDataset(Dataset): + def __init__(self, data_dir, tokenizer, max_seq_length, processor, output_mode, evaluate, token_params): + self.tokenizer = tokenizer + self.label_list = processor.get_labels() + self.examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) + self.features = self.convert_examples_to_features( + self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params + ) + + def __len__(self): + return len(self.features) + + def __getitem__(self, idx): + feature = self.features[idx] + return ( + np.array(feature.input_ids), + np.array(feature.segment_ids), + np.array(feature.input_mask, dtype=np.long), + np.array(feature.label_id), + ) + + def convert_examples_to_features( + self, + examples, + label_list, + max_seq_length, + tokenizer, + output_mode, + bos_token=None, + eos_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + sep_token_extra=None, + cls_token_at_end=False, + cls_token_segment_id=0, + pad_token_segment_id=0, + pad_on_left=False, + mask_padding_with_zero=True, + sequence_a_segment_id=0, + sequence_b_segment_id=1, + ): + """ Loads a data file into a list of `InputBatch`s + `cls_token_at_end` define the location of the CLS token: + - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] + - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] + `cls_token_segment_id` define the segment id associated to the CLS + token (0 
for BERT, 2 for XLNet) + The convention in BERT is: + (a) For sequence pairs: + tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] + type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 + (b) For single sequences: + tokens: [CLS] the dog is hairy . [SEP] + type_ids: 0 0 0 0 0 0 0 + Where "type_ids" are used to indicate whether this is the first + sequence or the second sequence. The embedding vectors for `type=0` + and `type=1` were learned during pre-training and are added to the + wordpiece embedding vector (and position vector). This is + not *strictly* necessarysince the [SEP] token unambiguously separates + the sequences, but it makes it easier for the model to learn + the concept of sequences. + For classification tasks, the first vector (corresponding to [CLS]) + is used as as the "sentence vector". Note that this only makes sense + because the entire model is fine-tuned. + For NMT: + (a) For sequence pairs: + tokens: is this jack ##ville ? no it is not . + type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 + (b) For single sequences: + tokens: the dog is hairy . + type_ids: 0 0 0 0 0 0 0 + """ + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for ex_index, example in enumerate(examples): + if ex_index % 10000 == 0: + logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + tokens_a = tokenizer.text_to_tokens(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.text_to_tokens(example.text_b) + + special_tokens_count = 2 if eos_token else 0 + special_tokens_count += 1 if sep_token_extra else 0 + special_tokens_count += 2 if bos_token else 0 + special_tokens_count += 1 if cls_token else 0 + self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) + else: + special_tokens_count = 1 if eos_token else 0 + special_tokens_count += 1 if sep_token_extra else 0 + special_tokens_count += 1 if bos_token else 0 + if len(tokens_a) > max_seq_length - special_tokens_count: + tokens_a = tokens_a[: max_seq_length - special_tokens_count] + # Add special tokens to sequence_a + tokens = tokens_a + if bos_token: + tokens = [bos_token] + tokens + if eos_token: + tokens += [eos_token] + segment_ids = [sequence_a_segment_id] * len(tokens) + + # Add sequence separator between sequences + if tokens_b and sep_token_extra: + tokens += [sep_token_extra] + segment_ids += [sequence_a_segment_id] + + # Add special tokens to sequence_b + if tokens_b: + if bos_token: + tokens += [bos_token] + segment_ids += [sequence_b_segment_id] + tokens += tokens_b + segment_ids += [sequence_b_segment_id] * (len(tokens_b)) + if eos_token: + tokens += [eos_token] + segment_ids += [sequence_b_segment_id] + + # Add classification token - for BERT models + if cls_token: + if cls_token_at_end: + tokens += [cls_token] + segment_ids += [cls_token_segment_id] + else: + tokens = [cls_token] + tokens + segment_ids = [cls_token_segment_id] + segment_ids + input_ids = tokenizer.tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. 
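# Editor's note: an illustrative, standalone sketch (not part of the original patch).
# The code that follows right-pads input_ids, input_mask and segment_ids to exactly
# max_seq_length (or left-pads them when pad_on_left is set). A minimal version with
# hypothetical ids, assuming mask_padding_with_zero=True so padded positions get mask 0:
def pad_to_length(input_ids, input_mask, segment_ids, max_len, pad_id=0, pad_seg_id=0, pad_on_left=False):
    pad_len = max_len - len(input_ids)
    if pad_on_left:
        return ([pad_id] * pad_len + input_ids,
                [0] * pad_len + input_mask,
                [pad_seg_id] * pad_len + segment_ids)
    return (input_ids + [pad_id] * pad_len,
            input_mask + [0] * pad_len,
            segment_ids + [pad_seg_id] * pad_len)

# A 3-token example padded to max_len=5:
assert pad_to_length([101, 2023, 102], [1, 1, 1], [0, 0, 0], 5) == (
    [101, 2023, 102, 0, 0], [1, 1, 1, 0, 0], [0, 0, 0, 0, 0])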
+ padding_length = max_seq_length - len(input_ids) + pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] + if pad_on_left: + input_ids = ([pad_token_id] * padding_length) + input_ids + input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask + segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids + else: + input_ids = input_ids + ([pad_token_id] * padding_length) + input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) + if len(input_ids) != max_seq_length: + raise ValueError("input_ids must be of length max_seq_length") + if len(input_mask) != max_seq_length: + raise ValueError("input_mask must be of length max_seq_length") + if len(segment_ids) != max_seq_length: + raise ValueError("segment_ids must be of length max_seq_length") + if output_mode == "classification": + label_id = label_map[example.label] + elif output_mode == "regression": + label_id = np.float32(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 5: + logging.info("*** Example ***") + logging.info("guid: %s" % (example.guid)) + logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) + logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) + logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) + logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) + logging.info("label: %s (id = %d)" % (example.label, label_id)) + + features.append( + InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id) + ) + return features + + def _truncate_seq_pair(self, tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length. + + This will always truncate the longer sequence one token at a time. + This makes more sense than truncating an equal percent + of tokens from each, since if one sequence is very short then each token + that's truncated likely contains more information than a longer sequence. + """ + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + """ + Utility functions for GLUE tasks + This code was adapted from the HuggingFace library at + https://github.com/huggingface/transformers + """ + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/__init__.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/__init__.py new file mode 100644 index 000000000000..3717507be2e2 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/__init__.py @@ -0,0 +1,18 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.inference_utils import * +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.joint_intent_slot_dataset import * diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/data_descriptor.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/data_descriptor.py new file mode 100644 index 000000000000..e298f4196bdc --- /dev/null +++ b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/data_descriptor.py @@ -0,0 +1,254 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import itertools +import os + +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils import ( + DATABASE_EXISTS_TMP, + calc_class_weights, + get_label_stats, + if_exist, + process_atis, + process_dialogflow, + process_jarvis_datasets, + process_mturk, + process_snips, +) +from nemo.collections.nlp.utils import get_vocab, list2str + +__all__ = ['JointIntentSlotDataDesc'] + + +class JointIntentSlotDataDesc: + """ Convert the raw data to the standard format supported by + JointIntentSlotDataset. + + By default, the None label for slots is 'O'. + + JointIntentSlotDataset requires two files: + + input_file: file to sequence + label. + the first line is header (sentence [tab] label) + each line should be [sentence][tab][label] + + slot_file: file to slot labels, each line corresponding to + slot labels for a sentence in input_file. No header. + + To keep the mapping from label index to label consistent during + training and inferencing, we require the following files: + dicts.intents.csv: each line is an intent. The first line + corresponding to the 0 intent label, the second line + corresponding to the 1 intent label, and so on. + + dicts.slots.csv: each line is a slot. The first line + corresponding to the 0 slot label, the second line + corresponding to the 1 slot label, and so on. + + Args: + data_dir (str): the directory of the dataset + do_lower_case (bool): whether to set your dataset to lowercase + dataset_name (str): the name of the dataset. If it's a dataset + that follows the standard JointIntentSlotDataset format, + you can set the name as 'default'. + none_slot_label (str): the label for slots that aren't indentified + defaulted to 'O' + pad_label (int): the int used for padding. 
If set to -1, + it'll be set to the whatever the None label is. + + """ + + def __init__(self, data_dir, do_lower_case=False, dataset_name='default', none_slot_label='O', pad_label=-1): + if dataset_name == 'atis': + self.data_dir = process_atis(data_dir, do_lower_case) + elif dataset_name == 'snips-atis': + self.data_dir, self.pad_label = self.merge( + data_dir, ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'], dataset_name + ) + elif dataset_name == 'dialogflow': + self.data_dir = process_dialogflow(data_dir, do_lower_case) + elif dataset_name == 'mturk-processed': + self.data_dir = process_mturk(data_dir, do_lower_case) + elif dataset_name in set(['snips-light', 'snips-speak', 'snips-all']): + self.data_dir = process_snips(data_dir, do_lower_case) + if dataset_name.endswith('light'): + self.data_dir = f'{self.data_dir}/light' + elif dataset_name.endswith('speak'): + self.data_dir = f'{self.data_dir}/speak' + elif dataset_name.endswith('all'): + self.data_dir = f'{self.data_dir}/all' + elif dataset_name.startswith('jarvis'): + self.data_dir = process_jarvis_datasets( + data_dir, do_lower_case, dataset_name, modes=["train", "test", "eval"], ignore_prev_intent=False + ) + else: + if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']): + raise FileNotFoundError( + "Make sure that your data follows the standard format " + "supported by JointIntentSlotDataset. Your data must " + "contain dict.intents.csv and dict.slots.csv." + ) + self.data_dir = data_dir + + self.intent_dict_file = self.data_dir + '/dict.intents.csv' + self.slot_dict_file = self.data_dir + '/dict.slots.csv' + self.num_intents = len(get_vocab(self.intent_dict_file)) + slots = label2idx(self.slot_dict_file) + self.num_slots = len(slots) + + for mode in ['train', 'test', 'eval']: + + if not if_exist(self.data_dir, [f'{mode}.tsv']): + logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') + continue + + slot_file = f'{self.data_dir}/{mode}_slots.tsv' + with open(slot_file, 'r') as f: + slot_lines = f.readlines() + + input_file = f'{self.data_dir}/{mode}.tsv' + with open(input_file, 'r') as f: + input_lines = f.readlines()[1:] # Skipping headers at index 0 + + if len(slot_lines) != len(input_lines): + raise ValueError( + "Make sure that the number of slot lines match the " + "number of intent lines. There should be a 1-1 " + "correspondence between every slot and intent lines." 
+ ) + + dataset = list(zip(slot_lines, input_lines)) + + raw_slots, queries, raw_intents = [], [], [] + for slot_line, input_line in dataset: + slot_list = [int(slot) for slot in slot_line.strip().split()] + raw_slots.append(slot_list) + parts = input_line.strip().split() + raw_intents.append(int(parts[-1])) + queries.append(' '.join(parts[:-1])) + + infold = input_file[: input_file.rfind('/')] + + logging.info(f'Three most popular intents during {mode}ing') + total_intents, intent_label_freq = get_label_stats(raw_intents, infold + f'/{mode}_intent_stats.tsv') + merged_slots = itertools.chain.from_iterable(raw_slots) + + logging.info(f'Three most popular slots during {mode}ing') + slots_total, slots_label_freq = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv') + + if mode == 'train': + self.slot_weights = calc_class_weights(slots_label_freq) + logging.info(f'Slot weights are - {self.slot_weights}') + + self.intent_weights = calc_class_weights(intent_label_freq) + logging.info(f'Intent weights are - {self.intent_weights}') + + logging.info(f'Total intents - {total_intents}') + logging.info(f'Intent label frequency - {intent_label_freq}') + logging.info(f'Total Slots - {slots_total}') + logging.info(f'Slots label frequency - {slots_label_freq}') + + if pad_label != -1: + self.pad_label = pad_label + else: + if none_slot_label not in slots: + raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.') + self.pad_label = slots[none_slot_label] + + def merge(self, data_dir, subdirs, dataset_name, modes=['train', 'test']): + outfold = f'{data_dir}/{dataset_name}' + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold)) + slots = get_vocab(f'{outfold}/dict.slots.csv') + none_slot = 0 + for key in slots: + if slots[key] == 'O': + none_slot = key + break + return outfold, int(none_slot) + + os.makedirs(outfold, exist_ok=True) + + data_files, slot_files = {}, {} + for mode in modes: + data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w') + data_files[mode].write('sentence\tlabel\n') + slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w') + + intents, slots = {}, {} + intent_shift, slot_shift = 0, 0 + none_intent, none_slot = -1, -1 + + for subdir in subdirs: + curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv') + curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv') + + for key in curr_intents: + if intent_shift > 0 and curr_intents[key] == 'O': + continue + if curr_intents[key] == 'O' and intent_shift == 0: + none_intent = int(key) + intents[int(key) + intent_shift] = curr_intents[key] + + for key in curr_slots: + if slot_shift > 0 and curr_slots[key] == 'O': + continue + if slot_shift == 0 and curr_slots[key] == 'O': + none_slot = int(key) + slots[int(key) + slot_shift] = curr_slots[key] + + for mode in modes: + with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f: + for line in f.readlines()[1:]: + text, label = line.strip().split('\t') + label = int(label) + if curr_intents[label] == 'O': + label = none_intent + else: + label = label + intent_shift + data_files[mode].write(f'{text}\t{label}\n') + + with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f: + for line in f.readlines(): + labels = [int(label) for label in line.strip().split()] + shifted_labels = [] + for label in labels: + if curr_slots[label] == 'O': + shifted_labels.append(none_slot) + else: + shifted_labels.append(label + slot_shift) + 
slot_files[mode].write(list2str(shifted_labels) + '\n') + + intent_shift += len(curr_intents) + slot_shift += len(curr_slots) + + write_vocab_in_order(intents, f'{outfold}/dict.intents.csv') + write_vocab_in_order(slots, f'{outfold}/dict.slots.csv') + return outfold, none_slot + + +def label2idx(file): + lines = open(file, 'r').readlines() + lines = [line.strip() for line in lines if line.strip()] + labels = {lines[i]: i for i in range(len(lines))} + return labels + + +def write_vocab_in_order(vocab, outfile): + with open(outfile, 'w') as f: + for key in sorted(vocab.keys()): + f.write(f'{vocab[key]}\n') diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/inference_utils.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/inference_utils.py new file mode 100644 index 000000000000..a886c20739bf --- /dev/null +++ b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/inference_utils.py @@ -0,0 +1,50 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import numpy as np + +from nemo import logging +from nemo.collections.nlp.utils import get_vocab + +__all__ = ['read_intent_slot_outputs'] + + +def read_intent_slot_outputs( + queries, intent_file, slot_file, intent_logits, slot_logits, slot_masks, intents=None, slots=None +): + intent_dict = get_vocab(intent_file) + slot_dict = get_vocab(slot_file) + pred_intents = np.argmax(intent_logits, 1) + pred_slots = np.argmax(slot_logits, axis=2) + slot_masks = slot_masks > 0.5 + for i, query in enumerate(queries): + logging.info(f'Query: {query}') + pred = pred_intents[i] + logging.info(f'Predicted intent:\t{pred}\t{intent_dict[pred]}') + if intents is not None: + logging.info(f'True intent:\t{intents[i]}\t{intent_dict[intents[i]]}') + + pred_slot = pred_slots[i][slot_masks[i]] + tokens = query.strip().split() + + if len(pred_slot) != len(tokens): + raise ValueError('Pred_slot and tokens must be of the same length') + + for j, token in enumerate(tokens): + output = f'{token}\t{slot_dict[pred_slot[j]]}' + if slots is not None: + output = f'{output}\t{slot_dict[slots[i][j]]}' + logging.info(output) diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/joint_intent_slot_dataset.py similarity index 55% rename from nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py rename to nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/joint_intent_slot_dataset.py index 4abc70923226..b0cbebd41f0f 100644 --- a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py +++ b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/joint_intent_slot_dataset.py @@ -1,3 +1,20 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= # Copyright 2018 The Google AI Language Team Authors and # The HuggingFace Inc. team. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. @@ -13,31 +30,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# ============================================================================= + """ Utility functions for Token Classification NLP tasks Some parts of this code were adapted from the HuggingFace library at https://github.com/huggingface/pytorch-pretrained-BERT """ -import itertools -import random import numpy as np from torch.utils.data import Dataset from nemo import logging -from nemo.collections.nlp.data.datasets.datasets_utils import ( - get_label_stats, - get_stats, - merge, - process_atis, - process_dialogflow, - process_jarvis_datasets, - process_mturk, - process_snips, -) -from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights, get_vocab, if_exist, label2idx +from nemo.collections.nlp.data.datasets.datasets_utils import get_stats -__all__ = ['BertJointIntentSlotDataset', 'BertJointIntentSlotInferDataset', 'JointIntentSlotDataDesc'] +__all__ = ['BertJointIntentSlotDataset', 'BertJointIntentSlotInferDataset'] def get_features( @@ -151,7 +158,6 @@ class BertJointIntentSlotDataset(Dataset): tokenizer (Tokenizer): such as BertTokenizer num_samples (int): number of samples you want to use for the dataset. If -1, use all dataset. Useful for testing. - shuffle (bool): whether to shuffle your data. pad_label (int): pad value use for slot labels. by default, it's the neutral label. @@ -164,7 +170,6 @@ def __init__( max_seq_length, tokenizer, num_samples=-1, - shuffle=True, pad_label=128, ignore_extra_tokens=False, ignore_start_end=False, @@ -182,8 +187,6 @@ def __init__( dataset = list(zip(slot_lines, input_lines)) - if shuffle or num_samples > 0: - random.shuffle(dataset) if num_samples > 0: dataset = dataset[:num_samples] @@ -267,139 +270,3 @@ def __getitem__(self, idx): np.array(self.all_loss_mask[idx]), np.array(self.all_subtokens_mask[idx]), ) - - -class JointIntentSlotDataDesc: - """ Convert the raw data to the standard format supported by - JointIntentSlotDataset. - - By default, the None label for slots is 'O'. - - JointIntentSlotDataset requires two files: - - input_file: file to sequence + label. - the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] - - slot_file: file to slot labels, each line corresponding to - slot labels for a sentence in input_file. No header. - - To keep the mapping from label index to label consistent during - training and inferencing, we require the following files: - dicts.intents.csv: each line is an intent. 
The first line - corresponding to the 0 intent label, the second line - corresponding to the 1 intent label, and so on. - - dicts.slots.csv: each line is a slot. The first line - corresponding to the 0 slot label, the second line - corresponding to the 1 slot label, and so on. - - Args: - data_dir (str): the directory of the dataset - do_lower_case (bool): whether to set your dataset to lowercase - dataset_name (str): the name of the dataset. If it's a dataset - that follows the standard JointIntentSlotDataset format, - you can set the name as 'default'. - none_slot_label (str): the label for slots that aren't indentified - defaulted to 'O' - pad_label (int): the int used for padding. If set to -1, - it'll be set to the whatever the None label is. - - """ - - def __init__(self, data_dir, do_lower_case=False, dataset_name='default', none_slot_label='O', pad_label=-1): - if dataset_name == 'atis': - self.data_dir = process_atis(data_dir, do_lower_case) - elif dataset_name == 'snips-atis': - self.data_dir, self.pad_label = merge( - data_dir, ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'], dataset_name - ) - elif dataset_name == 'dialogflow': - self.data_dir = process_dialogflow(data_dir, do_lower_case) - elif dataset_name == 'mturk-processed': - self.data_dir = process_mturk(data_dir, do_lower_case) - elif dataset_name in set(['snips-light', 'snips-speak', 'snips-all']): - self.data_dir = process_snips(data_dir, do_lower_case) - if dataset_name.endswith('light'): - self.data_dir = f'{self.data_dir}/light' - elif dataset_name.endswith('speak'): - self.data_dir = f'{self.data_dir}/speak' - elif dataset_name.endswith('all'): - self.data_dir = f'{self.data_dir}/all' - elif dataset_name.startswith('jarvis'): - self.data_dir = process_jarvis_datasets( - data_dir, do_lower_case, dataset_name, modes=["train", "test", "eval"], ignore_prev_intent=False - ) - else: - if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']): - raise FileNotFoundError( - "Make sure that your data follows the standard format " - "supported by JointIntentSlotDataset. Your data must " - "contain dict.intents.csv and dict.slots.csv." - ) - self.data_dir = data_dir - - self.intent_dict_file = self.data_dir + '/dict.intents.csv' - self.slot_dict_file = self.data_dir + '/dict.slots.csv' - self.num_intents = len(get_vocab(self.intent_dict_file)) - slots = label2idx(self.slot_dict_file) - self.num_slots = len(slots) - - for mode in ['train', 'test', 'eval']: - - if not if_exist(self.data_dir, [f'{mode}.tsv']): - logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') - continue - - slot_file = f'{self.data_dir}/{mode}_slots.tsv' - with open(slot_file, 'r') as f: - slot_lines = f.readlines() - - input_file = f'{self.data_dir}/{mode}.tsv' - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] # Skipping headers at index 0 - - if len(slot_lines) != len(input_lines): - raise ValueError( - "Make sure that the number of slot lines match the " - "number of intent lines. There should be a 1-1 " - "correspondence between every slot and intent lines." 
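The JointIntentSlotDataDesc docstring above fully specifies the on-disk layout but gives no sample. A minimal sketch of a conforming toy dataset follows; the utterances, intent names, and slot names are invented for illustration, and the directory is a temporary one rather than a real NeMo data path.

```python
# Sketch: write a toy dataset in the format JointIntentSlotDataDesc expects.
# All sentences and label names below are hypothetical.
import os
import tempfile

data_dir = tempfile.mkdtemp()

# dict.intents.csv: line i is the name of intent id i.
with open(os.path.join(data_dir, 'dict.intents.csv'), 'w') as f:
    f.write('set_alarm\nplay_music\n')

# dict.slots.csv: line i is the name of slot id i ('O' is the none slot).
with open(os.path.join(data_dir, 'dict.slots.csv'), 'w') as f:
    f.write('O\ntime\nartist\n')

# train.tsv: header line, then one "<sentence><TAB><intent id>" per example.
with open(os.path.join(data_dir, 'train.tsv'), 'w') as f:
    f.write('sentence\tlabel\n')
    f.write('wake me at seven\t0\n')
    f.write('play some jazz\t1\n')

# train_slots.tsv: one slot id per whitespace token of the matching sentence, no header.
with open(os.path.join(data_dir, 'train_slots.tsv'), 'w') as f:
    f.write('0 0 0 1\n')  # wake me at seven -> O O O time
    f.write('0 0 2\n')    # play some jazz   -> O O artist
```

When the descriptor processes such a directory it also writes per-split statistics files (for example train_intent_stats.tsv and train_slot_stats.tsv) next to these inputs.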
- ) - - dataset = list(zip(slot_lines, input_lines)) - - raw_slots, queries, raw_intents = [], [], [] - for slot_line, input_line in dataset: - slot_list = [int(slot) for slot in slot_line.strip().split()] - raw_slots.append(slot_list) - parts = input_line.strip().split() - raw_intents.append(int(parts[-1])) - queries.append(' '.join(parts[:-1])) - - infold = input_file[: input_file.rfind('/')] - - logging.info(f'Three most popular intents during {mode}ing') - total_intents, intent_label_freq = get_label_stats(raw_intents, infold + f'/{mode}_intent_stats.tsv') - merged_slots = itertools.chain.from_iterable(raw_slots) - - logging.info(f'Three most popular slots during {mode}ing') - slots_total, slots_label_freq = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv') - - if mode == 'train': - self.slot_weights = calc_class_weights(slots_label_freq) - logging.info(f'Slot weights are - {self.slot_weights}') - - self.intent_weights = calc_class_weights(intent_label_freq) - logging.info(f'Intent weights are - {self.intent_weights}') - - logging.info(f'Total intents - {total_intents}') - logging.info(f'Intent label frequency - {intent_label_freq}') - logging.info(f'Total Slots - {slots_total}') - logging.info(f'Slots label frequency - {slots_label_freq}') - - if pad_label != -1: - self.pad_label = pad_label - else: - if none_slot_label not in slots: - raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.') - self.pad_label = slots[none_slot_label] diff --git a/nemo/collections/nlp/data/datasets/lm_bert_dataset.py b/nemo/collections/nlp/data/datasets/lm_bert_dataset.py index 61b74f933c60..32ddb0f82384 100644 --- a/nemo/collections/nlp/data/datasets/lm_bert_dataset.py +++ b/nemo/collections/nlp/data/datasets/lm_bert_dataset.py @@ -24,12 +24,12 @@ import h5py import numpy as np +from sentencepiece import SentencePieceTrainer as SPT from torch.utils.data import Dataset from tqdm import tqdm from nemo import logging -from nemo.collections.nlp.data.datasets.datasets_utils import download_wkt2 -from nemo.collections.nlp.data.datasets.lm_transformer_dataset import create_vocab_mlm +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import DATABASE_EXISTS_TMP, if_exist __all__ = ['BertPretrainingDataset', 'BertPretrainingPreprocessedDataset'] @@ -380,17 +380,75 @@ class BERTPretrainingDataDesc: def __init__(self, dataset_name, data_dir, vocab_size, sample_size, special_tokens, train_file=''): if dataset_name == 'wikitext-2': if not os.path.exists(data_dir): - data_dir = download_wkt2(data_dir) - self.data_dir, self.tokenizer_model = create_vocab_mlm( + raise FileNotFoundError("Dataset not found. Run './get_wkt2.sh DATA_DIR' from examples/nlp/scripts") + self.data_dir, self.tokenizer_model = self.create_vocab_mlm( data_dir, vocab_size, sample_size, special_tokens, train_file ) else: - logging.warning( - "Looks like you passed a dataset name that isn't " - "already supported by NeMo. Please make sure that " + raise ValueError( + "Looks like you passed a dataset name that isn't already supported by NeMo. Please make sure that " "you build the preprocessing method for it." 
) self.train_file = f'{data_dir}/train.txt' self.eval_file = f'{data_dir}/valid.txt' self.test_file = f'{data_dir}/test.txt' + + def create_vocab_mlm( + self, + data_dir, + vocab_size, + sample_size, + special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], + train_file='', + ): + vocab = special_tokens[:] + bert_dir = f'{data_dir}/bert' + if if_exist(bert_dir, ['tokenizer.model']): + logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir)) + return data_dir, f'{bert_dir}/tokenizer.model' + logging.info(f'Processing WikiText dataset and store at {bert_dir}') + os.makedirs(bert_dir, exist_ok=True) + + if not train_file: + files = glob.glob(f'{data_dir}/*.txt') + train_file = f'{bert_dir}/merged.txt' + logging.info(f"Merging {len(files)} txt files into {train_file}") + + with open(train_file, "w") as merged: + for file in tqdm(files): + with open(file, 'r') as inf: + content = inf.read().strip() + merged.write(content + '\n\n\n') + else: + train_file = f'{data_dir}/{train_file}' + + cmd = ( + f"--input={train_file} --model_prefix={bert_dir}/tokenizer " + f"--vocab_size={vocab_size - len(vocab)} " + f"--input_sentence_size={sample_size} " + f"--shuffle_input_sentence=true --hard_vocab_limit=false " + f"--bos_id=-1 --eos_id=-1" + ) + + SPT.Train(cmd) + + # Add BERT control symbols + tokens = [] + + with open(f"{bert_dir}/tokenizer.vocab", "r") as f: + f.readline() # skip first token + + # Read tokens from each line and parse for vocab + for line in f: + piece = line.split("\t")[0] + token = piece[1:] if piece.startswith("▁") else f"##{piece}" + tokens.append(token) + + vocab.extend(tokens) + + # Save vocabulary to output file + with open(f'{bert_dir}/vocab.txt', "w") as f: + for token in vocab: + f.write(f"{token}\n".format()) + return data_dir, f'{bert_dir}/tokenizer.model' diff --git a/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py b/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py index 5d8f20723c6e..303a07904692 100644 --- a/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py +++ b/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py @@ -15,21 +15,16 @@ # ============================================================================= """Pytorch Dataset for training Neural Machine Translation.""" -import glob import os -import pickle import re import numpy as np -from sentencepiece import SentencePieceTrainer as SPT from torch.utils.data import Dataset -from tqdm import tqdm from nemo import logging -from nemo.collections.nlp.data.datasets.datasets_utils import DATABASE_EXISTS_TMP, download_wkt2 -from nemo.collections.nlp.utils.common_nlp_utils import if_exist +from nemo.collections.nlp.data.datasets.datasets_utils import dataset_to_ids, if_exist -__all__ = ['LanguageModelingDataset'] +__all__ = ['LanguageModelingDataset', 'LanguageModelDataDesc'] class LanguageModelingDataset(Dataset): @@ -56,8 +51,8 @@ class LanguageModelDataDesc: def __init__(self, dataset_name, data_dir, do_lower_case): if dataset_name == 'wikitext-2': if not os.path.exists(data_dir): - data_dir = download_wkt2(data_dir) - self.vocab_size = create_vocab_lm(data_dir, do_lower_case) + raise FileNotFoundError("Dataset not found. Run './get_wkt2.sh DATA_DIR' from examples/nlp/scripts") + self.vocab_size = self.create_vocab_lm(data_dir, do_lower_case) self.data_dir = data_dir else: logging.warning( @@ -66,122 +61,33 @@ def __init__(self, dataset_name, data_dir, do_lower_case): "you build the preprocessing method for it." 
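The create_vocab_mlm method relocated above builds vocab.txt for BERT pretraining by prepending the reserved special tokens and then rewriting each SentencePiece piece: word-initial pieces lose the leading ▁ marker, everything else gains a ## prefix. A minimal sketch of that mapping, using invented pieces instead of a real tokenizer.vocab produced by SentencePieceTrainer:

```python
# Sketch of the piece -> WordPiece rewriting done in create_vocab_mlm.
# The pieces below are made up; a real run reads them from <bert_dir>/tokenizer.vocab.
special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
pieces = ['▁the', '▁play', 'ing', '▁token', 'izer']

vocab = special_tokens[:]
for piece in pieces:
    # word-initial pieces drop '▁'; continuation pieces get the BERT '##' prefix
    token = piece[1:] if piece.startswith('▁') else f'##{piece}'
    vocab.append(token)

print(vocab)
# ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'the', 'play', '##ing', 'token', '##izer']
```

Note that the SentencePiece model itself is trained with vocab_size reduced by the number of special tokens, so the final vocab.txt comes out at approximately the requested size.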
) - -def create_vocab_mlm( - data_dir, vocab_size, sample_size, special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], train_file='' -): - vocab = special_tokens[:] - bert_dir = f'{data_dir}/bert' - if if_exist(bert_dir, ['tokenizer.model']): - logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir)) - return data_dir, f'{bert_dir}/tokenizer.model' - logging.info(f'Processing WikiText dataset and store at {bert_dir}') - os.makedirs(bert_dir, exist_ok=True) - - if not train_file: - files = glob.glob(f'{data_dir}/*.txt') - train_file = f'{bert_dir}/merged.txt' - logging.info(f"Merging {len(files)} txt files into {train_file}") - - with open(train_file, "w") as merged: - for file in tqdm(files): - with open(file, 'r') as inf: - content = inf.read().strip() - merged.write(content + '\n\n\n') - else: - train_file = f'{data_dir}/{train_file}' - - cmd = ( - f"--input={train_file} --model_prefix={bert_dir}/tokenizer " - f"--vocab_size={vocab_size - len(vocab)} " - f"--input_sentence_size={sample_size} " - f"--shuffle_input_sentence=true --hard_vocab_limit=false " - f"--bos_id=-1 --eos_id=-1" - ) - SPT.Train(cmd) - - # Add BERT control symbols - tokens = [] - - with open(f"{bert_dir}/tokenizer.vocab", "r") as f: - f.readline() # skip first token - - # Read tokens from each line and parse for vocab - for line in f: - piece = line.split("\t")[0] - token = piece[1:] if piece.startswith("▁") else f"##{piece}" - tokens.append(token) - - vocab.extend(tokens) - - # Save vocabulary to output file - with open(f'{bert_dir}/vocab.txt', "w") as f: - for token in vocab: - f.write(f"{token}\n".format()) - return data_dir, f'{bert_dir}/tokenizer.model' - - -def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True): - """ - Reads dataset from file line by line, tokenizes each line with tokenizer, - and returns list of lists which corresponds to ids of tokenized strings. 
- - Args: - dataset: path to dataset - tokenizer: tokenizer to convert text into ids - cache_ids: if True, ids are saved to disk as pickle file - with similar name (e.g., data.txt --> data.txt.pkl) - add_bos_eos: bool, whether to add and symbols (e.g., for NMT) - Returns: - ids: list of ids which correspond to tokenized strings of the dataset - """ - - cached_ids_dataset = dataset + str(".pkl") - if os.path.isfile(cached_ids_dataset): - logging.info("Loading cached tokenized dataset ...") - ids = pickle.load(open(cached_ids_dataset, "rb")) - else: - logging.info("Tokenizing dataset ...") - data = open(dataset, "rb").readlines() - ids = [] - for sentence in data: - sent_ids = tokenizer.text_to_ids(sentence.decode("utf-8")) - if add_bos_eos: - sent_ids = [tokenizer.bos_id] + sent_ids + [tokenizer.eos_id] - ids.append(sent_ids) - if cache_ids: - logging.info("Caching tokenized dataset ...") - pickle.dump(ids, open(cached_ids_dataset, "wb")) - return ids - - -def create_vocab_lm(data_dir, do_lower_case): - if if_exist(data_dir, ['train.txt', 'vocab.txt']): - logging.info("Vocabulary has been created.") - with open(os.path.join(data_dir, 'vocab.txt'), 'r') as f: - vocab_size = len(f.readlines()) - return vocab_size - - logging.info(f'Creating vocabulary from training data at {data_dir}') - - with open(f'{data_dir}/train.txt', 'r') as f: - txt = f.read() - if do_lower_case: - txt = txt.lower() - lines = re.split(r'[\n]', txt) - sentences = [line.strip().split() for line in lines if line.strip()] - - vocab = {"[PAD]": 0, "[SEP]": 1, "[CLS]": 2, "[MASK]": 3} - idx = 4 - for sentence in sentences: - for word in sentence: - if word not in vocab: - vocab[word] = idx - idx += 1 - - with open(f'{data_dir}/vocab.txt', 'w') as f: - for word in sorted(vocab.keys()): - f.write(word + '\n') - logging.info(f"Created vocabulary of size {len(vocab)}") - - return len(vocab) + def create_vocab_lm(self, data_dir, do_lower_case): + if if_exist(data_dir, ['train.txt', 'vocab.txt']): + logging.info("Vocabulary has been created.") + with open(os.path.join(data_dir, 'vocab.txt'), 'r') as f: + vocab_size = len(f.readlines()) + return vocab_size + + logging.info(f'Creating vocabulary from training data at {data_dir}') + + with open(f'{data_dir}/train.txt', 'r') as f: + txt = f.read() + if do_lower_case: + txt = txt.lower() + lines = re.split(r'[\n]', txt) + sentences = [line.strip().split() for line in lines if line.strip()] + + vocab = {"[PAD]": 0, "[SEP]": 1, "[CLS]": 2, "[MASK]": 3} + idx = 4 + for sentence in sentences: + for word in sentence: + if word not in vocab: + vocab[word] = idx + idx += 1 + + with open(f'{data_dir}/vocab.txt', 'w') as f: + for word in sorted(vocab.keys()): + f.write(word + '\n') + logging.info(f"Created vocabulary of size {len(vocab)}") + + return len(vocab) diff --git a/nemo/collections/nlp/data/datasets/machine_translation_dataset.py b/nemo/collections/nlp/data/datasets/machine_translation_dataset.py index db8e6b7ace2d..3fe43c1f6820 100644 --- a/nemo/collections/nlp/data/datasets/machine_translation_dataset.py +++ b/nemo/collections/nlp/data/datasets/machine_translation_dataset.py @@ -21,7 +21,7 @@ import numpy as np from torch.utils.data import Dataset -from nemo.collections.nlp.data.datasets.lm_transformer_dataset import dataset_to_ids +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import dataset_to_ids __all__ = ['TranslationDataset'] @@ -36,7 +36,7 @@ def __init__(self, tokenizer_src, tokenizer_tgt, dataset_src, dataset_tgt, token src_ids = 
dataset_to_ids(dataset_src, tokenizer_src) tgt_ids = dataset_to_ids(dataset_tgt, tokenizer_tgt) if clean: - src_ids, tgt_ids = clean_src_and_target(src_ids, tgt_ids) + src_ids, tgt_ids = self.clean_src_and_target(src_ids, tgt_ids) self.batch_indices = self.pack_data_into_batches(src_ids, tgt_ids) self.batches = self.pad_batches(src_ids, tgt_ids, self.batch_indices) @@ -156,35 +156,36 @@ def pack_data_into_batches(self, src_ids, tgt_ids): return batches + def clean_src_and_target( + self, src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5 + ): + """ + Cleans source and target sentences to get rid of noisy data. + Specifically, a pair of sentences is removed if + -- either source or target is longer than *max_tokens* + -- either source or target is shorter than *min_tokens* + -- absolute difference between source and target is larger than + *max_tokens_diff* + -- one sentence is *max_tokens_ratio* times longer than the other + """ -def clean_src_and_target(src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5): - """ - Cleans source and target sentences to get rid of noisy data. - Specifically, a pair of sentences is removed if - -- either source or target is longer than *max_tokens* - -- either source or target is shorter than *min_tokens* - -- absolute difference between source and target is larger than - *max_tokens_diff* - -- one sentence is *max_tokens_ratio* times longer than the other - """ - - if len(src_ids) != len(tgt_ids): - raise ValueError("Source and target corpora have different lengths!") - src_ids_, tgt_ids_ = [], [] - for i in range(len(src_ids)): - src_len, tgt_len = len(src_ids[i]), len(tgt_ids[i]) - if ( - src_len > max_tokens - or tgt_len > max_tokens - or src_len < min_tokens - or tgt_len < min_tokens - or (src_ids[i] == tgt_ids[i]) - or np.abs(src_len - tgt_len) > max_tokens_diff - ): - continue - ratio = max(src_len - 2, 1) / max(tgt_len - 2, 1) - if ratio > max_tokens_ratio or ratio < (1 / max_tokens_ratio): - continue - src_ids_.append(src_ids[i]) - tgt_ids_.append(tgt_ids[i]) - return src_ids_, tgt_ids_ + if len(src_ids) != len(tgt_ids): + raise ValueError("Source and target corpora have different lengths!") + src_ids_, tgt_ids_ = [], [] + for i in range(len(src_ids)): + src_len, tgt_len = len(src_ids[i]), len(tgt_ids[i]) + if ( + src_len > max_tokens + or tgt_len > max_tokens + or src_len < min_tokens + or tgt_len < min_tokens + or (src_ids[i] == tgt_ids[i]) + or np.abs(src_len - tgt_len) > max_tokens_diff + ): + continue + ratio = max(src_len - 2, 1) / max(tgt_len - 2, 1) + if ratio > max_tokens_ratio or ratio < (1 / max_tokens_ratio): + continue + src_ids_.append(src_ids[i]) + tgt_ids_.append(tgt_ids[i]) + return src_ids_, tgt_ids_ diff --git a/nemo/collections/nlp/data/datasets/state_tracking_trade_dataset.py b/nemo/collections/nlp/data/datasets/multiwoz_dataset.py similarity index 99% rename from nemo/collections/nlp/data/datasets/state_tracking_trade_dataset.py rename to nemo/collections/nlp/data/datasets/multiwoz_dataset.py index 9358c79d16d6..17690034fc93 100644 --- a/nemo/collections/nlp/data/datasets/state_tracking_trade_dataset.py +++ b/nemo/collections/nlp/data/datasets/multiwoz_dataset.py @@ -170,7 +170,7 @@ def __getitem__(self, idx): class Vocab: """ - Vocab class for TRADE model + Vocab class for MultiWOZ dataset UNK_token = 0 PAD_token = 1 SOS_token = 3 diff --git a/nemo/collections/nlp/data/datasets/punctuation_capitalization_dataset.py 
b/nemo/collections/nlp/data/datasets/punctuation_capitalization_dataset.py index b2df10907304..3d99470a9ac7 100644 --- a/nemo/collections/nlp/data/datasets/punctuation_capitalization_dataset.py +++ b/nemo/collections/nlp/data/datasets/punctuation_capitalization_dataset.py @@ -1,6 +1,5 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,24 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" -Utility functions for Token Classification NLP tasks -Some parts of this code were adapted from the HuggingFace library at -https://github.com/huggingface/pytorch-pretrained-BERT -""" +# ============================================================================= __all__ = ['BertPunctuationCapitalizationDataset', 'BertPunctuationCapitalizationInferDataset'] import itertools import os import pickle -import random import numpy as np from torch.utils.data import Dataset from nemo import logging -from nemo.collections.nlp.data.datasets import datasets_utils as utils +from nemo.collections.nlp.data.datasets.datasets_utils import get_label_stats, get_stats def get_features( @@ -49,7 +43,7 @@ def get_features( Args: queries (list of str): text sequences max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] - tokenizer (Tokenizer): such as NemoBertTokenizer + tokenizer (TokenizerSpec): such as NemoBertTokenizer pad_label (str): pad value use for labels. by default, it's the neutral label. punct_label_ids (dict): dict to map punctuation labels to label ids. @@ -126,7 +120,7 @@ def get_features( max_seq_length = min(max_seq_length, max(sent_lengths)) logging.info(f'Max length: {max_seq_length}') - utils.get_stats(sent_lengths) + get_stats(sent_lengths) too_long_count = 0 for i, subtokens in enumerate(all_subtokens): @@ -202,7 +196,6 @@ class BertPunctuationCapitalizationDataset(Dataset): tokenizer (Tokenizer): such as NemoBertTokenizer num_samples (int): number of samples you want to use for the dataset. If -1, use all dataset. Useful for testing. - shuffle (bool): whether to shuffle your data. pad_label (str): pad value use for labels. by default, it's the neutral label. 
punct_label_ids and capit_label_ids (dict): @@ -224,7 +217,6 @@ def __init__( max_seq_length, tokenizer, num_samples=-1, - shuffle=False, pad_label='O', punct_label_ids=None, capit_label_ids=None, @@ -275,17 +267,15 @@ def __init__( if len(punct_labels_lines) != len(text_lines): raise ValueError("Labels file should contain labels for every word") - if shuffle or num_samples > 0: - dataset = list(zip(text_lines, punct_labels_lines, capit_labels_lines)) - random.shuffle(dataset) + dataset = list(zip(text_lines, punct_labels_lines, capit_labels_lines)) - if num_samples > 0: - dataset = dataset[:num_samples] + if num_samples > 0: + dataset = dataset[:num_samples] - dataset = list(zip(*dataset)) - text_lines = dataset[0] - punct_labels_lines = dataset[1] - capit_labels_lines = dataset[2] + dataset = list(zip(*dataset)) + text_lines = dataset[0] + punct_labels_lines = dataset[1] + capit_labels_lines = dataset[2] # for dev/test sets use label mapping from training set if punct_label_ids: @@ -351,7 +341,7 @@ def get_stats_and_save(all_labels, label_ids, name): infold = text_file[: text_file.rfind('/')] merged_labels = itertools.chain.from_iterable(all_labels) logging.info('Three most popular labels') - _, label_frequencies = utils.get_label_stats(merged_labels, infold + '/label_count_' + name + '.tsv') + _, label_frequencies = get_label_stats(merged_labels, infold + '/label_count_' + name + '.tsv') out = open(os.path.join(infold, name + '_label_ids.csv'), 'w') labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) diff --git a/nemo/collections/nlp/data/datasets/qa_squad_dataset.py b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_dataset.py similarity index 62% rename from nemo/collections/nlp/data/datasets/qa_squad_dataset.py rename to nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_dataset.py index b02d0036dc18..318cf6bcdb90 100644 --- a/nemo/collections/nlp/data/datasets/qa_squad_dataset.py +++ b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_dataset.py @@ -1,20 +1,37 @@ -""" -Copyright 2018 The Google AI Language Team Authors and -The HuggingFace Inc. team. -Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" import collections import json import os @@ -26,7 +43,9 @@ from tqdm import tqdm from nemo import logging -from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import DataProcessor +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import is_whitespace +from nemo.collections.nlp.data.datasets.datasets_utils.datasets_processing import DataProcessor +from nemo.collections.nlp.data.datasets.qa_squad_dataset.qa_squad_processing import convert_examples_to_features from nemo.collections.nlp.metrics.squad_metrics import ( _get_best_indexes, apply_no_ans_threshold, @@ -36,10 +55,9 @@ get_final_text, make_eval_dict, merge_eval, - normalize_answer, ) -from nemo.collections.nlp.utils.common_nlp_utils import _is_whitespace -from nemo.collections.nlp.utils.loss_utils import _compute_softmax +from nemo.collections.nlp.utils.data_utils import normalize_answer +from nemo.collections.nlp.utils.functional_utils import _compute_softmax __all__ = ['SquadDataset'] @@ -410,209 +428,6 @@ def evaluate( return exact_match, f1, all_predictions -def convert_examples_to_features( - examples, tokenizer, max_seq_length, doc_stride, max_query_length, has_groundtruth, -): - """Loads a data file into a list of `InputBatch`s.""" - - unique_id = 1000000000 - - features = [] - for (example_index, example) in enumerate(examples): - query_tokens = tokenizer.text_to_tokens(example.question_text) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - # context: index of token -> index of word - tok_to_orig_index = [] - # context: index of word -> index of first token in token list - orig_to_tok_index = [] - # context without white spaces after tokenization - all_doc_tokens = [] - # doc tokens is word separated context - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.text_to_tokens(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - # idx of query token start and end in context - tok_start_position = None - tok_end_position = None - if has_groundtruth and example.is_impossible: - tok_start_position = -1 - tok_end_position = -1 - if has_groundtruth and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - - (tok_start_position, tok_end_position) = _improve_answer_span( - 
all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text - ) - - # The -3 accounts for tokenizer.cls_token, tokenizer.sep_token and tokenizer.eos_token - # doc_spans contains all possible contexts options of given length - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - # maps context tokens idx in final input -> word idx in context - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - tokens.append(tokenizer.bos_token) - segment_ids.append(0) - for token in query_tokens: - tokens.append(token) - segment_ids.append(0) - tokens.append(tokenizer.sep_token) - segment_ids.append(0) - - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - segment_ids.append(1) - tokens.append(tokenizer.eos_token) - segment_ids.append(1) - - input_ids = tokenizer.tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. - # Only real tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_length: - input_ids.append(tokenizer.pad_id) - input_mask.append(0) - segment_ids.append(0) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - # calculate start and end position in final array - # of tokens in answer if no answer, - # 0 for both pointing to tokenizer.cls_token - start_position = None - end_position = None - if has_groundtruth and not example.is_impossible: - doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - out_of_span = False - if not (tok_start_position >= doc_start and tok_end_position <= doc_end): - out_of_span = True - if out_of_span: - start_position = 0 - end_position = 0 - else: - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - if has_groundtruth and example.is_impossible: - # if our document chunk does not contain - # an annotation we throw it out, since there is nothing - # to predict. 
- start_position = 0 - end_position = 0 - - if example_index < 1: - logging.info("*** Example ***") - logging.info("unique_id: %s" % (unique_id)) - logging.info("example_index: %s" % (example_index)) - logging.info("doc_span_index: %s" % (doc_span_index)) - logging.info("tokens: %s" % " ".join(tokens)) - logging.info( - "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]) - ) - logging.info( - "token_is_max_context: %s" - % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]) - ) - logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - if has_groundtruth and example.is_impossible: - logging.info("impossible example") - if has_groundtruth and not example.is_impossible: - answer_text = " ".join(tokens[start_position : (end_position + 1)]) - logging.info("start_position: %d" % (start_position)) - logging.info("end_position: %d" % (end_position)) - logging.info("answer: %s" % (answer_text)) - - features.append( - InputFeatures( - unique_id=unique_id, - example_index=example_index, - doc_span_index=doc_span_index, - tokens=tokens, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - start_position=start_position, - end_position=end_position, - is_impossible=example.is_impossible, - ) - ) - unique_id += 1 - - return features - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__( - self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - start_position=None, - end_position=None, - is_impossible=None, - ): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - self.token_is_max_context = token_is_max_context - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - class SquadProcessor(DataProcessor): """ Processor for the SQuAD data set. 
@@ -723,7 +538,7 @@ def __init__( # char_to_word_offset = [0, 0, 0, 1, 1] # doc_tokens = ["hi", "yo"] for c in self.context_text: - if _is_whitespace(c): + if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: @@ -743,79 +558,3 @@ def __init__( self.end_position = char_to_word_offset[ min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1) ] - - -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): - """Returns tokenized answer spans that - better match the annotated answer.""" - tok_answer_text = " ".join(tokenizer.text_to_tokens(orig_answer_text)) - - for new_start in range(input_start, input_end + 1): - for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) - - return (input_start, input_end) - - -def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - -def check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token. - - Because of the sliding window approach taken to scoring documents, - a single token can appear in multiple documents. - - Example: - Doc: the man went to the store and bought a gallon of milk - Span A: the man went to the - Span B: to the store and bought - Span C: and bought a gallon of - ... - - Now the word 'bought' will have two scores from spans B and C. We only - want to consider the score with "maximum context", which we define as - the *minimum* of its left and right context (the *sum* of left and - right context will always be the same, of course). - - In the example the maximum context for 'bought' would be span C since - it has 1 left context and 3 right context, while span B has 4 left context - and 0 right context. - - Code adapted from the code by the Google AI and HuggingFace. - """ - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index diff --git a/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_processing.py b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_processing.py new file mode 100644 index 000000000000..57b3db90c6c9 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_processing.py @@ -0,0 +1,296 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import collections + +from nemo import logging + + +def convert_examples_to_features( + examples, tokenizer, max_seq_length, doc_stride, max_query_length, has_groundtruth, +): + """Loads a data file into a list of `InputBatch`s.""" + + unique_id = 1000000000 + + features = [] + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.text_to_tokens(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + # context: index of token -> index of word + tok_to_orig_index = [] + # context: index of word -> index of first token in token list + orig_to_tok_index = [] + # context without white spaces after tokenization + all_doc_tokens = [] + # doc tokens is word separated context + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.text_to_tokens(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + # idx of query token start and end in context + tok_start_position = None + tok_end_position = None + if has_groundtruth and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if has_groundtruth and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text + ) + + # The -3 accounts for tokenizer.cls_token, tokenizer.sep_token and tokenizer.eos_token + # doc_spans contains all possible contexts options of given length + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length 
= len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + # maps context tokens idx in final input -> word idx in context + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append(tokenizer.bos_token) + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append(tokenizer.sep_token) + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append(tokenizer.eos_token) + segment_ids.append(1) + + input_ids = tokenizer.tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. + # Only real tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(tokenizer.pad_id) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + # calculate start and end position in final array + # of tokens in answer if no answer, + # 0 for both pointing to tokenizer.cls_token + start_position = None + end_position = None + if has_groundtruth and not example.is_impossible: + doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + start_position = 0 + end_position = 0 + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + if has_groundtruth and example.is_impossible: + # if our document chunk does not contain + # an annotation we throw it out, since there is nothing + # to predict. 
+ start_position = 0 + end_position = 0 + + if example_index < 1: + logging.info("*** Example ***") + logging.info("unique_id: %s" % (unique_id)) + logging.info("example_index: %s" % (example_index)) + logging.info("doc_span_index: %s" % (doc_span_index)) + logging.info("tokens: %s" % " ".join(tokens)) + logging.info( + "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]) + ) + logging.info( + "token_is_max_context: %s" + % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]) + ) + logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + if has_groundtruth and example.is_impossible: + logging.info("impossible example") + if has_groundtruth and not example.is_impossible: + answer_text = " ".join(tokens[start_position : (end_position + 1)]) + logging.info("start_position: %d" % (start_position)) + logging.info("end_position: %d" % (end_position)) + logging.info("answer: %s" % (answer_text)) + + features.append( + InputFeatures( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + start_position=start_position, + end_position=end_position, + is_impossible=example.is_impossible, + ) + ) + unique_id += 1 + + return features + + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): + """Returns tokenized answer spans that + better match the annotated answer.""" + tok_answer_text = " ".join(tokenizer.text_to_tokens(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token. + + Because of the sliding window approach taken to scoring documents, + a single token can appear in multiple documents. + + Example: + Doc: the man went to the store and bought a gallon of milk + Span A: the man went to the + Span B: to the store and bought + Span C: and bought a gallon of + ... + + Now the word 'bought' will have two scores from spans B and C. We only + want to consider the score with "maximum context", which we define as + the *minimum* of its left and right context (the *sum* of left and + right context will always be the same, of course). + + In the example the maximum context for 'bought' would be span C since + it has 1 left context and 3 right context, while span B has 4 left context + and 0 right context. + + Code adapted from the code by the Google AI and HuggingFace. 
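Since the docstring above works through the 'bought' example verbally, a standalone sketch of the same scoring rule may help; token positions are assumed from whitespace-splitting the example document, and the three spans correspond to spans A, B, and C above.

```python
# Walk through the "maximum context" score (min(left, right) + 0.01 * span length)
# for the token 'bought' in the docstring example.
import collections

DocSpan = collections.namedtuple('DocSpan', ['start', 'length'])

# Doc: the man went to the store and bought a gallon of milk
doc_spans = [DocSpan(0, 5), DocSpan(3, 5), DocSpan(6, 5)]  # spans A, B, C
position = 7  # index of 'bought' in the whitespace-split document

scores = []
for span in doc_spans:
    end = span.start + span.length - 1
    if not (span.start <= position <= end):
        scores.append(None)  # token does not appear in this span
        continue
    left = position - span.start
    right = end - position
    scores.append(min(left, right) + 0.01 * span.length)

print(scores)  # [None, 0.05, 1.05] -> span C gives 'bought' its maximum context
```

This mirrors the per-token decision made by _check_is_max_context: only the span with the highest score keeps that token's prediction.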
+ """ + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__( + self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + start_position=None, + end_position=None, + is_impossible=None, + ): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible diff --git a/nemo/collections/nlp/data/datasets/text_classification_dataset.py b/nemo/collections/nlp/data/datasets/text_classification_dataset.py index 11340ffa4da5..ae589641ffc3 100644 --- a/nemo/collections/nlp/data/datasets/text_classification_dataset.py +++ b/nemo/collections/nlp/data/datasets/text_classification_dataset.py @@ -1,6 +1,5 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,31 +12,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# ============================================================================= -""" -Utility functions for Token Classification NLP tasks -Some parts of this code were adapted from the HuggingFace library at -https://github.com/huggingface/pytorch-pretrained-BERT -""" - -import random import numpy as np from torch.utils.data import Dataset from nemo import logging from nemo.collections.nlp.data.datasets.datasets_utils import ( - get_intent_labels, - get_label_stats, - get_stats, process_imdb, process_jarvis_datasets, process_nlu, process_sst_2, process_thucnews, ) +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import ( + calc_class_weights, + get_intent_labels, + get_label_stats, + get_stats, + if_exist, +) from nemo.collections.nlp.utils.callback_utils import list2str -from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights, if_exist __all__ = ['BertTextClassificationDataset'] @@ -54,10 +50,9 @@ class BertTextClassificationDataset(Dataset): tokenizer (Tokenizer): such as BertTokenizer num_samples (int): number of samples you want to use for the dataset. If -1, use all dataset. Useful for testing. - shuffle (bool): whether to shuffle your data. 
""" - def __init__(self, input_file, max_seq_length, tokenizer, num_samples=-1, shuffle=True): + def __init__(self, input_file, max_seq_length, tokenizer, num_samples=-1): with open(input_file, "r") as f: sent_labels, all_sent_subtokens = [], [] sent_lengths = [] @@ -66,11 +61,8 @@ def __init__(self, input_file, max_seq_length, tokenizer, num_samples=-1, shuffl lines = f.readlines()[1:] logging.info(f'{input_file}: {len(lines)}') - if shuffle or num_samples > -1: - random.seed(0) - random.shuffle(lines) - if num_samples > 0: - lines = lines[:num_samples] + if num_samples > 0: + lines = lines[:num_samples] for index, line in enumerate(lines): if index % 20000 == 0: @@ -177,7 +169,7 @@ def __init__(self, sent_id, sent_label, input_ids, input_mask, segment_ids): self.segment_ids = segment_ids -class SentenceClassificationDataDesc: +class TextClassificationDataDesc: def __init__(self, dataset_name, data_dir, do_lower_case): if dataset_name == 'sst-2': self.data_dir = process_sst_2(data_dir) diff --git a/nemo/collections/nlp/data/datasets/token_classification_dataset.py b/nemo/collections/nlp/data/datasets/token_classification_dataset.py index cac15d50d2c5..f57073236377 100644 --- a/nemo/collections/nlp/data/datasets/token_classification_dataset.py +++ b/nemo/collections/nlp/data/datasets/token_classification_dataset.py @@ -1,3 +1,20 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= # Copyright 2018 The Google AI Language Team Authors and # The HuggingFace Inc. team. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. @@ -13,6 +30,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# ============================================================================= """ Utility functions for Token Classification NLP tasks @@ -23,13 +41,12 @@ import itertools import os import pickle -import random import numpy as np from torch.utils.data import Dataset from nemo import logging -from nemo.collections.nlp.data.datasets import datasets_utils +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import get_label_stats, get_stats __all__ = ['BertTokenClassificationDataset', 'BertTokenClassificationInferDataset'] @@ -113,7 +130,7 @@ def get_features( max_seq_length = min(max_seq_length, max(sent_lengths)) logging.info(f'Max length: {max_seq_length}') - datasets_utils.get_stats(sent_lengths) + get_stats(sent_lengths) too_long_count = 0 for i, subtokens in enumerate(all_subtokens): @@ -175,7 +192,6 @@ class BertTokenClassificationDataset(Dataset): tokenizer (Tokenizer): such as NemoBertTokenizer num_samples (int): number of samples you want to use for the dataset. 
If -1, use all dataset. Useful for testing. - shuffle (bool): whether to shuffle your data. pad_label (str): pad value use for labels. by default, it's the neutral label. label_ids (dict): label_ids (dict): dict to map labels to label ids. @@ -196,7 +212,6 @@ def __init__( max_seq_length, tokenizer, num_samples=-1, - shuffle=False, pad_label='O', label_ids=None, ignore_extra_tokens=False, @@ -241,12 +256,9 @@ def __init__( if len(labels_lines) != len(text_lines): raise ValueError("Labels file should contain labels for every word") - if shuffle or num_samples > 0: + if num_samples > 0: dataset = list(zip(text_lines, labels_lines)) - random.shuffle(dataset) - - if num_samples > 0: - dataset = dataset[:num_samples] + dataset = dataset[:num_samples] dataset = list(zip(*dataset)) text_lines = dataset[0] @@ -308,7 +320,7 @@ def __init__( infold = text_file[: text_file.rfind('/')] merged_labels = itertools.chain.from_iterable(self.all_labels) logging.info('Three most popular labels') - _, self.label_frequencies = datasets_utils.get_label_stats(merged_labels, infold + '/label_stats.tsv') + _, self.label_frequencies = get_label_stats(merged_labels, infold + '/label_stats.tsv') # save label_ids out = open(infold + '/label_ids.csv', 'w') diff --git a/nemo/collections/nlp/metrics/bleu.py b/nemo/collections/nlp/metrics/bleu.py index bab9c5f4c0f6..a49eb0a8c10b 100644 --- a/nemo/collections/nlp/metrics/bleu.py +++ b/nemo/collections/nlp/metrics/bleu.py @@ -1,3 +1,20 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= # Copyright 2017 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +29,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== + """Python implementation of BLEU and smooth-BLEU. This module provides a Python implementation of BLEU and smooth-BLEU. Smooth BLEU is computed following the method outlined in the paper: diff --git a/nemo/collections/nlp/metrics/sacrebleu.py b/nemo/collections/nlp/metrics/sacrebleu.py index 586b19bf2d30..5130dd9633ca 100755 --- a/nemo/collections/nlp/metrics/sacrebleu.py +++ b/nemo/collections/nlp/metrics/sacrebleu.py @@ -1,6 +1,23 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= # Copyright 2017--2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You may not @@ -13,6 +30,8 @@ # an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either # express or implied. See the License for the specific language governing # permissions and limitations under the License. +# ============================================================================= + """ SacreBLEU provides hassle-free computation of shareable, comparable, and reproducible BLEU scores. Inspired by Rico Sennrich's `multi-bleu-detok.perl`, it produces the official WMT scores but works with plain text. diff --git a/nemo/collections/nlp/metrics/squad_metrics.py b/nemo/collections/nlp/metrics/squad_metrics.py index e5f0af1e2517..f8a89c9ead35 100644 --- a/nemo/collections/nlp/metrics/squad_metrics.py +++ b/nemo/collections/nlp/metrics/squad_metrics.py @@ -1,27 +1,43 @@ -""" -Copyright 2018 The Google AI Language Team Authors and -The HuggingFace Inc. team. -Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= import collections from transformers.tokenization_bert import BasicTokenizer from nemo import logging -from nemo.collections.nlp.data.datasets.datasets_utils import get_tokens, normalize_answer +from nemo.collections.nlp.utils.data_utils import get_tokens, normalize_answer __all__ = [ 'f1_score', @@ -31,7 +47,6 @@ 'merge_eval', 'find_all_best_thresh', 'find_best_thresh', - 'normalize_answer', '_get_best_indexes', 'get_final_text', ] diff --git a/nemo/collections/nlp/nm/data_layers/glue_benchmark_datalayer.py b/nemo/collections/nlp/nm/data_layers/glue_benchmark_datalayer.py index dca9324b7817..b9f1fc2c1638 100644 --- a/nemo/collections/nlp/nm/data_layers/glue_benchmark_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/glue_benchmark_datalayer.py @@ -30,6 +30,14 @@ class GlueClassificationDataLayer(TextDataLayer): All the data processing is done in GLUEDataset. Args: + data_dir (str): data directory path + tokenizer (TokenizerSpec): text tokenizer. + max_seq_length (int): maximum allowed length of the text segments . + processor (DataProcessor): data processor. + evaluate (bool): true if data layer is used for evaluation. Default: False. + token_params (dict): dictionary that specifies special tokens. + batch_size (int): batch size in segments + shuffle (bool): whether to shuffle data or not. Default: False. dataset_type (GLUEDataset): the dataset that needs to be converted to DataLayerNM """ @@ -38,12 +46,17 @@ class GlueClassificationDataLayer(TextDataLayer): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + + input_ids: + indices of tokens which constitute batches of text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + labels: + integer indices for sentence classication prediction """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(CategoricalTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), @@ -82,6 +95,14 @@ class GlueRegressionDataLayer(TextDataLayer): All the data processing is done in GLUEDataset. Args: + data_dir (str): data directory path + tokenizer (TokenizerSpec): text tokenizer. + max_seq_length (int): maximum allowed length of the text segments . + processor (DataProcessor): data processor. + evaluate (bool): true if data layer is used for evaluation. Default: False. + token_params (dict): dictionary that specifies special tokens. + batch_size (int): batch size in segments + shuffle (bool): whether to shuffle data or not. Default: False. 
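# Illustrative sketch of the port-declaration pattern used throughout this patch:
# the commented-out AxisType dictionaries are replaced by NeuralType(axes, element_type)
# from the new Neural Type System. Assumes this repository's `nemo` package is installed.
from nemo.core import ChannelType, LabelsType, NeuralType

example_classification_ports = {
    "input_ids": NeuralType(('B', 'T'), ChannelType()),       # batch x time token indices
    "input_type_ids": NeuralType(('B', 'T'), ChannelType()),  # segment ids (0/1)
    "input_mask": NeuralType(('B', 'T'), ChannelType()),      # padding mask
    "labels": NeuralType(tuple('B'), LabelsType()),           # one label per example
}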
dataset_type (GLUEDataset): the dataset that needs to be converted to DataLayerNM """ @@ -90,12 +111,17 @@ class GlueRegressionDataLayer(TextDataLayer): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + + input_ids: + indices of tokens which constitute batches of text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + labels: + float for sentence regression prediction """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(RegressionTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), diff --git a/nemo/collections/nlp/nm/data_layers/joint_intent_slot_datalayer.py b/nemo/collections/nlp/nm/data_layers/joint_intent_slot_datalayer.py index df3731cfa454..e9bec213d3e2 100644 --- a/nemo/collections/nlp/nm/data_layers/joint_intent_slot_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/joint_intent_slot_datalayer.py @@ -16,7 +16,7 @@ from nemo.collections.nlp.data import BertJointIntentSlotDataset, BertJointIntentSlotInferDataset from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer -from nemo.core import ChannelType, NeuralType +from nemo.core import ChannelType, LabelsType, MaskType, NeuralType from nemo.utils.decorators import add_port_docs __all__ = ['BertJointIntentSlotDataLayer', 'BertJointIntentSlotInferDataLayer'] @@ -29,38 +29,53 @@ class BertJointIntentSlotDataLayer(TextDataLayer): All the data processing is done in BertJointIntentSlotDataset. - input_mask: used to ignore some of the input tokens like paddings - - loss_mask: used to mask and ignore tokens in the loss function - - subtokens_mask: used to ignore the outputs of unwanted tokens in - the inference and evaluation like the start and end tokens - Args: - dataset (BertJointIntentSlotDataset): + input_file (str): + data file + slot_file (str): + file to slot labels, each line corresponding to + slot labels for a sentence in input_file. No header. + pad_label (int): pad value use for slot labels + tokenizer (TokenizerSpec): text tokenizer. + max_seq_length (int): + max sequence length minus 2 for [CLS] and [SEP] + dataset_type (BertJointIntentSlotDataset): the dataset that needs to be converted to DataLayerNM + shuffle (bool): whether to shuffle data or not. Default: False. + batch_size: text segments batch size + ignore_extra_tokens (bool): whether or not to ignore extra tokens + ignore_start_end (bool)": whether or not to ignore start and end """ @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
+ + input_ids: + indices of tokens which constitute batches of text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + loss_mask: + used to mask and ignore tokens in the loss function + subtokens_mask: + used to ignore the outputs of unwanted tokens in + the inference and evaluation like the start and end tokens + intents: + TODO + slots: + TODO """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "intents": NeuralType({0: AxisType(BatchTag)}), - # "slots": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), - "loss_mask": NeuralType(('B', 'T'), ChannelType()), + "loss_mask": NeuralType(('B', 'T'), MaskType()), "subtokens_mask": NeuralType(('B', 'T'), ChannelType()), - "intents": NeuralType(tuple('B'), ChannelType()), - "slots": NeuralType(('B', 'T'), ChannelType()), + "intents": NeuralType(tuple('B'), LabelsType()), + "slots": NeuralType(('B', 'T'), LabelsType()), } def __init__( @@ -84,11 +99,10 @@ def __init__( 'tokenizer': tokenizer, 'max_seq_length': max_seq_length, 'num_samples': num_samples, - 'shuffle': shuffle, 'ignore_extra_tokens': ignore_extra_tokens, 'ignore_start_end': ignore_start_end, } - super().__init__(dataset_type, dataset_params, batch_size, shuffle) + super().__init__(dataset_type, dataset_params, batch_size, shuffle=shuffle) class BertJointIntentSlotInferDataLayer(TextDataLayer): @@ -98,29 +112,36 @@ class BertJointIntentSlotInferDataLayer(TextDataLayer): All the data processing is done in BertJointIntentSlotInferDataset. - input_mask: used to ignore some of the input tokens like paddings - - loss_mask: used to mask and ignore tokens in the loss function - - subtokens_mask: used to ignore the outputs of unwanted tokens in - the inference and evaluation like the start and end tokens - Args: - dataset (BertJointIntentSlotInferDataset): + queries: + TODO + tokenizer (TokenizerSpec): text tokenizer. + max_seq_length (int): + max sequence length minus 2 for [CLS] and [SEP] + dataset_type (BertJointIntentSlotDataset): the dataset that needs to be converted to DataLayerNM + shuffle (bool): whether to shuffle data or not. Default: False. + batch_size: text segments batch size """ @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
+ + input_ids: + indices of tokens which constitute batches of text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + loss_mask: + used to mask and ignore tokens in the loss function + subtokens_mask: + used to ignore the outputs of unwanted tokens in + the inference and evaluation like the start and end tokens """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), @@ -128,6 +149,14 @@ def output_ports(self): "subtokens_mask": NeuralType(('B', 'T'), ChannelType()), } - def __init__(self, queries, tokenizer, max_seq_length, batch_size=1, dataset_type=BertJointIntentSlotInferDataset): + def __init__( + self, + queries, + tokenizer, + max_seq_length, + batch_size=1, + shuffle=False, + dataset_type=BertJointIntentSlotInferDataset, + ): dataset_params = {'queries': queries, 'tokenizer': tokenizer, 'max_seq_length': max_seq_length} - super().__init__(dataset_type, dataset_params, batch_size, shuffle=False) + super().__init__(dataset_type, dataset_params, batch_size, shuffle=shuffle) diff --git a/nemo/collections/nlp/nm/data_layers/lm_bert_datalayer.py b/nemo/collections/nlp/nm/data_layers/lm_bert_datalayer.py index 176a7cc67a59..c59134b9f271 100644 --- a/nemo/collections/nlp/nm/data_layers/lm_bert_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/lm_bert_datalayer.py @@ -25,7 +25,7 @@ from nemo.backends.pytorch import DataLayerNM from nemo.collections.nlp.data import BertPretrainingDataset, BertPretrainingPreprocessedDataset from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer -from nemo.core import ChannelType, LabelsType, NeuralType +from nemo.core import ChannelType, LabelsType, MaskType, NeuralType from nemo.utils.decorators import add_port_docs __all__ = ['BertPretrainingDataLayer', 'BertPretrainingPreprocessedDataLayer'] @@ -33,7 +33,7 @@ class BertPretrainingDataLayer(TextDataLayer): """ - Data layer for masked language modeling task. + Data layer for masked language modeling task for text data. Args: tokenizer (TokenizerSpec): tokenizer @@ -43,30 +43,36 @@ class BertPretrainingDataLayer(TextDataLayer): batch_size (int): batch size in segments short_seeq_prob (float): Probability of creating sequences which are shorter than the maximum length. - Defualts to 0.1. + Defaults to 0.1. + shuffle (bool): whether to shuffle data or not. Default: False. """ @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
+ input_ids: + indices of tokens which constitute batches of masked text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + output_ids: indices of tokens which constitute batches of unmasked text segments + output_mask: bool tensor with 0s in place of tokens to be masked + labels: 0 or 1 for next sentence prediction classification """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(BatchTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), - "output_ids": NeuralType(('B', 'T'), ChannelType()), - "output_mask": NeuralType(('B', 'T'), ChannelType()), + "output_ids": NeuralType(('B', 'T'), LabelsType()), + "output_mask": NeuralType(('B', 'T'), MaskType()), "labels": NeuralType(tuple('B'), LabelsType()), } - def __init__(self, tokenizer, dataset, max_seq_length, mask_probability, short_seq_prob=0.1, batch_size=64): + def __init__( + self, tokenizer, dataset, max_seq_length, mask_probability, short_seq_prob=0.1, batch_size=64, shuffle=False + ): dataset_params = { 'tokenizer': tokenizer, 'dataset': dataset, @@ -74,41 +80,40 @@ def __init__(self, tokenizer, dataset, max_seq_length, mask_probability, short_s 'mask_probability': mask_probability, 'short_seq_prob': short_seq_prob, } - super().__init__(BertPretrainingDataset, dataset_params, batch_size, shuffle=False) + super().__init__(BertPretrainingDataset, dataset_params, batch_size, shuffle=shuffle) class BertPretrainingPreprocessedDataLayer(DataLayerNM): """ - Data layer for masked language modeling task. + Data layer for masked language modeling task for preprocessed data. Args: - tokenizer (TokenizerSpec): tokenizer dataset (str): directory or a single file with dataset documents max_seq_length (int): maximum allowed length of the text segments - mask_probability (float): probability of masking input sequence tokens batch_size (int): batch size in segments - short_seeq_prob (float): Probability of creating sequences which are - shorter than the maximum length. - Defualts to 0.1. + training (bool): true if in training mode """ @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
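# Illustrative sketch of how the ports above are consumed downstream: `output_ids`
# holds the original (unmasked) token ids, and `output_mask` restricts the loss to
# the positions that were actually masked. Plain PyTorch with random data; this is
# not NeMo's loss implementation.
import torch
import torch.nn.functional as F

logits = torch.randn(2, 8, 100)                   # B x T x vocab
output_ids = torch.randint(0, 100, (2, 8))        # B x T original token ids
output_mask = (torch.rand(2, 8) < 0.15).float()   # 1.0 at masked positions

per_token = F.cross_entropy(logits.transpose(1, 2), output_ids, reduction='none')
mlm_loss = (per_token * output_mask).sum() / output_mask.sum().clamp(min=1.0)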
+ input_ids: + indices of tokens which constitute batches of masked text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + output_ids: indices of tokens which constitute batches of unmasked text segments + output_mask: bool tensor with 0s in place of tokens to be masked + labels: 0 or 1 for next sentence prediction classification """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(BatchTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), - "output_ids": NeuralType(('B', 'T'), ChannelType()), - "output_mask": NeuralType(('B', 'T'), ChannelType()), + "output_ids": NeuralType(('B', 'T'), LabelsType()), + "output_mask": NeuralType(('B', 'T'), MaskType()), "labels": NeuralType(tuple('B'), LabelsType()), } diff --git a/nemo/collections/nlp/nm/data_layers/lm_transformer_datalayer.py b/nemo/collections/nlp/nm/data_layers/lm_transformer_datalayer.py index a81cb1568c69..7c9df0695991 100644 --- a/nemo/collections/nlp/nm/data_layers/lm_transformer_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/lm_transformer_datalayer.py @@ -30,8 +30,12 @@ class LanguageModelingDataLayer(TextDataLayer): dataset (str): path to text document with data tokenizer (TokenizerSpec): tokenizer max_seq_length (int): maximum allowed length of the text segments + batch_size (int): batch size batch_step (int): how many tokens to skip between two successive segments of text when constructing batches + dataset_type (Dataset): + the underlying dataset. Default: LanguageModelingDataset + shuffle (bool): whether to shuffle data or not. Default: False. """ @property @@ -40,33 +44,26 @@ def output_ports(self): """Returns definitions of module output ports. 
input_ids: indices of tokens which constitute batches of text segments - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - input_mask: bool tensor with 0s in place of tokens to be masked - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - labels: indices of tokens which should be predicted from each of the corresponding tokens in input_ids; for left-to-right language modeling equals to input_ids shifted by 1 to the right - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), "labels": NeuralType(('B', 'T'), LabelsType()), } def __init__( - self, dataset, tokenizer, max_seq_length, batch_size, batch_step=128, dataset_type=LanguageModelingDataset + self, + dataset, + tokenizer, + max_seq_length, + batch_size, + batch_step=128, + dataset_type=LanguageModelingDataset, + shuffle=False, ): dataset_params = { 'dataset': dataset, @@ -74,4 +71,4 @@ def __init__( 'max_seq_length': max_seq_length, 'batch_step': batch_step, } - super().__init__(dataset_type, dataset_params, batch_size, shuffle=False) + super().__init__(dataset_type, dataset_params, batch_size, shuffle=shuffle) diff --git a/nemo/collections/nlp/nm/data_layers/machine_translation_datalayer.py b/nemo/collections/nlp/nm/data_layers/machine_translation_datalayer.py index 33fa833fa7a6..0ff83ae67b90 100644 --- a/nemo/collections/nlp/nm/data_layers/machine_translation_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/machine_translation_datalayer.py @@ -42,6 +42,8 @@ class TranslationDataLayer(TextDataLayer): pairs with big difference in sentences length, removing pairs with the same tokens in src and tgt, etc; useful for training data layer and should not be used in evaluation data layer + dataset_type (Dataset): + the underlying dataset. Default: TranslationDataset """ @property @@ -50,28 +52,17 @@ def output_ports(self): """Returns definitions of module output ports. 
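# Illustrative sketch of the "labels equal input_ids shifted by one" convention
# described above for left-to-right language modeling. Plain PyTorch, made-up ids.
import torch

tokens = torch.tensor([101, 7, 42, 13, 99, 102])
input_ids = tokens[:-1]   # the model reads t_0 .. t_{n-1}
labels = tokens[1:]       # and is trained to predict t_1 .. t_n at each position
assert torch.equal(labels[:-1], input_ids[1:])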
src_ids: indices of tokens which correspond to source sentences - src_mask: bool tensor with 0s in place of source tokens to be masked - tgt_ids: indices of tokens which correspond to target sentences - tgt_mask: bool tensor with 0s in place of target tokens to be masked - labels: indices of tokens which should be predicted from each of the corresponding target tokens in tgt_ids; for standard neural machine translation equals to tgt_ids shifted by 1 to the right - sent_ids: indices of the sentences in a batch; important for evaluation with external metrics, such as SacreBLEU """ return { - # "src_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "src_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "tgt_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "tgt_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "sent_ids": NeuralType({0: AxisType(BatchTag)}), "src_ids": NeuralType(('B', 'T'), ChannelType()), "src_mask": NeuralType(('B', 'T'), ChannelType()), "tgt_ids": NeuralType(('B', 'T'), ChannelType()), diff --git a/nemo/collections/nlp/nm/data_layers/punctuation_capitalization_datalayer.py b/nemo/collections/nlp/nm/data_layers/punctuation_capitalization_datalayer.py index 16de9a8956e7..10e943682e5a 100644 --- a/nemo/collections/nlp/nm/data_layers/punctuation_capitalization_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/punctuation_capitalization_datalayer.py @@ -16,30 +16,72 @@ from nemo.collections.nlp.data import BertPunctuationCapitalizationDataset from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer -from nemo.core import ChannelType, LabelsType, NeuralType +from nemo.core import ChannelType, LabelsType, MaskType, NeuralType from nemo.utils.decorators import add_port_docs __all__ = ['PunctuationCapitalizationDataLayer'] class PunctuationCapitalizationDataLayer(TextDataLayer): + """ + Data layer for punctuation and capitalization. + + Args: + text_file (str): file to sequences, each line should a sentence, + No header. + label_file (str): file to labels, each line corresponds to + word labels for a sentence in the text_file. No header. + tokenizer (TokenizerSpec): text tokenizer. + max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] + pad_label (str): ad value use for labels. + by default, it's the neutral label. + punct_label_ids (dict): + dict to map labels to label ids. + Starts with pad_label->0 and then increases in alphabetical order + For dev set use label_ids generated during training to support + cases when not all labels are present in the dev set. + For training set label_ids should be None. + capit_label_ids (dict): + dict to map labels to label ids. + Starts with pad_label->0 and then increases in alphabetical order + For dev set use label_ids generated during training to support + cases when not all labels are present in the dev set. + For training set label_ids should be None. + num_samples (int): + number of samples you want to use for the dataset. + If -1, use all dataset. Useful for testing. + shuffle (bool): whether to shuffle your data. + batch_size (int): batch size + ignore_extra_tokens (bool): whether to ignore extra tokens in + the loss_mask + ignore_start_end (bool): + whether to ignore bos and eos tokens in the loss_mask + use_cache (bool): whether to use data cache + dataset_type (Dataset): Default BertPunctuationCapitalizationDataset. 
+ """ + @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + input_ids: + indices of tokens which constitute batches of masked text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + loss_mask: + used to mask and ignore tokens in the loss function: indices of tokens which constitute batches of unmasked text segments + subtokens_mask: + used to mask all but the first subtoken of the work, could be useful during inference + punct_labels: punctuation label ids + capit_labels: capit_labels label ids """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "punct_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "capit_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), - "loss_mask": NeuralType(('B', 'T'), ChannelType()), + "loss_mask": NeuralType(('B', 'T'), MaskType()), "subtokens_mask": NeuralType(('B', 'T'), ChannelType()), "punct_labels": NeuralType(('B', 'T'), LabelsType()), "capit_labels": NeuralType(('B', 'T'), LabelsType()), @@ -68,7 +110,6 @@ def __init__( 'max_seq_length': max_seq_length, 'tokenizer': tokenizer, 'num_samples': num_samples, - 'shuffle': shuffle, 'pad_label': pad_label, 'punct_label_ids': punct_label_ids, 'capit_label_ids': capit_label_ids, @@ -76,4 +117,4 @@ def __init__( 'ignore_start_end': ignore_start_end, 'use_cache': use_cache, } - super().__init__(dataset_type, dataset_params, batch_size, shuffle) + super().__init__(dataset_type, dataset_params, batch_size, shuffle=shuffle) diff --git a/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py b/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py index f66af5d20962..c192972ba60b 100644 --- a/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py @@ -33,7 +33,7 @@ class BertQuestionAnsweringDataLayer(TextDataLayer): unanswerable questions. doc_stride (int): When splitting up a long document into chunks, how much stride to take between chunks. - max_query_length (iny): All training files which have a duration less + max_query_length (int): All training files which have a duration less than min_duration are dropped. Can't be used if the `utt2dur` file does not exist. Defaults to None. max_seq_length (int): All training files which have a duration more @@ -42,7 +42,7 @@ class BertQuestionAnsweringDataLayer(TextDataLayer): mode (str): Use "train" or "dev" to define between training and evaluation. batch_size (int): Batch size. Defaults to 64. - dataset_type (class): Question Answering class. + dataset_type (Dataset): Question Answering class. Defaults to SquadDataset. """ @@ -50,14 +50,17 @@ class BertQuestionAnsweringDataLayer(TextDataLayer): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
+ input_ids: + indices of tokens which constitute batches of masked text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + start_positions: indices of tokens which constitute start position of answer + end_positions: indices of tokens which constitute end position of answer + unique_ids: id of the Question answer example this instance belongs to """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "start_positions": NeuralType({0: AxisType(BatchTag)}), - # "end_positions": NeuralType({0: AxisType(BatchTag)}), - # "unique_ids": NeuralType({0: AxisType(BatchTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), diff --git a/nemo/collections/nlp/nm/data_layers/state_tracking_trade_datalayer.py b/nemo/collections/nlp/nm/data_layers/state_tracking_trade_datalayer.py index 02088916ac83..42ba379dc461 100644 --- a/nemo/collections/nlp/nm/data_layers/state_tracking_trade_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/state_tracking_trade_datalayer.py @@ -50,32 +50,52 @@ class MultiWOZDataLayer(TextDataLayer): + """ + Creates the data layer to use for State Tracking dataset MultiWOZ. + + Args: + data_dir (str): TODO + domains: TODO + all_domains: + TODO + vocab: + TODO + slots: + TODO + gating_dict: + TODO + num_samples: + TODO + batch_size: + TODO + mode: + TODO + shuffle: + TODO + num_workers: + TODO + input_dropout: + TODO + is_training: + TODO + dataset_type (Dataset): + TODO + """ + @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. - src_ids: ids of input sequences - src_lens: lengths of input sequences - tgt_ids: labels for the generator output - tgt_lens: lengths of the generator targets - gating_labels: labels for the gating head - turn_domain: list of the domains NeuralType(None) """ return { - # "src_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "src_lens": NeuralType({0: AxisType(BatchTag)}), - # "tgt_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(TimeTag)}), - # "tgt_lens": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), - # "gating_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), - # "turn_domain": NeuralType(None), "src_ids": NeuralType(('B', 'T'), ChannelType()), "src_lens": NeuralType(tuple('B'), LengthsType()), "tgt_ids": NeuralType(('B', 'D', 'T'), LabelsType()), diff --git a/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py b/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py index 2d6e60e0af58..d7f85408f894 100644 --- a/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py @@ -27,9 +27,16 @@ class BertSentenceClassificationDataLayer(TextDataLayer): Creates the data layer to use for the task of sentence classification with pretrained model. - All the data processing is done BertSentenceClassificationDataset. + All the data processing is done BertTextClassificationDataset. Args: + input_file (str): data file + tokenizer (TokenizerSpec): text tokenizer. 
+ max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] + num_samples: + TODO + shuffle (bool): whether to shuffle data or not. Default: False. + batch_size: text segments batch size dataset (BertTextClassificationDataset): the dataset that needs to be converted to DataLayerNM """ @@ -38,12 +45,15 @@ class BertSentenceClassificationDataLayer(TextDataLayer): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + input_ids: + indices of tokens which constitute batches of masked text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + labels: sequence classification id """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(BatchTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), @@ -65,6 +75,5 @@ def __init__( 'tokenizer': tokenizer, 'max_seq_length': max_seq_length, 'num_samples': num_samples, - 'shuffle': shuffle, } - super().__init__(dataset_type, dataset_params, batch_size, shuffle) + super().__init__(dataset_type, dataset_params, batch_size, shuffle=shuffle) diff --git a/nemo/collections/nlp/nm/data_layers/text_datalayer.py b/nemo/collections/nlp/nm/data_layers/text_datalayer.py index 1b02cb4c1f16..e18da9f0d721 100644 --- a/nemo/collections/nlp/nm/data_layers/text_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/text_datalayer.py @@ -25,8 +25,10 @@ class TextDataLayer(DataLayerNM): Generic Text Data Layer NM which wraps PyTorch's dataset Args: - dataset_type: type of dataset used for this datalayer + dataset_type (Dataset): type of dataset used for this datalayer dataset_params (dict): all the params for the dataset + batch_size (int): sequence batch size + shuffle (bool): whether to shuffle data """ def __init__(self, dataset_type, dataset_params, batch_size, shuffle=False): diff --git a/nemo/collections/nlp/nm/data_layers/token_classification_datalayer.py b/nemo/collections/nlp/nm/data_layers/token_classification_datalayer.py index 8110fcf16e1b..3cd1256ef54a 100644 --- a/nemo/collections/nlp/nm/data_layers/token_classification_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/token_classification_datalayer.py @@ -16,29 +16,70 @@ from nemo.collections.nlp.data import BertTokenClassificationDataset, BertTokenClassificationInferDataset from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer -from nemo.core import ChannelType, LabelsType, NeuralType +from nemo.core import ChannelType, LabelsType, MaskType, NeuralType from nemo.utils.decorators import add_port_docs __all__ = ['BertTokenClassificationDataLayer', 'BertTokenClassificationInferDataLayer'] class BertTokenClassificationDataLayer(TextDataLayer): + """ + Creates the data layer to use for the task of token classification + with pretrained model. + + All the data processing is done BertTokenClassificationDataset. + text_file (str): + file to sequences, each line should a sentence, + No header. + label_file (str): + file to labels, each line corresponds to word labels for a sentence in the text_file. No header. + pad_label (int): + d value use for labels. + by default, it's the neutral label. 
+ tokenizer (TokenizerSpec): text tokenizer. + max_seq_length (int): + max sequence length minus 2 for [CLS] and [SEP] + label_ids: + dict to map labels to label ids. + Starts with pad_label->0 and then increases in alphabetical order + For dev set use label_ids generated during training to support + cases when not all labels are present in the dev set. + For training set label_ids should be None. + num_samples (int): + number of samples you want to use for the dataset. + If -1, use all dataset. Useful for testing. + shuffle (bool): whether to shuffle data or not. Default: False. + batch_size (int): text segments batch size + ignore_extra_tokens (bool): whether or not to ignore extra tokens + ignore_start_end (bool): whether or not to ignore start and end + use_cache: + whether to use data cache + dataset_type (BertTokenClassificationDataset): + the dataset that needs to be converted to DataLayerNM + """ + @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + input_ids: + indices of tokens which constitute batches of text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + loss_mask: + used to mask and ignore tokens in the loss function + subtokens_mask: + used to mask all but the first subtoken of the work, could be useful during inference + labels: + token target ids """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), - "loss_mask": NeuralType(('B', 'T'), ChannelType()), + "loss_mask": NeuralType(('B', 'T'), MaskType()), "subtokens_mask": NeuralType(('B', 'T'), ChannelType()), "labels": NeuralType(('B', 'T'), LabelsType()), } @@ -65,7 +106,6 @@ def __init__( 'max_seq_length': max_seq_length, 'tokenizer': tokenizer, 'num_samples': num_samples, - 'shuffle': shuffle, 'pad_label': pad_label, 'label_ids': label_ids, 'ignore_extra_tokens': ignore_extra_tokens, @@ -76,17 +116,25 @@ def __init__( class BertTokenClassificationInferDataLayer(TextDataLayer): + """ + All the data processing is done BertTokenClassificationInferDataset. + queries: + (list of str): quiries to run inference on + tokenizer (TokenizerSpec): text tokenizer. + max_seq_length (int): + max sequence length minus 2 for [CLS] and [SEP] + shuffle (bool): whether to shuffle data or not. Default: False. + batch_size: text segments batch size + dataset_type (BertTokenClassificationInferDataset): + the dataset that needs to be converted to DataLayerNM + """ + @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
""" return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), @@ -95,7 +143,13 @@ def output_ports(self): } def __init__( - self, queries, tokenizer, max_seq_length, batch_size=1, dataset_type=BertTokenClassificationInferDataset, + self, + queries, + tokenizer, + max_seq_length, + batch_size=1, + shuffle=False, + dataset_type=BertTokenClassificationInferDataset, ): dataset_params = {'queries': queries, 'tokenizer': tokenizer, 'max_seq_length': max_seq_length} - super().__init__(dataset_type, dataset_params, batch_size, shuffle=False) + super().__init__(dataset_type, dataset_params, batch_size, shuffle=shuffle) diff --git a/nemo/collections/nlp/nm/losses/__init__.py b/nemo/collections/nlp/nm/losses/__init__.py index 11c24cdefa6b..ee7b74199e13 100644 --- a/nemo/collections/nlp/nm/losses/__init__.py +++ b/nemo/collections/nlp/nm/losses/__init__.py @@ -14,11 +14,6 @@ # limitations under the License. # ============================================================================= -from nemo.collections.nlp.nm.losses.aggregator_loss import * -from nemo.collections.nlp.nm.losses.joint_intent_slot_loss import * -from nemo.collections.nlp.nm.losses.masked_language_modeling_loss import * -from nemo.collections.nlp.nm.losses.padded_smoothed_cross_entropy_loss import * -from nemo.collections.nlp.nm.losses.qa_squad_loss import * +from nemo.collections.nlp.nm.losses.masked_xentropy_loss import * from nemo.collections.nlp.nm.losses.smoothed_cross_entropy_loss import * -from nemo.collections.nlp.nm.losses.state_tracking_trade_loss import * -from nemo.collections.nlp.nm.losses.token_classification_loss import * +from nemo.collections.nlp.nm.losses.spanning_loss import * diff --git a/nemo/collections/nlp/nm/losses/aggregator_loss.py b/nemo/collections/nlp/nm/losses/aggregator_loss.py deleted file mode 100644 index 3165e19af29b..000000000000 --- a/nemo/collections/nlp/nm/losses/aggregator_loss.py +++ /dev/null @@ -1,64 +0,0 @@ -# ============================================================================= -# Copyright 2020 NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= - -from nemo.backends.pytorch import LossNM -from nemo.core import LossType, NeuralType -from nemo.utils.decorators import add_port_docs - -__all__ = ['LossAggregatorNM'] - - -class LossAggregatorNM(LossNM): - """ - Neural module which combines sums several losses into one. 
- - Args: - num_inputs (int): number of input losses - """ - - @property - @add_port_docs() - def input_ports(self): - """Returns definitions of module input ports. - - """ - input_ports = {} - for i in range(self.num_losses): - input_ports["loss_" + str(i + 1)] = NeuralType() - - return input_ports - - @property - @add_port_docs() - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, num_inputs=2): - # Store number of inputs/losses. - self.num_losses = num_inputs - LossNM.__init__(self) - - def _loss_function(self, **kwargs): - values = [kwargs[x] for x in sorted(kwargs.keys())] - loss = values[0] - for loss_i in values[1:]: - loss = loss.add(loss_i) - return loss diff --git a/nemo/collections/nlp/nm/losses/joint_intent_slot_loss.py b/nemo/collections/nlp/nm/losses/joint_intent_slot_loss.py deleted file mode 100644 index be5b87936c75..000000000000 --- a/nemo/collections/nlp/nm/losses/joint_intent_slot_loss.py +++ /dev/null @@ -1,112 +0,0 @@ -# ============================================================================= -# Copyright 2020 NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= - -import torch -from torch import nn - -from nemo.backends.pytorch import LossNM -from nemo.core import ChannelType, LogitsType, LossType, NeuralType -from nemo.utils.decorators import add_port_docs - -__all__ = ['JointIntentSlotLoss'] - - -class JointIntentSlotLoss(LossNM): - """ - Loss function for the joint intent classification and slot - filling task. - - The loss is a joint loss of both tasks, aim to maximize: - p(y^i | x)P(y^s1, y^s2, ..., y^sn | x) - - with y^i being the predicted intent and y^s1, y^s2, ..., y^sn - are the predicted slots corresponding to x1, x2, ..., xn. - - Args: - hidden_states: output of the hidden layers - intents: ground truth intents, - slots: ground truth slots. - input_mask: to differentiate from original tokens and paddings - intent_loss_weight: the loss is the sum of: - intent_loss_weight * intent_loss + - (1 - intent_loss_weight) * slot_loss - - """ - - @property - @add_port_docs() - def input_ports(self): - """Returns definitions of module input ports. 
- - """ - return { - # "intent_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), - # "slot_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "intents": NeuralType({0: AxisType(BatchTag)}), - # "slots": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "intent_logits": NeuralType(('B', 'D'), LogitsType()), - "slot_logits": NeuralType(('B', 'T', 'D'), LogitsType()), - "loss_mask": NeuralType(('B', 'T'), ChannelType()), - "intents": NeuralType(tuple('B'), ChannelType()), - "slots": NeuralType(('B', 'T'), ChannelType()), - } - - @property - @add_port_docs() - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - # return {"loss": NeuralType(None)} - return {"loss": NeuralType(elements_type=LossType())} - - def __init__( - self, num_slots, slot_classes_loss_weights=None, intent_classes_loss_weights=None, intent_loss_weight=0.6, - ): - LossNM.__init__(self) - self.num_slots = num_slots - self.intent_loss_weight = intent_loss_weight - self.slot_classes_loss_weights = slot_classes_loss_weights - self.intent_classes_loss_weights = intent_classes_loss_weights - - # For weighted loss to tackle class imbalance - if slot_classes_loss_weights: - self.slot_classes_loss_weights = torch.FloatTensor(slot_classes_loss_weights).to(self._device) - - if intent_classes_loss_weights: - self.intent_classes_loss_weights = torch.FloatTensor(intent_classes_loss_weights).to(self._device) - - self._criterion_intent = nn.CrossEntropyLoss(weight=self.intent_classes_loss_weights) - self._criterion_slot = nn.CrossEntropyLoss(weight=self.slot_classes_loss_weights) - - def _loss_function(self, intent_logits, slot_logits, loss_mask, intents, slots): - intent_loss = self._criterion_intent(intent_logits, intents) - - active_loss = loss_mask.view(-1) > 0.5 - active_logits = slot_logits.view(-1, self.num_slots)[active_loss] - active_labels = slots.view(-1)[active_loss] - - # To support empty active_labels - if len(active_labels) == 0: - slot_loss = 0.0 - else: - slot_loss = self._criterion_slot(active_logits, active_labels) - loss = intent_loss * self.intent_loss_weight + slot_loss * (1 - self.intent_loss_weight) - - return loss diff --git a/nemo/collections/nlp/nm/losses/masked_language_modeling_loss.py b/nemo/collections/nlp/nm/losses/masked_language_modeling_loss.py deleted file mode 100644 index b29667b1aee0..000000000000 --- a/nemo/collections/nlp/nm/losses/masked_language_modeling_loss.py +++ /dev/null @@ -1,63 +0,0 @@ -# ============================================================================= -# Copyright 2020 NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
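# Illustrative sketch of the weighted intent/slot combination computed by the
# deleted JointIntentSlotLoss above: padded slot positions are dropped via the
# loss mask, and the two cross entropies are mixed with intent_loss_weight.
# Plain PyTorch with made-up shapes, not NeMo code.
import torch
from torch import nn

batch, seq_len, num_intents, num_slots, intent_loss_weight = 4, 6, 3, 5, 0.6
intent_logits = torch.randn(batch, num_intents)
slot_logits = torch.randn(batch, seq_len, num_slots)
intents = torch.randint(0, num_intents, (batch,))
slots = torch.randint(0, num_slots, (batch, seq_len))
loss_mask = torch.ones(batch, seq_len)            # 1 for real tokens, 0 for padding

intent_loss = nn.CrossEntropyLoss()(intent_logits, intents)

active = loss_mask.view(-1) > 0.5                 # keep only unpadded slot positions
active_logits = slot_logits.view(-1, num_slots)[active]
active_labels = slots.view(-1)[active]
slot_loss = nn.CrossEntropyLoss()(active_logits, active_labels) if len(active_labels) else 0.0

loss = intent_loss_weight * intent_loss + (1 - intent_loss_weight) * slot_loss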
-# ============================================================================= - -from nemo.backends.pytorch import LossNM -from nemo.collections.nlp.nm.losses.smoothed_cross_entropy_loss import SmoothedCrossEntropyLoss -from nemo.core import ChannelType, LogitsType, LossType, NeuralType -from nemo.utils.decorators import add_port_docs - -__all__ = ['MaskedLanguageModelingLossNM'] - - -class MaskedLanguageModelingLossNM(LossNM): - """ - Neural module which implements Masked Language Modeling (MLM) loss. - - Args: - label_smoothing (float): label smoothing regularization coefficient - """ - - @property - @add_port_docs() - def input_ports(self): - """Returns definitions of module input ports. - """ - return { - # "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "logits": NeuralType(('B', 'T', 'D'), LogitsType()), - "output_ids": NeuralType(('B', 'T'), ChannelType()), - "output_mask": NeuralType(('B', 'T'), ChannelType()), - } - - @property - @add_port_docs() - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, label_smoothing=0.0): - LossNM.__init__(self) - self._criterion = SmoothedCrossEntropyLoss(label_smoothing) - - def _loss_function(self, logits, output_ids, output_mask): - loss = self._criterion(logits, output_ids, output_mask) - return loss diff --git a/nemo/collections/nlp/nm/losses/state_tracking_trade_loss.py b/nemo/collections/nlp/nm/losses/masked_xentropy_loss.py similarity index 56% rename from nemo/collections/nlp/nm/losses/state_tracking_trade_loss.py rename to nemo/collections/nlp/nm/losses/masked_xentropy_loss.py index 7623c8cddc32..ef2240aad303 100644 --- a/nemo/collections/nlp/nm/losses/state_tracking_trade_loss.py +++ b/nemo/collections/nlp/nm/losses/masked_xentropy_loss.py @@ -42,16 +42,16 @@ from nemo.core.neural_types import LabelsType, LengthsType, LogitsType, LossType, NeuralType from nemo.utils.decorators import add_port_docs -__all__ = ['TRADEMaskedCrossEntropy', 'CrossEntropyLoss3D'] +__all__ = ['MaskedXEntropyLoss'] -class TRADEMaskedCrossEntropy(LossNM): +class MaskedXEntropyLoss(LossNM): """ - Neural module which implements a cross entropy for trade model with masking feature. + Neural module which implements a cross entropy model with masking feature. 
It keeps just the target logit for cross entropy calculation Args: logits (float): output of the classifier - targets (long): ground truth targets + labels (long): ground truth targets loss_mask (long): specifies the ones to get ignored in loss calculation @@ -64,20 +64,15 @@ def input_ports(self): logits: 4d tensor of logits - targets: 3d tensor of labels + labels: 3d tensor of labels loss_mask: specifies the words to be considered in the loss calculation """ return { - # "logits": NeuralType( - # {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag), 3: AxisType(ChannelTag)} - # ), - # "targets": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(TimeTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), "logits": NeuralType(('B', 'T', 'D', 'D'), LogitsType()), - "targets": NeuralType(('B', 'D', 'T'), LabelsType()), - "loss_mask": NeuralType(('B', 'D'), LengthsType()), + "labels": NeuralType(('B', 'D', 'T'), LabelsType()), + "length_mask": NeuralType(('B', 'D'), LengthsType()), } @property @@ -91,65 +86,21 @@ def output_ports(self): def __init__(self): LossNM.__init__(self) - def _loss_function(self, logits, targets, loss_mask): + def _loss_function(self, logits, labels, length_mask, eps=1e-10): logits_flat = logits.view(-1, logits.size(-1)) - eps = 1e-10 log_probs_flat = torch.log(torch.clamp(logits_flat, min=eps)) - target_flat = targets.view(-1, 1) - losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat) - losses = losses_flat.view(*targets.size()) - loss = self.masking(losses, loss_mask) + labels_flat = labels.view(-1, 1) + losses_flat = -torch.gather(log_probs_flat, dim=1, index=labels_flat) + losses = losses_flat.view(*labels.size()) + loss = self.masking(losses, length_mask) return loss @staticmethod - def masking(losses, mask): + def masking(losses, length_mask): max_len = losses.size(2) - mask_ = torch.arange(max_len, device=mask.device)[None, None, :] < mask[:, :, None] + mask_ = torch.arange(max_len, device=length_mask.device)[None, None, :] < length_mask[:, :, None] mask_ = mask_.float() losses = losses * mask_ loss = losses.sum() / mask_.sum() return loss - - -class CrossEntropyLoss3D(LossNM): - """ - Neural module which implements a cross entropy loss for 3d logits. - Args: - num_classes (int): number of classes in a classifier, e.g. size - of the vocabulary in language modeling objective - logits (float): output of the classifier - labels (long): ground truth labels - """ - - @property - @add_port_docs() - def input_ports(self): - """Returns definitions of module input ports. - """ - return { - # "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(ChannelTag)}), - # "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), - "logits": NeuralType(('B', 'D', 'D'), LogitsType()), - "labels": NeuralType(('B', 'D'), LabelsType()), - } - - @property - @add_port_docs() - def output_ports(self): - """Returns definitions of module output ports. 
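# Illustrative sketch of the length-based masking used by MaskedXEntropyLoss above:
# per-token losses beyond each target length are zeroed out before averaging.
# Plain PyTorch with made-up shapes.
import torch

losses = torch.rand(2, 3, 5)                    # B x slots x T per-token losses
lengths = torch.tensor([[5, 2, 0], [3, 3, 1]])  # number of valid tokens per (batch, slot)

mask = (torch.arange(5)[None, None, :] < lengths[:, :, None]).float()
masked_loss = (losses * mask).sum() / mask.sum()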
- """ - # return {"loss": NeuralType(None)} - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, num_classes, **kwargs): - LossNM.__init__(self, **kwargs) - self._criterion = torch.nn.CrossEntropyLoss() - self.num_classes = num_classes - - def _loss_function(self, logits, labels): - logits_flatten = logits.view(-1, self.num_classes) - labels_flatten = labels.view(-1) - - loss = self._criterion(logits_flatten, labels_flatten) - return loss diff --git a/nemo/collections/nlp/nm/losses/padded_smoothed_cross_entropy_loss.py b/nemo/collections/nlp/nm/losses/padded_smoothed_cross_entropy_loss.py deleted file mode 100644 index dfae9e852987..000000000000 --- a/nemo/collections/nlp/nm/losses/padded_smoothed_cross_entropy_loss.py +++ /dev/null @@ -1,68 +0,0 @@ -# ============================================================================= -# Copyright 2020 NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= - -from nemo.backends.pytorch import LossNM -from nemo.collections.nlp.nm.losses.smoothed_cross_entropy_loss import SmoothedCrossEntropyLoss -from nemo.collections.nlp.utils.common_nlp_utils import mask_padded_tokens -from nemo.core import LabelsType, LogitsType, LossType, NeuralType -from nemo.utils.decorators import add_port_docs - -__all__ = ['PaddedSmoothedCrossEntropyLossNM'] - - -class PaddedSmoothedCrossEntropyLossNM(LossNM): - """ - Neural module which calculates CrossEntropyLoss and - 1) excludes padding tokens from loss calculation - 2) allows to use label smoothing regularization - 3) allows to calculate loss for the desired number of last tokens - - Args: - label_smoothing (float): label smoothing regularization coefficient - predict_last_k (int): how many last tokens to use for the loss - calculation, important for fast evaluation of LM perplexity - """ - - @property - @add_port_docs() - def input_ports(self): - """Returns definitions of module input ports. - """ - return { - # "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # "target_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "logits": NeuralType(('B', 'T', 'D'), LogitsType()), - "target_ids": NeuralType(('B', 'T'), LabelsType()), - } - - @property - @add_port_docs() - def output_ports(self): - """Returns definitions of module output ports. 
- """ - # return {"loss": NeuralType(None)} - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, pad_id, label_smoothing=0, predict_last_k=0): - LossNM.__init__(self) - - self._loss_fn = SmoothedCrossEntropyLoss(label_smoothing, predict_last_k) - self._pad_id = pad_id - - def _loss_function(self, logits, target_ids): - target_mask = mask_padded_tokens(target_ids, self._pad_id).to(logits.dtype) - loss = self._loss_fn(logits, target_ids, target_mask) - return loss diff --git a/nemo/collections/nlp/nm/losses/smoothed_cross_entropy_loss.py b/nemo/collections/nlp/nm/losses/smoothed_cross_entropy_loss.py index cecedece75de..b33a4c4b7611 100644 --- a/nemo/collections/nlp/nm/losses/smoothed_cross_entropy_loss.py +++ b/nemo/collections/nlp/nm/losses/smoothed_cross_entropy_loss.py @@ -16,10 +16,65 @@ import torch +from nemo.backends.pytorch import LossNM +from nemo.collections.nlp.utils.data_utils import mask_padded_tokens +from nemo.core import LabelsType, LogitsType, LossType, MaskType, NeuralType + __all__ = ['SmoothedCrossEntropyLoss'] -class SmoothedCrossEntropyLoss(torch.nn.Module): +class SmoothedCrossEntropyLoss(LossNM): + """ + Neural module which calculates CrossEntropyLoss and + 1) excludes padding tokens from loss calculation + 2) allows to use label smoothing regularization + 3) allows to calculate loss for the desired number of last tokens + + Args: + label_smoothing (float): label smoothing regularization coefficient + predict_last_k (int): how many last tokens to use for the loss + calculation, important for fast evaluation of LM perplexity + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + """ + return { + "logits": NeuralType(('B', 'T', 'D'), LogitsType()), + "labels": NeuralType(('B', 'T'), LabelsType()), + "output_mask": NeuralType(('B', 'T'), MaskType(), optional=True), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. + """ + # return {"loss": NeuralType(None)} + return {"loss": NeuralType(elements_type=LossType())} + + def __init__(self, pad_id=None, label_smoothing=0, predict_last_k=0): + LossNM.__init__(self) + + self._loss_fn = SmoothedCrossEntropy(label_smoothing, predict_last_k) + self._pad_id = pad_id + + def _loss_function(self, logits, labels, output_mask=None): + if output_mask is not None: + labels_mask = output_mask + elif self._pad_id is not None: + labels_mask = mask_padded_tokens(labels, self._pad_id).to(logits.dtype) + else: + raise ValueError("Both output_mask and pad_id are None") + + if labels_mask.dtype is not logits.dtype: + labels_mask = labels_mask.to(logits.dtype) + + loss = self._loss_fn(logits, labels, labels_mask) + return loss + + +class SmoothedCrossEntropy(torch.nn.Module): """ Cross-entropy loss with label smoothing for a batch of sequences. 
@@ -41,16 +96,17 @@ def __init__(self, label_smoothing=0.0, predict_last_k=0): self._smoothing = label_smoothing self._predict_last_k = predict_last_k - def forward(self, logits, output_ids, output_mask, eps=1e-6): + def forward(self, logits, labels, output_mask, eps=1e-6): """ Args: - logits: float tensor of shape batch_size x seq_len x vocab_size - output_ids: int tensor of shape batch_size x seq_len + logits: float tensor of shape batch_size x seq_len x vocab_size, values should be log probabilities + labels: int tensor of shape batch_size x seq_len output_mask: binary tensor of shape batch_size x seq_len + eps: epsilon param to avoid divide by zero in loss calculation """ batch_size, seq_len, vocab_size = logits.size() smoothing = vocab_size * self._smoothing / (vocab_size - 1) - target_logits = logits.gather(2, output_ids.unsqueeze(2)).squeeze(2) + target_logits = logits.gather(2, labels.unsqueeze(2)).squeeze(2) smoothing_logits = logits.mean(dim=-1) neg_log_likelihood = (1.0 - smoothing) * target_logits + smoothing * smoothing_logits neg_log_likelihood = neg_log_likelihood[:, -self._predict_last_k :] diff --git a/nemo/collections/nlp/nm/losses/qa_squad_loss.py b/nemo/collections/nlp/nm/losses/spanning_loss.py similarity index 86% rename from nemo/collections/nlp/nm/losses/qa_squad_loss.py rename to nemo/collections/nlp/nm/losses/spanning_loss.py index 289f98ce989e..d0193725887f 100644 --- a/nemo/collections/nlp/nm/losses/qa_squad_loss.py +++ b/nemo/collections/nlp/nm/losses/spanning_loss.py @@ -20,10 +20,10 @@ from nemo.core import ChannelType, LogitsType, LossType, NeuralType from nemo.utils.decorators import add_port_docs -__all__ = ['QuestionAnsweringLoss'] +__all__ = ['SpanningLoss'] -class QuestionAnsweringLoss(LossNM): +class SpanningLoss(LossNM): """ Neural module which implements QuestionAnswering loss. Args: @@ -42,9 +42,6 @@ def input_ports(self): """Returns definitions of module input ports. """ return { - # "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # "start_positions": NeuralType({0: AxisType(BatchTag)}), - # "end_positions": NeuralType({0: AxisType(BatchTag)}), "logits": NeuralType(('B', 'T', 'D'), LogitsType()), "start_positions": NeuralType(tuple('B'), ChannelType()), "end_positions": NeuralType(tuple('B'), ChannelType()), @@ -69,9 +66,6 @@ def output_ports(self): 1: AxisType(TimeTag) """ return { - # "loss": NeuralType(None), - # "start_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "end_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "loss": NeuralType(elements_type=LossType()), "start_logits": NeuralType(('B', 'T'), ChannelType()), "end_logits": NeuralType(('B', 'T'), ChannelType()), diff --git a/nemo/collections/nlp/nm/losses/token_classification_loss.py b/nemo/collections/nlp/nm/losses/token_classification_loss.py deleted file mode 100644 index ec7dad68c499..000000000000 --- a/nemo/collections/nlp/nm/losses/token_classification_loss.py +++ /dev/null @@ -1,77 +0,0 @@ -# ============================================================================= -# Copyright 2020 NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
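# Sketch of the per-token smoothed likelihood computed in SmoothedCrossEntropy.forward()
# above: the log-probability of the target token is mixed with the mean log-probability
# over the vocabulary, with the coefficient rescaled by vocab_size / (vocab_size - 1).
# Shapes and the smoothing coefficient are illustrative.
import torch

log_probs = torch.log_softmax(torch.rand(2, 4, 10), dim=-1)   # (batch, seq_len, vocab) log probabilities
labels = torch.randint(0, 10, (2, 4))
label_smoothing = 0.1

vocab_size = log_probs.size(-1)
smoothing = vocab_size * label_smoothing / (vocab_size - 1)

target_log_probs = log_probs.gather(2, labels.unsqueeze(2)).squeeze(2)   # pick the target token
mean_log_probs = log_probs.mean(dim=-1)                                  # uniform "smoothing" term
smoothed_ll = (1.0 - smoothing) * target_log_probs + smoothing * mean_log_probs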
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= - -import torch -from torch import nn - -from nemo.backends.pytorch import LossNM -from nemo.core import ChannelType, LabelsType, LogitsType, LossType, NeuralType -from nemo.utils.decorators import add_port_docs - -__all__ = ['TokenClassificationLoss'] - - -class TokenClassificationLoss(LossNM): - """ - Neural module which implements Token Classification loss. - - Args: - num_classes (int): number of classes in a classifier, e.g. size - of the vocabulary in language modeling objective - logits (float): output of the classifier - labels (long): ground truth labels - loss_mask (long): to differentiate from original tokens and paddings - """ - - @property - @add_port_docs() - def input_ports(self): - """Returns definitions of module input ports. - """ - return { - # "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "logits": NeuralType(('B', 'T', 'D'), LogitsType()), - "labels": NeuralType(('B', 'T'), LabelsType()), - "loss_mask": NeuralType(('B', 'T'), ChannelType()), - } - - @property - @add_port_docs() - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, num_classes, class_weights=None): - LossNM.__init__(self) - if class_weights: - class_weights = torch.FloatTensor(class_weights).to(self._device) - - self._criterion = nn.CrossEntropyLoss(weight=class_weights) - self.num_classes = num_classes - - def _loss_function(self, logits, labels, loss_mask): - active_loss = loss_mask.view(-1) > 0.5 - active_logits = logits.view(-1, self.num_classes)[active_loss] - active_labels = labels.view(-1)[active_loss] - - loss = self._criterion(active_logits, active_labels) - return loss diff --git a/nemo/collections/nlp/nm/trainables/common/huggingface/albert_nm.py b/nemo/collections/nlp/nm/trainables/common/huggingface/albert_nm.py index 5279d60efb47..0be2d9f73583 100644 --- a/nemo/collections/nlp/nm/trainables/common/huggingface/albert_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/huggingface/albert_nm.py @@ -14,6 +14,24 @@ # limitations under the License. # ============================================================================= +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
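# Sketch of the padding-filter technique used by the TokenClassificationLoss module
# deleted above: tokens whose loss_mask is 0 are dropped from the flattened logits and
# labels before nn.CrossEntropyLoss is applied. Shapes are toy values.
import torch
from torch import nn

num_classes = 3
logits = torch.rand(2, 4, num_classes)                   # (batch, seq_len, num_classes)
labels = torch.randint(0, num_classes, (2, 4))
loss_mask = torch.tensor([[1, 1, 0, 0], [1, 1, 1, 0]])   # 1 = real token, 0 = padding

active = loss_mask.view(-1) > 0.5
loss = nn.CrossEntropyLoss()(logits.view(-1, num_classes)[active], labels.view(-1)[active])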
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + from typing import List, Optional from transformers import ( @@ -56,12 +74,10 @@ class Albert(TrainableNM): @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + input_ids: input token ids + token_type_ids: segment type ids + attention_mask: attention mask """ - # return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "token_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "attention_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # } return { "input_ids": NeuralType(('B', 'T'), ChannelType()), "token_type_ids": NeuralType(('B', 'T'), ChannelType()), @@ -71,9 +87,9 @@ def input_ports(self): @property @add_port_docs() def output_ports(self): - """Returns definitions of module output ports. + """Returns definitions of module input ports. + hidden_states: output embedding """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} def __init__( @@ -134,19 +150,12 @@ def __init__( self.add_module("albert", model) self.config = model.config - - # TK: storing config name in init_params instead. - # for key, value in self.config.to_dict().items(): - # self._local_parameters[key] = value - - # Store the only value that will be used externally - hidden_size. self._hidden_size = model.config.hidden_size @property def hidden_size(self): """ Property returning hidden size. - Returns: Hidden size. """ diff --git a/nemo/collections/nlp/nm/trainables/common/huggingface/bert_nm.py b/nemo/collections/nlp/nm/trainables/common/huggingface/bert_nm.py index a4ac1f9d1c66..dd6a554845ee 100644 --- a/nemo/collections/nlp/nm/trainables/common/huggingface/bert_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/huggingface/bert_nm.py @@ -14,6 +14,24 @@ # limitations under the License. # ============================================================================= +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + from typing import List, Optional from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertConfig, BertModel @@ -51,11 +69,11 @@ class BERT(TrainableNM): @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + input_ids: input token ids + token_type_ids: segment type ids + attention_mask: attention mask """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "token_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "attention_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "token_type_ids": NeuralType(('B', 'T'), ChannelType()), "attention_mask": NeuralType(('B', 'T'), ChannelType()), @@ -65,8 +83,8 @@ def input_ports(self): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + hidden_states: output embedding """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} def __init__( @@ -127,12 +145,6 @@ def __init__( self.add_module("bert", model) self.config = model.config - - # TK: storing config name in init_params instead. - # for key, value in self.config.to_dict().items(): - # self._local_parameters[key] = value - - # Store the only value that will be used externally - hidden_size. self._hidden_size = model.config.hidden_size @property diff --git a/nemo/collections/nlp/nm/trainables/common/huggingface/roberta_nm.py b/nemo/collections/nlp/nm/trainables/common/huggingface/roberta_nm.py index 650d637bb74e..fa960856a5ce 100644 --- a/nemo/collections/nlp/nm/trainables/common/huggingface/roberta_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/huggingface/roberta_nm.py @@ -14,6 +14,24 @@ # limitations under the License. # ============================================================================= +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + from typing import List, Optional from transformers import ( @@ -56,12 +74,10 @@ class Roberta(TrainableNM): @add_port_docs() def input_ports(self): """Returns definitions of module input ports. 
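# Minimal sketch of the axis-tuple port style used by the HuggingFace wrapper modules
# above: each port pairs an axis layout such as ('B', 'T') with an element type.
# This assumes the nemo package is importable; it is not a complete neural module.
from nemo.core import ChannelType, NeuralType

input_ports = {
    "input_ids": NeuralType(('B', 'T'), ChannelType()),          # batch x time token ids
    "token_type_ids": NeuralType(('B', 'T'), ChannelType()),
    "attention_mask": NeuralType(('B', 'T'), ChannelType()),
}
output_ports = {
    "hidden_states": NeuralType(('B', 'T', 'D'), ChannelType()),  # batch x time x hidden
}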
+ input_ids: input token ids + token_type_ids: segment type ids + attention_mask: attention mask """ - # return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "token_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "attention_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # } return { "input_ids": NeuralType(('B', 'T'), ChannelType()), "token_type_ids": NeuralType(('B', 'T'), ChannelType()), @@ -72,8 +88,8 @@ def input_ports(self): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + hidden_states: output embedding """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} def __init__( @@ -134,12 +150,6 @@ def __init__( self.add_module("roberta", model) self.config = model.config - - # TK: storing config name in init_params instead. - # for key, value in self.config.to_dict().items(): - # self._local_parameters[key] = value - - # Store the only value that will be used externally - hidden_size. self._hidden_size = model.config.hidden_size @property diff --git a/nemo/collections/nlp/nm/trainables/common/sequence_classification_nm.py b/nemo/collections/nlp/nm/trainables/common/sequence_classification_nm.py index 5f938b64d4c2..b233242536dc 100644 --- a/nemo/collections/nlp/nm/trainables/common/sequence_classification_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/sequence_classification_nm.py @@ -17,7 +17,7 @@ from torch import nn as nn from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init +from nemo.collections.nlp.utils.transformer_utils import transformer_weights_init from nemo.core import ChannelType, LogitsType, NeuralType from nemo.utils.decorators import add_port_docs @@ -37,12 +37,15 @@ class SequenceClassifier(TrainableNM): activation (str): activation function applied in classifier MLP layers log_softmax (bool): whether to apply log_softmax to MLP output dropout (float): dropout ratio applied to MLP + use_transformer_pretrained (bool): + TODO """ @property @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + hidden_states: embedding hidden states """ return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} @@ -50,8 +53,8 @@ def input_ports(self): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
+ logits: logits before loss """ - # return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})} return {"logits": NeuralType(('B', 'D'), LogitsType())} def __init__( diff --git a/nemo/collections/nlp/nm/trainables/common/sequence_regression_nm.py b/nemo/collections/nlp/nm/trainables/common/sequence_regression_nm.py index 8f0db64dd48a..ec681b86d3fa 100644 --- a/nemo/collections/nlp/nm/trainables/common/sequence_regression_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/sequence_regression_nm.py @@ -17,7 +17,7 @@ from torch import nn as nn from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init +from nemo.collections.nlp.utils.transformer_utils import transformer_weights_init from nemo.core import ChannelType, NeuralType, RegressionValuesType from nemo.utils.decorators import add_port_docs @@ -35,22 +35,24 @@ class SequenceRegression(TrainableNM): num_layers (int): number of layers in classifier MLP activation (str): activation function applied in classifier MLP layers dropout (float): dropout ratio applied to MLP + use_transformer_pretrained (bool): + TODO """ @property @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + hidden_states: embedding hidden states """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + preds: predictions before loss """ - # return {"preds": NeuralType({0: AxisType(RegressionTag)})} return {"preds": NeuralType(tuple('B'), RegressionValuesType())} def __init__(self, hidden_size, num_layers=2, activation='relu', dropout=0.0, use_transformer_pretrained=True): diff --git a/nemo/collections/nlp/nm/trainables/common/token_classification_nm.py b/nemo/collections/nlp/nm/trainables/common/token_classification_nm.py index 2eefe80ec3c6..93aee4e9bf8d 100644 --- a/nemo/collections/nlp/nm/trainables/common/token_classification_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/token_classification_nm.py @@ -17,7 +17,8 @@ from torch import nn as nn from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import gelu, transformer_weights_init +from nemo.collections.nlp.utils.functional_utils import gelu +from nemo.collections.nlp.utils.transformer_utils import transformer_weights_init from nemo.core import ChannelType, LogitsType, NeuralType from nemo.utils.decorators import add_port_docs @@ -38,22 +39,24 @@ class BertTokenClassifier(TrainableNM): activation (str): activation function applied in classifier MLP layers log_softmax (bool): whether to apply log_softmax to MLP output dropout (float): dropout ratio applied to MLP + use_transformer_pretrained (bool): + TODO """ @property @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + hidden_states: embedding hidden states """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
+ logits: logits before loss """ - # return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"logits": NeuralType(('B', 'T', 'C'), LogitsType())} def __init__( @@ -108,7 +111,6 @@ class TokenClassifier(TrainableNM): def input_ports(self): """Returns definitions of module input ports. """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'C'), ChannelType())} @property @@ -116,7 +118,6 @@ def input_ports(self): def output_ports(self): """Returns definitions of module output ports. """ - # return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"logits": NeuralType(('B', 'T', 'D'), LogitsType())} def __init__( diff --git a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_decoders.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_decoders.py index 1f3cbf0e4f44..fef524343f1c 100644 --- a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_decoders.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_decoders.py @@ -1,3 +1,19 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import copy import torch @@ -7,7 +23,7 @@ MultiHeadAttention, PositionWiseFF, ) -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import form_attention_mask +from nemo.collections.nlp.utils.transformer_utils import form_attention_mask __all__ = [] diff --git a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_encoders.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_encoders.py index 24c6afce55ad..049a94755b8f 100644 --- a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_encoders.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_encoders.py @@ -1,3 +1,19 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + import copy import torch @@ -8,7 +24,7 @@ PositionWiseFF, TwoStreamSelfAttention, ) -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import form_attention_mask +from nemo.collections.nlp.utils.transformer_utils import form_attention_mask __all__ = [] diff --git a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_generators.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_generators.py index d878ccd17655..d6e25d480832 100644 --- a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_generators.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_generators.py @@ -1,13 +1,48 @@ -__all__ = [] +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= import torch import torch.nn as nn -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import NEG_INF -from nemo.collections.nlp.utils.common_nlp_utils import mask_padded_tokens +from nemo.collections.nlp.utils.data_utils import mask_padded_tokens +from nemo.collections.nlp.utils.transformer_utils import NEG_INF + +__all__ = [] class GreedySequenceGenerator(nn.Module): + """ + Greedy sequence generator based on the decoder followed by log_softmax. + + Args: + embedding: nn.Module, transforms input_ids into vector embeddings + decoder: nn.Module, takes embeddings and produces hidden_states + log_softmax: nn.Module, takes hidden_states and produces log_probs + which correspond to probability distribution of tokens (ids) + pad: index of padding token in the vocabulary + bos: index of beginning of sequence token in the vocabulary + eos: index of end of sequence token in the vocabulary + max_sequence_length: maximum allowed length for generated sequences + max_delta_length: in case of encoder-decoder generation (e.g. NMT), + forbids generated sequences to be longer than the length of + source sequences plus max_delta_length + batch_size: size of the batch of generated sequences if neither + source nor target starting sequences are provided + """ + def __init__( self, embedding, @@ -20,25 +55,6 @@ def __init__( max_delta_length=20, batch_size=1, ): - """ - Greedy sequence generator based on the decoder followed by log_softmax. 
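# Generic greedy-decoding sketch to illustrate what GreedySequenceGenerator above does
# conceptually: at each step the most probable next token is appended until <eos> or the
# length limit. decoder_step is a stand-in callable, not the module's actual API.
import torch

def greedy_decode(decoder_step, bos, eos, max_sequence_length, batch_size=1):
    # decoder_step(ids) -> (batch, vocab) log-probs for the next token
    ids = torch.full((batch_size, 1), bos, dtype=torch.long)
    for _ in range(max_sequence_length - 1):
        next_ids = decoder_step(ids).argmax(dim=-1, keepdim=True)   # greedy pick
        ids = torch.cat([ids, next_ids], dim=-1)
        if (next_ids == eos).all():                                 # every sequence finished
            break
    return ids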
- - Args: - embedding: nn.Module, transforms input_ids into vector embeddings - decoder: nn.Module, takes embeddings and produces hidden_states - log_softmax: nn.Module, takes hidden_states and produces log_probs - which correspond to probability distribution of tokens (ids) - pad: index of padding token in the vocabulary - bos: index of beginning of sequence token in the vocabulary - eos: index of end of sequence token in the vocabulary - max_sequence_length: maximum allowed length for generated sequences - max_delta_length: in case of encoder-decoder generation (e.g. NMT), - forbids generated sequences to be longer than the length of - source sequences plus max_delta_length - batch_size: size of the batch of generated sequences if neither - source nor target starting sequences are provided - """ - super().__init__() self.embedding = embedding self.decoder = decoder @@ -148,20 +164,20 @@ def forward(self, decoder_input_ids=None, encoder_hidden_states=None, encoder_in class TopKSequenceGenerator(GreedySequenceGenerator): - def __init__(self, embedding, decoder, log_softmax, beam_size=1, temperature=1.0, **kwargs): - """ - Top-k sequence generator based on the decoder followed by log_softmax. - - Args: - *all args of GreedySequenceGenerator class - beam_size: size of the beam (parameter k in top-k) - temperature: temperature of top-k sampling, all logits are divided - by temperature before rescaling. High temperature leads to - uniform distribution, low leads to delta-like distribution. - Kwargs: - all remaining parameters of GreedySequenceGenerator class - """ + """ + Top-k sequence generator based on the decoder followed by log_softmax. + + Args: + *all args of GreedySequenceGenerator class + beam_size: size of the beam (parameter k in top-k) + temperature: temperature of top-k sampling, all logits are divided + by temperature before rescaling. High temperature leads to + uniform distribution, low leads to delta-like distribution. + Kwargs: + all remaining parameters of GreedySequenceGenerator class + """ + def __init__(self, embedding, decoder, log_softmax, beam_size=1, temperature=1.0, **kwargs): super().__init__(embedding, decoder, log_softmax, **kwargs) self.beam_size = beam_size self.temp = temperature diff --git a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_modules.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_modules.py index a09a5fa99be8..dd19b221c5df 100644 --- a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_modules.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_modules.py @@ -1,27 +1,18 @@ -# coding=utf-8 -""" -Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. -Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -Various parts of Transformer architecture implemented as Pytorch nn.Modules. 
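# Sketch of temperature-scaled top-k sampling, the idea behind TopKSequenceGenerator
# above: keep the k most likely tokens, divide their scores by the temperature, and
# sample from the renormalized distribution. This is a standalone illustration, not the
# module's forward pass.
import torch

def sample_top_k(logits, beam_size=1, temperature=1.0):
    # logits: (batch, vocab) raw scores for the next token
    topk_vals, topk_ids = torch.topk(logits, beam_size, dim=-1)
    probs = torch.softmax(topk_vals / temperature, dim=-1)    # high temp -> flatter distribution
    choice = torch.multinomial(probs, num_samples=1)          # index into the top-k set
    return topk_ids.gather(-1, choice)                        # map back to vocabulary ids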
-Some parts of this code were adapted from the HuggingFace library at -https://github.com/huggingface/pytorch-pretrained-BERT -Some parts of this code were adapted from the Annotated Transformer at -http://nlp.seas.harvard.edu/2018/04/03/attention.html -Copyright by the HuggingFace and Annotated Transformer authors. -""" +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= import math @@ -29,7 +20,7 @@ from torch import nn from nemo import logging -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import gelu +from nemo.collections.nlp.utils.functional_utils import gelu __all__ = [] diff --git a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_nm.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_nm.py index a57d20941f96..73e52e260892 100644 --- a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_nm.py @@ -1,7 +1,19 @@ -# Copyright (c) 2019 NVIDIA Corporation -""" -This package contains Transformer for translation Neural Module -""" +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import math @@ -13,7 +25,7 @@ GreedySequenceGenerator, ) from nemo.collections.nlp.nm.trainables.common.transformer.transformer_modules import TransformerEmbedding -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init +from nemo.collections.nlp.utils.transformer_utils import transformer_weights_init from nemo.core.neural_types import ChannelType, NeuralType from nemo.utils.decorators import add_port_docs @@ -49,10 +61,10 @@ class TransformerEncoderNM(TrainableNM): @add_port_docs() def input_ports(self): """Returns definitions of module input ports. 
+ input_ids: ids of input tokens + input_mask_src: input mask """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask_src": NeuralType(('B', 'T'), ChannelType()), } @@ -61,9 +73,8 @@ def input_ports(self): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. - + hidden_states: outputs hidden states """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} def __init__( @@ -140,12 +151,12 @@ class TransformerDecoderNM(TrainableNM): @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + input_ids_tgt: ids of target sequence + hidden_states_src: input hidden states + input_mask_src: input token mask + input_mask_tgt: target token mask """ return { - # "input_ids_tgt": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "hidden_states_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # "input_mask_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask_tgt": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids_tgt": NeuralType(('B', 'T'), ChannelType()), "hidden_states_src": NeuralType(('B', 'T', 'D'), ChannelType()), "input_mask_src": NeuralType(('B', 'T'), ChannelType()), @@ -156,8 +167,8 @@ def input_ports(self): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + hidden_states: output hidden states """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} def __init__( @@ -224,16 +235,16 @@ class GreedyLanguageGeneratorNM(TrainableNM): @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + input_ids: input ids """ - # return {"input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)})} return {"input_ids": NeuralType(('B', 'T'), ChannelType())} @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + output ids: output ids """ - # return {"output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)})} return {"output_ids": NeuralType(('B', 'T'), ChannelType())} def __init__(self, decoder, log_softmax, max_seq_length, pad_token, bos_token, eos_token, batch_size=1): @@ -282,10 +293,10 @@ class BeamSearchTranslatorNM(TrainableNM): @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + hidden_states_src: input hidden states + input_mask_src: input mask """ return { - # "hidden_states_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # "input_mask_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "hidden_states_src": NeuralType(('B', 'T', 'C'), ChannelType()), "input_mask_src": NeuralType(('B', 'T'), ChannelType()), } @@ -294,8 +305,8 @@ def input_ports(self): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
+ output_ids: output ids """ - # return {"output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)})} return {"output_ids": NeuralType(('B', 'T'), ChannelType())} @property diff --git a/nemo/collections/nlp/nm/trainables/dialogue_state_tracking/state_tracking_trade_nm.py b/nemo/collections/nlp/nm/trainables/dialogue_state_tracking/state_tracking_trade_nm.py index a576e4be34be..e6d634ecc26a 100644 --- a/nemo/collections/nlp/nm/trainables/dialogue_state_tracking/state_tracking_trade_nm.py +++ b/nemo/collections/nlp/nm/trainables/dialogue_state_tracking/state_tracking_trade_nm.py @@ -52,33 +52,28 @@ class TRADEGenerator(TrainableNM): + """ + The generator module for state tracking model TRADE + Args: + vocab (Vocab): an instance of Vocab containing the vocabularey + embeddings (Tensor): word embedding matrix + hid_size (int): hidden size of the GRU decoder + dropout (float): dropout of the GRU + slots (list): list of slots + nb_gate (int): number of gates + teacher_forcing (float): 0.5 + """ + @property @add_port_docs() def input_ports(self): """Returns definitions of module input ports. - - encoder_hidden: hidden states of the encoder - - encoder_outputs: outputs of the encoder - - input_lens: lengths of the input sequences to encoder - - src_ids: input sequences to encoder - - targets: targets for the output of the generator - """ return { - # 'encoder_hidden': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # 'encoder_outputs': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # 'input_lens': NeuralType({0: AxisType(BatchTag)}), - # 'src_ids': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # 'targets': NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(TimeTag)}), 'encoder_hidden': NeuralType(('B', 'T', 'C'), ChannelType()), 'encoder_outputs': NeuralType(('B', 'T', 'C'), ChannelType()), 'input_lens': NeuralType(tuple('B'), LengthsType()), 'src_ids': NeuralType(('B', 'T'), ChannelType()), - # 'targets': NeuralType(ChannelType(), ('B', 'D', 'T')), 'targets': NeuralType(('B', 'D', 'T'), LabelsType()), } @@ -88,16 +83,8 @@ def output_ports(self): """Returns definitions of module output ports. 
point_outputs: outputs of the generator - gate_outputs: outputs of gating heads - """ - # return { - # 'point_outputs': NeuralType( - # {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag), 3: AxisType(ChannelTag)} - # ), - # 'gate_outputs': NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(ChannelTag)}), - # } return { 'point_outputs': NeuralType(('B', 'T', 'D', 'D'), LogitsType()), 'gate_outputs': NeuralType(('B', 'D', 'D'), LogitsType()), diff --git a/nemo/collections/nlp/nm/trainables/joint_intent_slot/joint_intent_slot_nm.py b/nemo/collections/nlp/nm/trainables/joint_intent_slot/joint_intent_slot_nm.py index 4020e6e290b9..461a25c902e6 100644 --- a/nemo/collections/nlp/nm/trainables/joint_intent_slot/joint_intent_slot_nm.py +++ b/nemo/collections/nlp/nm/trainables/joint_intent_slot/joint_intent_slot_nm.py @@ -17,7 +17,7 @@ from torch import nn as nn from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init +from nemo.collections.nlp.utils.transformer_utils import transformer_weights_init from nemo.core import ChannelType, LogitsType, NeuralType from nemo.utils.decorators import add_port_docs @@ -35,14 +35,18 @@ class JointIntentSlotClassifier(TrainableNM): num_intents (int): number of intents num_slots (int): number of slots dropout (float): dropout to be applied to the layer + use_transformer_pretrained (bool): + TODO """ @property @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + + hidden_states: + TODO """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'C'), ChannelType())} @property @@ -51,20 +55,11 @@ def output_ports(self): """Returns definitions of module output ports. intent_logits: - 0: AxisType(BatchTag) - - 1: AxisType(ChannelTag) - + TODO slot_logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) + TODO """ return { - # "intent_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), - # "slot_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), "intent_logits": NeuralType(('B', 'D'), LogitsType()), "slot_logits": NeuralType(('B', 'T', 'D'), LogitsType()), } @@ -85,7 +80,6 @@ def __init__(self, hidden_size, num_intents, num_slots, dropout=0.0, use_transfo ) if use_transformer_pretrained: self.apply(lambda module: transformer_weights_init(module, xavier=False)) - # self.to(self._device) def forward(self, hidden_states): hidden_states = self.dropout(hidden_states) diff --git a/nemo/collections/nlp/utils/__init__.py b/nemo/collections/nlp/utils/__init__.py index 9a0f97ecdc63..feaa9815482a 100644 --- a/nemo/collections/nlp/utils/__init__.py +++ b/nemo/collections/nlp/utils/__init__.py @@ -1,4 +1,21 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
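# The use_transformer_pretrained flag (still marked TODO in the docstrings above) gates
# a call to transformer_weights_init via nn.Module.apply, as shown in
# JointIntentSlotClassifier.__init__. A generic sketch of that apply pattern with a
# stand-in initializer; my_init is illustrative, not the real function from
# nemo.collections.nlp.utils.transformer_utils.
from torch import nn

def my_init(module, std=0.02):
    if isinstance(module, nn.Linear):
        module.weight.data.normal_(mean=0.0, std=std)   # re-initialize linear weights
        if module.bias is not None:
            module.bias.data.zero_()

head = nn.Sequential(nn.Linear(768, 768), nn.ReLU(), nn.Linear(768, 10))
head.apply(lambda module: my_init(module, std=0.02))    # applied recursively to submodules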
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + from nemo.collections.nlp.utils.callback_utils import * -from nemo.collections.nlp.utils.common_nlp_utils import * +from nemo.collections.nlp.utils.data_utils import * +from nemo.collections.nlp.utils.functional_utils import * from nemo.collections.nlp.utils.huggingface_utils import * -from nemo.collections.nlp.utils.loss_utils import * +from nemo.collections.nlp.utils.transformer_utils import * diff --git a/nemo/collections/nlp/utils/common_nlp_utils.py b/nemo/collections/nlp/utils/common_nlp_utils.py deleted file mode 100644 index cb6737bac97e..000000000000 --- a/nemo/collections/nlp/utils/common_nlp_utils.py +++ /dev/null @@ -1,144 +0,0 @@ -# ============================================================================= -# Copyright 2020 NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= - -import os -import re -import string - -import numpy as np - -from nemo import logging - -__all__ = [ - '_is_whitespace', - 'mask_padded_tokens', - 'read_intent_slot_outputs', - 'get_vocab', - 'write_vocab', - 'label2idx', - 'write_vocab_in_order', - 'if_exist', - 'remove_punctuation_from_sentence', - 'ids2text', - 'calc_class_weights', -] - - -def _is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - -def mask_padded_tokens(tokens, pad_id): - mask = tokens != pad_id - return mask - - -def read_intent_slot_outputs( - queries, intent_file, slot_file, intent_logits, slot_logits, slot_masks, intents=None, slots=None -): - intent_dict = get_vocab(intent_file) - slot_dict = get_vocab(slot_file) - pred_intents = np.argmax(intent_logits, 1) - pred_slots = np.argmax(slot_logits, axis=2) - slot_masks = slot_masks > 0.5 - for i, query in enumerate(queries): - logging.info(f'Query: {query}') - pred = pred_intents[i] - logging.info(f'Predicted intent:\t{pred}\t{intent_dict[pred]}') - if intents is not None: - logging.info(f'True intent:\t{intents[i]}\t{intent_dict[intents[i]]}') - - pred_slot = pred_slots[i][slot_masks[i]] - tokens = query.strip().split() - - if len(pred_slot) != len(tokens): - raise ValueError('Pred_slot and tokens must be of the same length') - - for j, token in enumerate(tokens): - output = f'{token}\t{slot_dict[pred_slot[j]]}' - if slots is not None: - output = f'{output}\t{slot_dict[slots[i][j]]}' - logging.info(output) - - -def get_vocab(file): - lines = open(file, 'r').readlines() - lines = [line.strip() for line in lines if line.strip()] - labels = {i: lines[i] for i in range(len(lines))} - return labels - - -def write_vocab(items, outfile): - vocab = {} - idx = 0 - with open(outfile, 'w') as f: - for item in items: - f.write(item + '\n') - vocab[item] = idx - idx += 1 - return vocab - - -def label2idx(file): - lines = open(file, 
'r').readlines() - lines = [line.strip() for line in lines if line.strip()] - labels = {lines[i]: i for i in range(len(lines))} - return labels - - -def write_vocab_in_order(vocab, outfile): - with open(outfile, 'w') as f: - for key in sorted(vocab.keys()): - f.write(f'{vocab[key]}\n') - - -def if_exist(outfold, files): - if not os.path.exists(outfold): - return False - for file in files: - if not os.path.exists(f'{outfold}/{file}'): - return False - return True - - -def remove_punctuation_from_sentence(sentence): - sentence = re.sub('[' + string.punctuation + ']', '', sentence) - sentence = sentence.lower() - return sentence - - -def ids2text(ids, vocab): - return ' '.join([vocab[int(id_)] for id_ in ids]) - - -def calc_class_weights(label_freq): - """ - Goal is to give more weight to the classes with less samples - so as to match the one with the higest frequency. We achieve this by - dividing the highest frequency by the freq of each label. - Example - - [12, 5, 3] -> [12/12, 12/5, 12/3] -> [1, 2.4, 4] - - Here label_freq is assumed to be sorted by the frequency. I.e. - label_freq[0] is the most frequent element. - - """ - - most_common_label_freq = label_freq[0] - weighted_slots = sorted([(index, most_common_label_freq[1] / freq) for (index, freq) in label_freq]) - return [weight for (_, weight) in weighted_slots] diff --git a/nemo/collections/nlp/utils/data_utils.py b/nemo/collections/nlp/utils/data_utils.py new file mode 100644 index 000000000000..d57c782fedca --- /dev/null +++ b/nemo/collections/nlp/utils/data_utils.py @@ -0,0 +1,57 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
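# Worked example for the calc_class_weights helper deleted above: given (index, frequency)
# pairs sorted from most to least frequent, each class is weighted by most_frequent / freq,
# so rarer classes get proportionally larger weights (the docstring's [12, 5, 3] case).
def calc_class_weights(label_freq):
    most_common_label_freq = label_freq[0]
    weighted_slots = sorted([(index, most_common_label_freq[1] / freq) for (index, freq) in label_freq])
    return [weight for (_, weight) in weighted_slots]

print(calc_class_weights([(0, 12), (1, 5), (2, 3)]))   # [1.0, 2.4, 4.0]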
+# ============================================================================= + +import re +import string + +__all__ = ['get_vocab', 'get_tokens', 'normalize_answer', 'mask_padded_tokens'] + + +def get_vocab(file): + lines = open(file, 'r').readlines() + lines = [line.strip() for line in lines if line.strip()] + labels = {i: lines[i] for i in range(len(lines))} + return labels + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def get_tokens(s): + if not s: + return [] + return normalize_answer(s).split() + + +def mask_padded_tokens(tokens, pad_id): + mask = tokens != pad_id + return mask diff --git a/nemo/collections/nlp/utils/functional_utils.py b/nemo/collections/nlp/utils/functional_utils.py new file mode 100644 index 000000000000..b1f4353dc049 --- /dev/null +++ b/nemo/collections/nlp/utils/functional_utils.py @@ -0,0 +1,66 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
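# Quick usage sketch for the new nemo.collections.nlp.utils.data_utils helpers defined
# above; the commented outputs follow directly from those definitions.
import torch
from nemo.collections.nlp.utils.data_utils import get_tokens, mask_padded_tokens, normalize_answer

print(normalize_answer("The Quick, Brown Fox!"))   # "quick brown fox"
print(get_tokens("The Quick, Brown Fox!"))         # ['quick', 'brown', 'fox']

tokens = torch.tensor([[5, 7, 0, 0]])
print(mask_padded_tokens(tokens, pad_id=0))        # tensor([[ True,  True, False, False]])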
+# ============================================================================= + +import math + +import torch + +__all__ = ['_compute_softmax', 'gelu'] + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +def gelu(x): + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) diff --git a/nemo/collections/nlp/utils/huggingface_utils.py b/nemo/collections/nlp/utils/huggingface_utils.py index 98f3df9c36b7..8cb3965ad326 100644 --- a/nemo/collections/nlp/utils/huggingface_utils.py +++ b/nemo/collections/nlp/utils/huggingface_utils.py @@ -14,6 +14,24 @@ # limitations under the License. # ============================================================================= +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + __all__ = ['MODEL_SPECIAL_TOKENS', 'MODEL_NAMES'] MODEL_SPECIAL_TOKENS = { diff --git a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_utils.py b/nemo/collections/nlp/utils/transformer_utils.py similarity index 73% rename from nemo/collections/nlp/nm/trainables/common/transformer/transformer_utils.py rename to nemo/collections/nlp/utils/transformer_utils.py index 4f3f80ec670a..4c8742098182 100644 --- a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_utils.py +++ b/nemo/collections/nlp/utils/transformer_utils.py @@ -1,13 +1,25 @@ -import math +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
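# Usage sketch for the relocated helpers in nemo.collections.nlp.utils.functional_utils
# defined above; the printed values are approximate.
import torch
from nemo.collections.nlp.utils.functional_utils import _compute_softmax, gelu

print(_compute_softmax([0.0, 1.0]))     # [~0.269, ~0.731], stable even for large score values
print(gelu(torch.tensor([0.0, 1.0])))   # tensor([0.0000, ~0.8413])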
+# ============================================================================= import torch import torch.nn as nn -NEG_INF = -10000.0 - +__all__ = ['form_attention_mask', 'transformer_weights_init'] -def gelu(x): - return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) +NEG_INF = -10000.0 def form_attention_mask(input_mask, diagonal=None): diff --git a/nemo/core/neural_types/elements.py b/nemo/core/neural_types/elements.py index d963831e2cbc..945506065a34 100644 --- a/nemo/core/neural_types/elements.py +++ b/nemo/core/neural_types/elements.py @@ -34,6 +34,7 @@ 'LengthsType', 'EmbeddedTextType', 'EncodedRepresentation', + 'MaskType', ] import abc from abc import ABC, abstractmethod @@ -188,3 +189,7 @@ class RegressionValuesType(PredictionsType): class CategoricalValuesType(PredictionsType): """Element type to represent labels for categorical classification task""" + + +class MaskType(PredictionsType): + """Element type to represent boolean mask""" diff --git a/tests/core/test_weight_share.py b/tests/core/test_weight_share.py index 6317052ae77d..80934aa8b08a 100644 --- a/tests/core/test_weight_share.py +++ b/tests/core/test_weight_share.py @@ -28,7 +28,7 @@ import nemo import nemo.collections.asr as nemo_asr from nemo.backends.pytorch.nm import DataLayerNM -from nemo.collections.nlp.nm.losses import PaddedSmoothedCrossEntropyLossNM +from nemo.collections.nlp.nm.losses import SmoothedCrossEntropyLoss from nemo.collections.nlp.nm.trainables.common import TokenClassifier from nemo.core import WeightShareTransform from nemo.core.neural_types import * @@ -181,7 +181,7 @@ def data_iterator(self): embd = nemo.backends.pytorch.common.other.SequenceEmbedding(voc_size=voc_size, hidden_size=dim) proj = TokenClassifier(hidden_size=dim, num_classes=voc_size) data = DummyDataLayer(voc_size) - loss = PaddedSmoothedCrossEntropyLossNM(0) + loss = SmoothedCrossEntropyLoss(pad_id=0) embd.tie_weights_with( proj, weight_names=["embedding.weight"], @@ -193,7 +193,7 @@ def data_iterator(self): _in, _out = data() pred = embd(input_seq=_in) pred = proj(hidden_states=pred) - loss_t = loss(target_ids=_out, logits=pred) + loss_t = loss(labels=_out, logits=pred) self.nf.train( [loss_t], optimizer="sgd", optimization_params={"max_steps": 5, "lr": 0.0003}, @@ -245,7 +245,7 @@ def data_iterator(self): embd = nemo.backends.pytorch.common.other.SequenceEmbedding(voc_size=voc_size, hidden_size=dim) proj = TokenClassifier(hidden_size=dim, num_classes=voc_size) data = DummyDataLayer(voc_size) - loss = PaddedSmoothedCrossEntropyLossNM(0) + loss = SmoothedCrossEntropyLoss(pad_id=0) # embd.tie_weights_with( # proj, # weight_names=["embedding.weight"], @@ -257,7 +257,7 @@ def data_iterator(self): _in, _out = data() pred = embd(input_seq=_in) pred = proj(hidden_states=pred) - loss_t = loss(target_ids=_out, logits=pred) + loss_t = loss(labels=_out, logits=pred) self.nf.train( [loss_t], optimizer="sgd", optimization_params={"max_steps": 5, "lr": 0.0003},