From ce70f2667f17e7c188d4a07cb0f5a72671247bb8 Mon Sep 17 00:00:00 2001 From: Evelina Bakhturina Date: Thu, 13 Feb 2020 14:46:39 -0800 Subject: [PATCH] refactor datasets Signed-off-by: Evelina Bakhturina --- .../glue_benchmark_with_bert.py | 2 +- .../joint_intent_slot_infer.py | 2 +- .../joint_intent_slot_infer_b1.py | 2 +- .../joint_intent_slot_with_bert.py | 2 +- .../collections/nlp/data/datasets/__init__.py | 6 +- .../datasets_utils/datasets_processing.py | 35 ++ .../datasets/datasets_utils/preprocessing.py | 1 + .../data/datasets/glue_benchmark_dataset.py | 593 ------------------ .../glue_benchmark_dataset/__init__.py | 0 .../glue_benchmark_dataset/data_processors.py | 302 +++++++++ .../glue_benchmark_dataset.py | 295 +++++++++ .../datasets/joint_intent_slot_dataset.py | 481 -------------- .../data_descriptor.py | 217 +++++++ .../joint_intent_slot_dataset.py | 262 ++++++++ .../nlp/data/datasets/lm_bert_dataset.py | 58 +- .../data/datasets/lm_transformer_dataset.py | 156 +---- .../datasets/machine_translation_dataset.py | 64 +- .../qa_squad_dataset.py | 248 +------- .../qa_squad_dataset/qa_squad_processing.py | 231 +++++++ 19 files changed, 1473 insertions(+), 1484 deletions(-) delete mode 100644 nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py create mode 100644 nemo/collections/nlp/data/datasets/glue_benchmark_dataset/__init__.py create mode 100644 nemo/collections/nlp/data/datasets/glue_benchmark_dataset/data_processors.py create mode 100644 nemo/collections/nlp/data/datasets/glue_benchmark_dataset/glue_benchmark_dataset.py delete mode 100644 nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py create mode 100644 nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/data_descriptor.py create mode 100644 nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/joint_intent_slot_dataset.py rename nemo/collections/nlp/data/datasets/{ => qa_squad_dataset}/qa_squad_dataset.py (67%) create mode 100644 
nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_processing.py diff --git a/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py b/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py index 6c23618a7329..efe38affe4bf 100644 --- a/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py +++ b/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py @@ -70,7 +70,7 @@ from nemo.backends.pytorch.common import CrossEntropyLoss, MSELoss from nemo.collections.nlp.callbacks.glue_benchmark_callback import eval_epochs_done_callback, eval_iter_callback from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer -from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import output_modes, processors +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset import output_modes, processors from nemo.collections.nlp.nm.data_layers import GlueClassificationDataLayer, GlueRegressionDataLayer from nemo.collections.nlp.nm.trainables import SequenceClassifier, SequenceRegression from nemo.utils.lr_policies import get_lr_policy diff --git a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py index 196a0e492055..81fdfad719a3 100644 --- a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py +++ b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py @@ -23,7 +23,7 @@ import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm from nemo import logging -from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc # Parsing arguments parser = argparse.ArgumentParser(description='Joint-intent BERT') diff --git a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py 
b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py index 84ab723c94a8..6c44b1a58042 100644 --- a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py +++ b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py @@ -21,7 +21,7 @@ import nemo.collections.nlp as nemo_nlp import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm -from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc from nemo.collections.nlp.utils.common_nlp_utils import read_intent_slot_outputs # Parsing arguments diff --git a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py index 0cbdb08f72cc..f321d955c8df 100644 --- a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py +++ b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py @@ -26,7 +26,7 @@ import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm from nemo import logging from nemo.collections.nlp.callbacks.joint_intent_slot_callback import eval_epochs_done_callback, eval_iter_callback -from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc from nemo.utils.lr_policies import get_lr_policy # Parsing arguments diff --git a/nemo/collections/nlp/data/datasets/__init__.py b/nemo/collections/nlp/data/datasets/__init__.py index 2ca68b4f6991..67561e31959f 100644 --- a/nemo/collections/nlp/data/datasets/__init__.py +++ b/nemo/collections/nlp/data/datasets/__init__.py @@ -15,8 +15,8 @@ # ============================================================================= from 
nemo.collections.nlp.data.datasets.datasets_utils import * -from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import GLUEDataset -from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import ( +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset import GLUEDataset +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.joint_intent_slot_dataset import ( BertJointIntentSlotDataset, BertJointIntentSlotInferDataset, ) @@ -31,7 +31,7 @@ BertPunctuationCapitalizationDataset, BertPunctuationCapitalizationInferDataset, ) -from nemo.collections.nlp.data.datasets.qa_squad_dataset import SquadDataset +from nemo.collections.nlp.data.datasets.qa_squad_dataset.qa_squad_dataset import SquadDataset from nemo.collections.nlp.data.datasets.text_classification_dataset import BertTextClassificationDataset from nemo.collections.nlp.data.datasets.token_classification_dataset import ( BertTokenClassificationDataset, diff --git a/nemo/collections/nlp/data/datasets/datasets_utils/datasets_processing.py b/nemo/collections/nlp/data/datasets/datasets_utils/datasets_processing.py index ea14c8716a4e..d1e8ee764719 100644 --- a/nemo/collections/nlp/data/datasets/datasets_utils/datasets_processing.py +++ b/nemo/collections/nlp/data/datasets/datasets_utils/datasets_processing.py @@ -1,6 +1,7 @@ import glob import json import os +import pickle import shutil from nemo import logging @@ -379,3 +380,37 @@ def process_nlu(filename, uncased, modes=['train', 'test'], dataset_name='nlu-ub for mode in modes: outfiles[mode].close() return outfold + + +def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True): + """ + Reads dataset from file line by line, tokenizes each line with tokenizer, + and returns list of lists which corresponds to ids of tokenized strings. 
+ + Args: + dataset: path to dataset + tokenizer: tokenizer to convert text into ids + cache_ids: if True, ids are saved to disk as pickle file + with similar name (e.g., data.txt --> data.txt.pkl) + add_bos_eos: bool, whether to add BOS and EOS symbols (e.g., for NMT) + Returns: + ids: list of ids which correspond to tokenized strings of the dataset + """ + + cached_ids_dataset = dataset + str(".pkl") + if os.path.isfile(cached_ids_dataset): + logging.info("Loading cached tokenized dataset ...") + ids = pickle.load(open(cached_ids_dataset, "rb")) + else: + logging.info("Tokenizing dataset ...") + data = open(dataset, "rb").readlines() + ids = [] + for sentence in data: + sent_ids = tokenizer.text_to_ids(sentence.decode("utf-8")) + if add_bos_eos: + sent_ids = [tokenizer.bos_id] + sent_ids + [tokenizer.eos_id] + ids.append(sent_ids) + if cache_ids: + logging.info("Caching tokenized dataset ...") + pickle.dump(ids, open(cached_ids_dataset, "wb")) + return ids \ No newline at end of file diff --git a/nemo/collections/nlp/data/datasets/datasets_utils/preprocessing.py b/nemo/collections/nlp/data/datasets/datasets_utils/preprocessing.py index b137305c9353..5fa6ae68c810 100644 --- a/nemo/collections/nlp/data/datasets/datasets_utils/preprocessing.py +++ b/nemo/collections/nlp/data/datasets/datasets_utils/preprocessing.py @@ -43,6 +43,7 @@ 'get_intent_labels', 'normalize_answer', 'get_tokens', + 'get_stats' ] DATABASE_EXISTS_TMP = '{} dataset has already been processed and stored at {}' diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py deleted file mode 100644 index 26423c3aa549..000000000000 --- a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py +++ /dev/null @@ -1,593 +0,0 @@ -""" -Copyright 2018 The Google AI Language Team Authors and -The HuggingFace Inc. team. -Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -Utility functions for GLUE tasks -Some transformer of this code were adapted from the HuggingFace library at -https://github.com/huggingface/transformers -""" -import csv -import os - -import numpy as np -from torch.utils.data import Dataset - -from nemo import logging - -__all__ = ['GLUEDataset'] - - -class GLUEDataset(Dataset): - def __init__(self, data_dir, tokenizer, max_seq_length, processor, output_mode, evaluate, token_params): - self.tokenizer = tokenizer - self.label_list = processor.get_labels() - self.examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) - self.features = convert_examples_to_features( - self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params - ) - - def __len__(self): - return len(self.features) - - def __getitem__(self, idx): - feature = self.features[idx] - return ( - np.array(feature.input_ids), - np.array(feature.segment_ids), - np.array(feature.input_mask, dtype=np.long), - np.array(feature.label_id), - ) - - -def convert_examples_to_features( - examples, - label_list, - max_seq_length, - tokenizer, - output_mode, - bos_token=None, - eos_token='[SEP]', - pad_token='[PAD]', - cls_token='[CLS]', - sep_token_extra=None, - cls_token_at_end=False, - cls_token_segment_id=0, - pad_token_segment_id=0, - pad_on_left=False, - mask_padding_with_zero=True, - sequence_a_segment_id=0, - sequence_b_segment_id=1, -): - """ Loads a data file 
into a list of `InputBatch`s - `cls_token_at_end` define the location of the CLS token: - - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - `cls_token_segment_id` define the segment id associated to the CLS - token (0 for BERT, 2 for XLNet) - The convention in BERT is: - (a) For sequence pairs: - tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] - type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 - (b) For single sequences: - tokens: [CLS] the dog is hairy . [SEP] - type_ids: 0 0 0 0 0 0 0 - Where "type_ids" are used to indicate whether this is the first - sequence or the second sequence. The embedding vectors for `type=0` - and `type=1` were learned during pre-training and are added to the - wordpiece embedding vector (and position vector). This is - not *strictly* necessarysince the [SEP] token unambiguously separates - the sequences, but it makes it easier for the model to learn - the concept of sequences. - For classification tasks, the first vector (corresponding to [CLS]) - is used as as the "sentence vector". Note that this only makes sense - because the entire model is fine-tuned. - For NMT: - (a) For sequence pairs: - tokens: is this jack ##ville ? no it is not . - type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 - (b) For single sequences: - tokens: the dog is hairy . 
- type_ids: 0 0 0 0 0 0 0 - """ - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for ex_index, example in enumerate(examples): - if ex_index % 10000 == 0: - logging.info("Writing example %d of %d" % (ex_index, len(examples))) - - tokens_a = tokenizer.text_to_tokens(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.text_to_tokens(example.text_b) - - special_tokens_count = 2 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 2 if bos_token else 0 - special_tokens_count += 1 if cls_token else 0 - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - special_tokens_count = 1 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 1 if bos_token else 0 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[: max_seq_length - special_tokens_count] - # Add special tokens to sequence_a - tokens = tokens_a - if bos_token: - tokens = [bos_token] + tokens - if eos_token: - tokens += [eos_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - # Add sequence separator between sequences - if tokens_b and sep_token_extra: - tokens += [sep_token_extra] - segment_ids += [sequence_a_segment_id] - - # Add special tokens to sequence_b - if tokens_b: - if bos_token: - tokens += [bos_token] - segment_ids += [sequence_b_segment_id] - tokens += tokens_b - segment_ids += [sequence_b_segment_id] * (len(tokens_b)) - if eos_token: - tokens += [eos_token] - segment_ids += [sequence_b_segment_id] - - # Add classification token - for BERT models - if cls_token: - if cls_token_at_end: - tokens += [cls_token] - segment_ids += [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - input_ids = tokenizer.tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. 
Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. - padding_length = max_seq_length - len(input_ids) - pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] - if pad_on_left: - input_ids = ([pad_token_id] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - else: - input_ids = input_ids + ([pad_token_id] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) - if len(input_ids) != max_seq_length: - raise ValueError("input_ids must be of length max_seq_length") - if len(input_mask) != max_seq_length: - raise ValueError("input_mask must be of length max_seq_length") - if len(segment_ids) != max_seq_length: - raise ValueError("segment_ids must be of length max_seq_length") - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = np.float32(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - logging.info("*** Example ***") - logging.info("guid: %s" % (example.guid)) - logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) - logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) - logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) - logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) - logging.info("label: %s (id = %d)" % (example.label, label_id)) - - features.append( - InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id) - ) - return features - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length. 
- - This will always truncate the longer sequence one token at a time. - This makes more sense than truncating an equal percent - of tokens from each, since if one sequence is very short then each token - that's truncated likely contains more information than a longer sequence. - """ - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -""" -Utility functions for GLUE tasks -This code was adapted from the HuggingFace library at -https://github.com/huggingface/transformers -""" - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, input_ids, input_mask, segment_ids, label_id): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id - - -class InputExample(object): - """A single training/test example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. - For single sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second - sequence. Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. 
- """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding="utf-8-sig") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - # if sys.version_info[0] == 2: - # line = list(unicode(cell, 'utf-8') for cell in line) - lines.append(line) - return lines - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}') - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[3] - text_b = line[4] - label = line[0] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" 
- - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[8] - text_b = line[9] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliMismatchedProcessor(MnliProcessor): - """Processor for the MultiNLI Mismatched data set (GLUE version).""" - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched") - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) - text_a = line[3] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class Sst2Processor(DataProcessor): - """Processor for the SST-2 data 
set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[0] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class StsbProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return [None] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[7] - text_b = line[8] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QqpProcessor(DataProcessor): - """Processor for the QQP data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return 
self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - try: - text_a = line[3] - text_b = line[4] - label = line[5] - except IndexError: - continue - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QnliProcessor(DataProcessor): - """Processor for the QNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class RteProcessor(DataProcessor): - """Processor for the RTE data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, 
lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class WnliProcessor(DataProcessor): - """Processor for the WNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mnli-mm": MnliMismatchedProcessor, - "mrpc": MrpcProcessor, - "sst-2": Sst2Processor, - "sts-b": StsbProcessor, - "qqp": QqpProcessor, - "qnli": QnliProcessor, - "rte": RteProcessor, - "wnli": WnliProcessor, -} -output_modes = { - "cola": "classification", - "mnli": "classification", - "mnli-mm": "classification", - "mrpc": "classification", - "sst-2": "classification", - "sts-b": "regression", - "qqp": "classification", - "qnli": "classification", - "rte": "classification", - "wnli": "classification", -} -GLUE_TASKS_NUM_LABELS = { - "cola": 2, - "mnli": 3, - "mrpc": 2, - "sst-2": 2, - "sts-b": 1, - "qqp": 2, - "qnli": 2, - "rte": 2, - "wnli": 2, -} diff --git 
a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/__init__.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/data_processors.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/data_processors.py new file mode 100644 index 000000000000..b9d2a6bc4451 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/data_processors.py @@ -0,0 +1,302 @@ +import csv +import os + +from nemo import logging +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset import InputExample + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8-sig") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + # if sys.version_info[0] == 2: + # line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}') + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return 
self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[3] + text_b = line[4] + label = line[0] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[8] + text_b = line[9] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliMismatchedProcessor(MnliProcessor): + """Processor for the MultiNLI Mismatched data set (GLUE version).""" + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched") + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + 
def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line[3] + label = line[1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class Sst2Processor(DataProcessor): + """Processor for the SST-2 data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[0] + label = line[1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class StsbProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return [None] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, 
line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[7] + text_b = line[8] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QqpProcessor(DataProcessor): + """Processor for the QQP data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + try: + text_a = line[3] + text_b = line[4] + label = line[5] + except IndexError: + continue + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the QNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, 
text_b=text_b, label=label)) + return examples + + +class RteProcessor(DataProcessor): + """Processor for the RTE data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class WnliProcessor(DataProcessor): + """Processor for the WNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples \ No newline at end of file diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/glue_benchmark_dataset.py 
b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/glue_benchmark_dataset.py new file mode 100644 index 000000000000..6530a26dd508 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/glue_benchmark_dataset.py @@ -0,0 +1,295 @@ +""" +Copyright 2018 The Google AI Language Team Authors and +The HuggingFace Inc. team. +Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Utility functions for GLUE tasks +Some parts of this code were adapted from the HuggingFace library at +https://github.com/huggingface/transformers +""" + +import numpy as np +from torch.utils.data import Dataset + +from nemo import logging + +__all__ = ['GLUEDataset'] + +processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mnli-mm": MnliMismatchedProcessor, + "mrpc": MrpcProcessor, + "sst-2": Sst2Processor, + "sts-b": StsbProcessor, + "qqp": QqpProcessor, + "qnli": QnliProcessor, + "rte": RteProcessor, + "wnli": WnliProcessor, +} +output_modes = { + "cola": "classification", + "mnli": "classification", + "mnli-mm": "classification", + "mrpc": "classification", + "sst-2": "classification", + "sts-b": "regression", + "qqp": "classification", + "qnli": "classification", + "rte": "classification", + "wnli": "classification", +} +GLUE_TASKS_NUM_LABELS = { + "cola": 2, + "mnli": 3, + "mrpc": 2, + "sst-2": 2, + "sts-b": 1, + "qqp": 2, + "qnli": 2, + "rte": 2, + "wnli": 2, +} + +class GLUEDataset(Dataset): + def
__init__(self, data_dir, tokenizer, max_seq_length, processor, output_mode, evaluate, token_params): + self.tokenizer = tokenizer + self.label_list = processor.get_labels() + self.examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) + self.features = convert_examples_to_features( + self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params + ) + + def __len__(self): + return len(self.features) + + def __getitem__(self, idx): + feature = self.features[idx] + return ( + np.array(feature.input_ids), + np.array(feature.segment_ids), + np.array(feature.input_mask, dtype=np.long), + np.array(feature.label_id), + ) + + + def convert_examples_to_features( + examples, + label_list, + max_seq_length, + tokenizer, + output_mode, + bos_token=None, + eos_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + sep_token_extra=None, + cls_token_at_end=False, + cls_token_segment_id=0, + pad_token_segment_id=0, + pad_on_left=False, + mask_padding_with_zero=True, + sequence_a_segment_id=0, + sequence_b_segment_id=1, + ): + """ Loads a data file into a list of `InputBatch`s + `cls_token_at_end` define the location of the CLS token: + - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] + - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] + `cls_token_segment_id` define the segment id associated to the CLS + token (0 for BERT, 2 for XLNet) + The convention in BERT is: + (a) For sequence pairs: + tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] + type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 + (b) For single sequences: + tokens: [CLS] the dog is hairy . [SEP] + type_ids: 0 0 0 0 0 0 0 + Where "type_ids" are used to indicate whether this is the first + sequence or the second sequence. The embedding vectors for `type=0` + and `type=1` were learned during pre-training and are added to the + wordpiece embedding vector (and position vector). 
This is + not *strictly* necessary since the [SEP] token unambiguously separates + the sequences, but it makes it easier for the model to learn + the concept of sequences. + For classification tasks, the first vector (corresponding to [CLS]) + is used as the "sentence vector". Note that this only makes sense + because the entire model is fine-tuned. + For NMT: + (a) For sequence pairs: + tokens: is this jack ##ville ? no it is not . + type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 + (b) For single sequences: + tokens: the dog is hairy . + type_ids: 0 0 0 0 0 0 0 + """ + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for ex_index, example in enumerate(examples): + if ex_index % 10000 == 0: + logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + tokens_a = tokenizer.text_to_tokens(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.text_to_tokens(example.text_b) + + special_tokens_count = 2 if eos_token else 0 + special_tokens_count += 1 if sep_token_extra else 0 + special_tokens_count += 2 if bos_token else 0 + special_tokens_count += 1 if cls_token else 0 + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) + else: + special_tokens_count = 1 if eos_token else 0 + special_tokens_count += 1 if sep_token_extra else 0 + special_tokens_count += 1 if bos_token else 0 + if len(tokens_a) > max_seq_length - special_tokens_count: + tokens_a = tokens_a[: max_seq_length - special_tokens_count] + # Add special tokens to sequence_a + tokens = tokens_a + if bos_token: + tokens = [bos_token] + tokens + if eos_token: + tokens += [eos_token] + segment_ids = [sequence_a_segment_id] * len(tokens) + + # Add sequence separator between sequences + if tokens_b and sep_token_extra: + tokens += [sep_token_extra] + segment_ids += [sequence_a_segment_id] + + # Add special tokens to sequence_b + if tokens_b: + if bos_token: + tokens += [bos_token] + segment_ids += [sequence_b_segment_id] +
tokens += tokens_b + segment_ids += [sequence_b_segment_id] * (len(tokens_b)) + if eos_token: + tokens += [eos_token] + segment_ids += [sequence_b_segment_id] + + # Add classification token - for BERT models + if cls_token: + if cls_token_at_end: + tokens += [cls_token] + segment_ids += [cls_token_segment_id] + else: + tokens = [cls_token] + tokens + segment_ids = [cls_token_segment_id] + segment_ids + input_ids = tokenizer.tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. + padding_length = max_seq_length - len(input_ids) + pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] + if pad_on_left: + input_ids = ([pad_token_id] * padding_length) + input_ids + input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask + segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids + else: + input_ids = input_ids + ([pad_token_id] * padding_length) + input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) + if len(input_ids) != max_seq_length: + raise ValueError("input_ids must be of length max_seq_length") + if len(input_mask) != max_seq_length: + raise ValueError("input_mask must be of length max_seq_length") + if len(segment_ids) != max_seq_length: + raise ValueError("segment_ids must be of length max_seq_length") + if output_mode == "classification": + label_id = label_map[example.label] + elif output_mode == "regression": + label_id = np.float32(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 5: + logging.info("*** Example ***") + logging.info("guid: %s" % (example.guid)) + logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) + logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) + logging.info("input_mask: %s" 
% " ".join(list(map(str, input_mask)))) + logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) + logging.info("label: %s (id = %d)" % (example.label, label_id)) + + features.append( + InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id) + ) + return features + + + def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length. + + This will always truncate the longer sequence one token at a time. + This makes more sense than truncating an equal percent + of tokens from each, since if one sequence is very short then each token + that's truncated likely contains more information than a longer sequence. + """ + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + + """ + Utility functions for GLUE tasks + This code was adapted from the HuggingFace library at + https://github.com/huggingface/transformers + """ + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. + For single sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second + sequence. Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + + diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py deleted file mode 100644 index 7dc25bba3848..000000000000 --- a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py +++ /dev/null @@ -1,481 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -Utility functions for Token Classification NLP tasks -Some parts of this code were adapted from the HuggingFace library at -https://github.com/huggingface/pytorch-pretrained-BERT -""" -import itertools -import os -import random - -import numpy as np -from torch.utils.data import Dataset - -from nemo import logging -from nemo.collections.nlp.data.datasets.datasets_utils.datasets_processing import ( - process_atis, - process_jarvis_datasets, - process_snips, -) -from nemo.collections.nlp.data.datasets.datasets_utils.dialogflow_utils import process_dialogflow -from nemo.collections.nlp.data.datasets.datasets_utils.mturk_utils import process_mturk -from nemo.collections.nlp.data.datasets.datasets_utils.preprocessing import ( - DATABASE_EXISTS_TMP, - get_label_stats, - get_stats, -) -from nemo.collections.nlp.utils import list2str, write_vocab_in_order -from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights, get_vocab, if_exist, label2idx - -__all__ = ['BertJointIntentSlotDataset', 'BertJointIntentSlotInferDataset', 'JointIntentSlotDataDesc'] - - -def get_features( - queries, - max_seq_length, - tokenizer, - pad_label=128, - raw_slots=None, - ignore_extra_tokens=False, - ignore_start_end=False, -): - all_subtokens = [] - all_loss_mask = [] - all_subtokens_mask = [] - all_segment_ids = [] - all_input_ids = [] - all_input_mask = [] - sent_lengths = [] - all_slots = [] - - with_label = False - if raw_slots is not None: - with_label = True - - for i, query in enumerate(queries): - words = query.strip().split() - subtokens = ['[CLS]'] - loss_mask = [1 - ignore_start_end] - subtokens_mask = [0] - if with_label: - slots = [pad_label] - - for j, word in enumerate(words): - word_tokens = tokenizer.tokenize(word) - subtokens.extend(word_tokens) - - loss_mask.append(1) - loss_mask.extend([not ignore_extra_tokens] * (len(word_tokens) - 1)) - - subtokens_mask.append(1) - subtokens_mask.extend([0] * (len(word_tokens) - 1)) - - if with_label: - 
slots.extend([raw_slots[i][j]] * len(word_tokens)) - - subtokens.append('[SEP]') - loss_mask.append(not ignore_start_end) - subtokens_mask.append(0) - sent_lengths.append(len(subtokens)) - all_subtokens.append(subtokens) - all_loss_mask.append(loss_mask) - all_subtokens_mask.append(subtokens_mask) - all_input_mask.append([1] * len(subtokens)) - if with_label: - slots.append(pad_label) - all_slots.append(slots) - - max_seq_length = min(max_seq_length, max(sent_lengths)) - logging.info(f'Max length: {max_seq_length}') - get_stats(sent_lengths) - too_long_count = 0 - - for i, subtokens in enumerate(all_subtokens): - if len(subtokens) > max_seq_length: - subtokens = ['[CLS]'] + subtokens[-max_seq_length + 1 :] - all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :] - all_loss_mask[i] = [1 - ignore_start_end] + all_loss_mask[i][-max_seq_length + 1 :] - all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :] - - if with_label: - all_slots[i] = [pad_label] + all_slots[i][-max_seq_length + 1 :] - too_long_count += 1 - - all_input_ids.append([tokenizer._convert_token_to_id(t) for t in subtokens]) - - if len(subtokens) < max_seq_length: - extra = max_seq_length - len(subtokens) - all_input_ids[i] = all_input_ids[i] + [0] * extra - all_loss_mask[i] = all_loss_mask[i] + [0] * extra - all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra - all_input_mask[i] = all_input_mask[i] + [0] * extra - - if with_label: - all_slots[i] = all_slots[i] + [pad_label] * extra - - all_segment_ids.append([0] * max_seq_length) - - logging.info(f'{too_long_count} are longer than {max_seq_length}') - - return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_slots) - - -class BertJointIntentSlotDataset(Dataset): - """ - Creates dataset to use for the task of joint intent - and slot classification with pretrained model. - - Converts from raw data to an instance that can be used by - NMDataLayer. 
- - For dataset to use during inference without labels, see - BertJointIntentSlotInferDataset. - - Args: - input_file (str): file to sequence + label. - the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] - slot_file (str): file to slot labels, each line corresponding to - slot labels for a sentence in input_file. No header. - max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] - tokenizer (Tokenizer): such as BertTokenizer - num_samples (int): number of samples you want to use for the dataset. - If -1, use all dataset. Useful for testing. - shuffle (bool): whether to shuffle your data. - pad_label (int): pad value use for slot labels. - by default, it's the neutral label. - - """ - - def __init__( - self, - input_file, - slot_file, - max_seq_length, - tokenizer, - num_samples=-1, - shuffle=True, - pad_label=128, - ignore_extra_tokens=False, - ignore_start_end=False, - ): - if num_samples == 0: - raise ValueError("num_samples has to be positive", num_samples) - - with open(slot_file, 'r') as f: - slot_lines = f.readlines() - - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] - - assert len(slot_lines) == len(input_lines) - - dataset = list(zip(slot_lines, input_lines)) - - if shuffle or num_samples > 0: - random.shuffle(dataset) - if num_samples > 0: - dataset = dataset[:num_samples] - - raw_slots, queries, raw_intents = [], [], [] - for slot_line, input_line in dataset: - raw_slots.append([int(slot) for slot in slot_line.strip().split()]) - parts = input_line.strip().split() - raw_intents.append(int(parts[-1])) - queries.append(' '.join(parts[:-1])) - - features = get_features( - queries, - max_seq_length, - tokenizer, - pad_label=pad_label, - raw_slots=raw_slots, - ignore_extra_tokens=ignore_extra_tokens, - ignore_start_end=ignore_start_end, - ) - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - 
self.all_subtokens_mask = features[4] - self.all_slots = features[5] - self.all_intents = raw_intents - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.long), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - self.all_intents[idx], - np.array(self.all_slots[idx]), - ) - - -class BertJointIntentSlotInferDataset(Dataset): - """ - Creates dataset to use for the task of joint intent - and slot classification with pretrained model. - - Converts from raw data to an instance that can be used by - NMDataLayer. - - This is to be used during inference only. - For dataset to use during training with labels, see - BertJointIntentSlotDataset. - - Args: - queries (list): list of queries to run inference on - max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] - tokenizer (Tokenizer): such as BertTokenizer - pad_label (int): pad value use for slot labels. - by default, it's the neutral label. - - """ - - def __init__(self, queries, max_seq_length, tokenizer): - features = get_features(queries, max_seq_length, tokenizer) - - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.long), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - ) - - -class JointIntentSlotDataDesc: - """ Convert the raw data to the standard format supported by - JointIntentSlotDataset. - - By default, the None label for slots is 'O'. - - JointIntentSlotDataset requires two files: - - input_file: file to sequence + label. 
- the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] - - slot_file: file to slot labels, each line corresponding to - slot labels for a sentence in input_file. No header. - - To keep the mapping from label index to label consistent during - training and inferencing, we require the following files: - dicts.intents.csv: each line is an intent. The first line - corresponding to the 0 intent label, the second line - corresponding to the 1 intent label, and so on. - - dicts.slots.csv: each line is a slot. The first line - corresponding to the 0 slot label, the second line - corresponding to the 1 slot label, and so on. - - Args: - data_dir (str): the directory of the dataset - do_lower_case (bool): whether to set your dataset to lowercase - dataset_name (str): the name of the dataset. If it's a dataset - that follows the standard JointIntentSlotDataset format, - you can set the name as 'default'. - none_slot_label (str): the label for slots that aren't indentified - defaulted to 'O' - pad_label (int): the int used for padding. If set to -1, - it'll be set to the whatever the None label is. 
- - """ - - def __init__(self, data_dir, do_lower_case=False, dataset_name='default', none_slot_label='O', pad_label=-1): - if dataset_name == 'atis': - self.data_dir = process_atis(data_dir, do_lower_case) - elif dataset_name == 'snips-atis': - self.data_dir, self.pad_label = merge( - data_dir, ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'], dataset_name - ) - elif dataset_name == 'dialogflow': - self.data_dir = process_dialogflow(data_dir, do_lower_case) - elif dataset_name == 'mturk-processed': - self.data_dir = process_mturk(data_dir, do_lower_case) - elif dataset_name in set(['snips-light', 'snips-speak', 'snips-all']): - self.data_dir = process_snips(data_dir, do_lower_case) - if dataset_name.endswith('light'): - self.data_dir = f'{self.data_dir}/light' - elif dataset_name.endswith('speak'): - self.data_dir = f'{self.data_dir}/speak' - elif dataset_name.endswith('all'): - self.data_dir = f'{self.data_dir}/all' - elif dataset_name.startswith('jarvis'): - self.data_dir = process_jarvis_datasets( - data_dir, do_lower_case, dataset_name, modes=["train", "test", "eval"], ignore_prev_intent=False - ) - else: - if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']): - raise FileNotFoundError( - "Make sure that your data follows the standard format " - "supported by JointIntentSlotDataset. Your data must " - "contain dict.intents.csv and dict.slots.csv." 
- ) - self.data_dir = data_dir - - self.intent_dict_file = self.data_dir + '/dict.intents.csv' - self.slot_dict_file = self.data_dir + '/dict.slots.csv' - self.num_intents = len(get_vocab(self.intent_dict_file)) - slots = label2idx(self.slot_dict_file) - self.num_slots = len(slots) - - for mode in ['train', 'test', 'eval']: - - if not if_exist(self.data_dir, [f'{mode}.tsv']): - logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') - continue - - slot_file = f'{self.data_dir}/{mode}_slots.tsv' - with open(slot_file, 'r') as f: - slot_lines = f.readlines() - - input_file = f'{self.data_dir}/{mode}.tsv' - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] # Skipping headers at index 0 - - if len(slot_lines) != len(input_lines): - raise ValueError( - "Make sure that the number of slot lines match the " - "number of intent lines. There should be a 1-1 " - "correspondence between every slot and intent lines." - ) - - dataset = list(zip(slot_lines, input_lines)) - - raw_slots, queries, raw_intents = [], [], [] - for slot_line, input_line in dataset: - slot_list = [int(slot) for slot in slot_line.strip().split()] - raw_slots.append(slot_list) - parts = input_line.strip().split() - raw_intents.append(int(parts[-1])) - queries.append(' '.join(parts[:-1])) - - infold = input_file[: input_file.rfind('/')] - - logging.info(f'Three most popular intents during {mode}ing') - total_intents, intent_label_freq = get_label_stats(raw_intents, infold + f'/{mode}_intent_stats.tsv') - merged_slots = itertools.chain.from_iterable(raw_slots) - - logging.info(f'Three most popular slots during {mode}ing') - slots_total, slots_label_freq = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv') - - if mode == 'train': - self.slot_weights = calc_class_weights(slots_label_freq) - logging.info(f'Slot weights are - {self.slot_weights}') - - self.intent_weights = calc_class_weights(intent_label_freq) - logging.info(f'Intent weights 
are - {self.intent_weights}') - - logging.info(f'Total intents - {total_intents}') - logging.info(f'Intent label frequency - {intent_label_freq}') - logging.info(f'Total Slots - {slots_total}') - logging.info(f'Slots label frequency - {slots_label_freq}') - - if pad_label != -1: - self.pad_label = pad_label - else: - if none_slot_label not in slots: - raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.') - self.pad_label = slots[none_slot_label] - - -def merge(data_dir, subdirs, dataset_name, modes=['train', 'test']): - outfold = f'{data_dir}/{dataset_name}' - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold)) - slots = get_vocab(f'{outfold}/dict.slots.csv') - none_slot = 0 - for key in slots: - if slots[key] == 'O': - none_slot = key - break - return outfold, int(none_slot) - - os.makedirs(outfold, exist_ok=True) - - data_files, slot_files = {}, {} - for mode in modes: - data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w') - data_files[mode].write('sentence\tlabel\n') - slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w') - - intents, slots = {}, {} - intent_shift, slot_shift = 0, 0 - none_intent, none_slot = -1, -1 - - for subdir in subdirs: - curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv') - curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv') - - for key in curr_intents: - if intent_shift > 0 and curr_intents[key] == 'O': - continue - if curr_intents[key] == 'O' and intent_shift == 0: - none_intent = int(key) - intents[int(key) + intent_shift] = curr_intents[key] - - for key in curr_slots: - if slot_shift > 0 and curr_slots[key] == 'O': - continue - if slot_shift == 0 and curr_slots[key] == 'O': - none_slot = int(key) - slots[int(key) + slot_shift] = curr_slots[key] - - for mode in modes: - with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f: - for line in f.readlines()[1:]: - text, label = 
line.strip().split('\t') - label = int(label) - if curr_intents[label] == 'O': - label = none_intent - else: - label = label + intent_shift - data_files[mode].write(f'{text}\t{label}\n') - - with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f: - for line in f.readlines(): - labels = [int(label) for label in line.strip().split()] - shifted_labels = [] - for label in labels: - if curr_slots[label] == 'O': - shifted_labels.append(none_slot) - else: - shifted_labels.append(label + slot_shift) - slot_files[mode].write(list2str(shifted_labels) + '\n') - - intent_shift += len(curr_intents) - slot_shift += len(curr_slots) - - write_vocab_in_order(intents, f'{outfold}/dict.intents.csv') - write_vocab_in_order(slots, f'{outfold}/dict.slots.csv') - return outfold, none_slot diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/data_descriptor.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/data_descriptor.py new file mode 100644 index 000000000000..15bde0d4b4cb --- /dev/null +++ b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/data_descriptor.py @@ -0,0 +1,217 @@ +import itertools +import os + +from nemo import logging +from nemo.collections.nlp.data import process_atis, process_dialogflow, process_mturk, process_snips, \ + process_jarvis_datasets, DATABASE_EXISTS_TMP +from nemo.collections.nlp.data.datasets.datasets_utils.preprocessing import get_label_stats +from nemo.collections.nlp.utils import if_exist, get_vocab, label2idx, calc_class_weights, list2str, \ + write_vocab_in_order + + +class JointIntentSlotDataDesc: + """ Convert the raw data to the standard format supported by + JointIntentSlotDataset. + + By default, the None label for slots is 'O'. + + JointIntentSlotDataset requires two files: + + input_file: file to sequence + label. 
+ the first line is header (sentence [tab] label) + each line should be [sentence][tab][label] + + slot_file: file to slot labels, each line corresponding to + slot labels for a sentence in input_file. No header. + + To keep the mapping from label index to label consistent during + training and inference, we require the following files: + dict.intents.csv: each line is an intent. The first line + corresponds to the 0 intent label, the second line + corresponds to the 1 intent label, and so on. + + dict.slots.csv: each line is a slot. The first line + corresponds to the 0 slot label, the second line + corresponds to the 1 slot label, and so on. + + Args: + data_dir (str): the directory of the dataset + do_lower_case (bool): whether to set your dataset to lowercase + dataset_name (str): the name of the dataset. If it's a dataset + that follows the standard JointIntentSlotDataset format, + you can set the name as 'default'. + none_slot_label (str): the label for slots that aren't identified; + defaults to 'O' + pad_label (int): the int used for padding. If set to -1, + it'll be set to whatever the None label is.
+ + """ + + def __init__(self, data_dir, do_lower_case=False, dataset_name='default', none_slot_label='O', pad_label=-1): + if dataset_name == 'atis': + self.data_dir = process_atis(data_dir, do_lower_case) + elif dataset_name == 'snips-atis': + self.data_dir, self.pad_label = merge( + data_dir, ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'], dataset_name + ) + elif dataset_name == 'dialogflow': + self.data_dir = process_dialogflow(data_dir, do_lower_case) + elif dataset_name == 'mturk-processed': + self.data_dir = process_mturk(data_dir, do_lower_case) + elif dataset_name in set(['snips-light', 'snips-speak', 'snips-all']): + self.data_dir = process_snips(data_dir, do_lower_case) + if dataset_name.endswith('light'): + self.data_dir = f'{self.data_dir}/light' + elif dataset_name.endswith('speak'): + self.data_dir = f'{self.data_dir}/speak' + elif dataset_name.endswith('all'): + self.data_dir = f'{self.data_dir}/all' + elif dataset_name.startswith('jarvis'): + self.data_dir = process_jarvis_datasets( + data_dir, do_lower_case, dataset_name, modes=["train", "test", "eval"], ignore_prev_intent=False + ) + else: + if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']): + raise FileNotFoundError( + "Make sure that your data follows the standard format " + "supported by JointIntentSlotDataset. Your data must " + "contain dict.intents.csv and dict.slots.csv." 
+ ) + self.data_dir = data_dir + + self.intent_dict_file = self.data_dir + '/dict.intents.csv' + self.slot_dict_file = self.data_dir + '/dict.slots.csv' + self.num_intents = len(get_vocab(self.intent_dict_file)) + slots = label2idx(self.slot_dict_file) + self.num_slots = len(slots) + + for mode in ['train', 'test', 'eval']: + + if not if_exist(self.data_dir, [f'{mode}.tsv']): + logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') + continue + + slot_file = f'{self.data_dir}/{mode}_slots.tsv' + with open(slot_file, 'r') as f: + slot_lines = f.readlines() + + input_file = f'{self.data_dir}/{mode}.tsv' + with open(input_file, 'r') as f: + input_lines = f.readlines()[1:] # Skipping headers at index 0 + + if len(slot_lines) != len(input_lines): + raise ValueError( + "Make sure that the number of slot lines match the " + "number of intent lines. There should be a 1-1 " + "correspondence between every slot and intent lines." + ) + + dataset = list(zip(slot_lines, input_lines)) + + raw_slots, queries, raw_intents = [], [], [] + for slot_line, input_line in dataset: + slot_list = [int(slot) for slot in slot_line.strip().split()] + raw_slots.append(slot_list) + parts = input_line.strip().split() + raw_intents.append(int(parts[-1])) + queries.append(' '.join(parts[:-1])) + + infold = input_file[: input_file.rfind('/')] + + logging.info(f'Three most popular intents during {mode}ing') + total_intents, intent_label_freq = get_label_stats(raw_intents, infold + f'/{mode}_intent_stats.tsv') + merged_slots = itertools.chain.from_iterable(raw_slots) + + logging.info(f'Three most popular slots during {mode}ing') + slots_total, slots_label_freq = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv') + + if mode == 'train': + self.slot_weights = calc_class_weights(slots_label_freq) + logging.info(f'Slot weights are - {self.slot_weights}') + + self.intent_weights = calc_class_weights(intent_label_freq) + logging.info(f'Intent weights 
are - {self.intent_weights}') + + logging.info(f'Total intents - {total_intents}') + logging.info(f'Intent label frequency - {intent_label_freq}') + logging.info(f'Total Slots - {slots_total}') + logging.info(f'Slots label frequency - {slots_label_freq}') + + if pad_label != -1: + self.pad_label = pad_label + else: + if none_slot_label not in slots: + raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.') + self.pad_label = slots[none_slot_label] + + +def merge(data_dir, subdirs, dataset_name, modes=['train', 'test']): + outfold = f'{data_dir}/{dataset_name}' + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold)) + slots = get_vocab(f'{outfold}/dict.slots.csv') + none_slot = 0 + for key in slots: + if slots[key] == 'O': + none_slot = key + break + return outfold, int(none_slot) + + os.makedirs(outfold, exist_ok=True) + + data_files, slot_files = {}, {} + for mode in modes: + data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w') + data_files[mode].write('sentence\tlabel\n') + slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w') + + intents, slots = {}, {} + intent_shift, slot_shift = 0, 0 + none_intent, none_slot = -1, -1 + + for subdir in subdirs: + curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv') + curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv') + + for key in curr_intents: + if intent_shift > 0 and curr_intents[key] == 'O': + continue + if curr_intents[key] == 'O' and intent_shift == 0: + none_intent = int(key) + intents[int(key) + intent_shift] = curr_intents[key] + + for key in curr_slots: + if slot_shift > 0 and curr_slots[key] == 'O': + continue + if slot_shift == 0 and curr_slots[key] == 'O': + none_slot = int(key) + slots[int(key) + slot_shift] = curr_slots[key] + + for mode in modes: + with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f: + for line in f.readlines()[1:]: + text, label = 
line.strip().split('\t') + label = int(label) + if curr_intents[label] == 'O': + label = none_intent + else: + label = label + intent_shift + data_files[mode].write(f'{text}\t{label}\n') + + with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f: + for line in f.readlines(): + labels = [int(label) for label in line.strip().split()] + shifted_labels = [] + for label in labels: + if curr_slots[label] == 'O': + shifted_labels.append(none_slot) + else: + shifted_labels.append(label + slot_shift) + slot_files[mode].write(list2str(shifted_labels) + '\n') + + intent_shift += len(curr_intents) + slot_shift += len(curr_slots) + + write_vocab_in_order(intents, f'{outfold}/dict.intents.csv') + write_vocab_in_order(slots, f'{outfold}/dict.slots.csv') + return outfold, none_slot \ No newline at end of file diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/joint_intent_slot_dataset.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/joint_intent_slot_dataset.py new file mode 100644 index 000000000000..2aad0f79b732 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/joint_intent_slot_dataset.py @@ -0,0 +1,262 @@ +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Utility functions for Token Classification NLP tasks +Some parts of this code were adapted from the HuggingFace library at +https://github.com/huggingface/pytorch-pretrained-BERT +""" +import random + +import numpy as np +from torch.utils.data import Dataset + +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils.preprocessing import ( + get_stats, +) + +__all__ = ['BertJointIntentSlotDataset', 'BertJointIntentSlotInferDataset'] + + +def get_features( + queries, + max_seq_length, + tokenizer, + pad_label=128, + raw_slots=None, + ignore_extra_tokens=False, + ignore_start_end=False, +): + all_subtokens = [] + all_loss_mask = [] + all_subtokens_mask = [] + all_segment_ids = [] + all_input_ids = [] + all_input_mask = [] + sent_lengths = [] + all_slots = [] + + with_label = False + if raw_slots is not None: + with_label = True + + for i, query in enumerate(queries): + words = query.strip().split() + subtokens = ['[CLS]'] + loss_mask = [1 - ignore_start_end] + subtokens_mask = [0] + if with_label: + slots = [pad_label] + + for j, word in enumerate(words): + word_tokens = tokenizer.tokenize(word) + subtokens.extend(word_tokens) + + loss_mask.append(1) + loss_mask.extend([not ignore_extra_tokens] * (len(word_tokens) - 1)) + + subtokens_mask.append(1) + subtokens_mask.extend([0] * (len(word_tokens) - 1)) + + if with_label: + slots.extend([raw_slots[i][j]] * len(word_tokens)) + + subtokens.append('[SEP]') + loss_mask.append(not ignore_start_end) + subtokens_mask.append(0) + sent_lengths.append(len(subtokens)) + all_subtokens.append(subtokens) + all_loss_mask.append(loss_mask) + all_subtokens_mask.append(subtokens_mask) + all_input_mask.append([1] * len(subtokens)) + if with_label: + slots.append(pad_label) + all_slots.append(slots) + + max_seq_length = min(max_seq_length, max(sent_lengths)) + logging.info(f'Max length: {max_seq_length}') + get_stats(sent_lengths) + too_long_count = 0 + + for i, subtokens in enumerate(all_subtokens): + if 
len(subtokens) > max_seq_length: + subtokens = ['[CLS]'] + subtokens[-max_seq_length + 1 :] + all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :] + all_loss_mask[i] = [1 - ignore_start_end] + all_loss_mask[i][-max_seq_length + 1 :] + all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :] + + if with_label: + all_slots[i] = [pad_label] + all_slots[i][-max_seq_length + 1 :] + too_long_count += 1 + + all_input_ids.append([tokenizer._convert_token_to_id(t) for t in subtokens]) + + if len(subtokens) < max_seq_length: + extra = max_seq_length - len(subtokens) + all_input_ids[i] = all_input_ids[i] + [0] * extra + all_loss_mask[i] = all_loss_mask[i] + [0] * extra + all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra + all_input_mask[i] = all_input_mask[i] + [0] * extra + + if with_label: + all_slots[i] = all_slots[i] + [pad_label] * extra + + all_segment_ids.append([0] * max_seq_length) + + logging.info(f'{too_long_count} are longer than {max_seq_length}') + + return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_slots) + + +class BertJointIntentSlotDataset(Dataset): + """ + Creates dataset to use for the task of joint intent + and slot classification with pretrained model. + + Converts from raw data to an instance that can be used by + NMDataLayer. + + For dataset to use during inference without labels, see + BertJointIntentSlotInferDataset. + + Args: + input_file (str): file to sequence + label. + the first line is header (sentence [tab] label) + each line should be [sentence][tab][label] + slot_file (str): file to slot labels, each line corresponding to + slot labels for a sentence in input_file. No header. + max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] + tokenizer (Tokenizer): such as BertTokenizer + num_samples (int): number of samples you want to use for the dataset. + If -1, use all dataset. Useful for testing. + shuffle (bool): whether to shuffle your data. 
+ pad_label (int): pad value use for slot labels. + by default, it's the neutral label. + + """ + + def __init__( + self, + input_file, + slot_file, + max_seq_length, + tokenizer, + num_samples=-1, + shuffle=True, + pad_label=128, + ignore_extra_tokens=False, + ignore_start_end=False, + ): + if num_samples == 0: + raise ValueError("num_samples has to be positive", num_samples) + + with open(slot_file, 'r') as f: + slot_lines = f.readlines() + + with open(input_file, 'r') as f: + input_lines = f.readlines()[1:] + + assert len(slot_lines) == len(input_lines) + + dataset = list(zip(slot_lines, input_lines)) + + if shuffle or num_samples > 0: + random.shuffle(dataset) + if num_samples > 0: + dataset = dataset[:num_samples] + + raw_slots, queries, raw_intents = [], [], [] + for slot_line, input_line in dataset: + raw_slots.append([int(slot) for slot in slot_line.strip().split()]) + parts = input_line.strip().split() + raw_intents.append(int(parts[-1])) + queries.append(' '.join(parts[:-1])) + + features = get_features( + queries, + max_seq_length, + tokenizer, + pad_label=pad_label, + raw_slots=raw_slots, + ignore_extra_tokens=ignore_extra_tokens, + ignore_start_end=ignore_start_end, + ) + self.all_input_ids = features[0] + self.all_segment_ids = features[1] + self.all_input_mask = features[2] + self.all_loss_mask = features[3] + self.all_subtokens_mask = features[4] + self.all_slots = features[5] + self.all_intents = raw_intents + + def __len__(self): + return len(self.all_input_ids) + + def __getitem__(self, idx): + return ( + np.array(self.all_input_ids[idx]), + np.array(self.all_segment_ids[idx]), + np.array(self.all_input_mask[idx], dtype=np.long), + np.array(self.all_loss_mask[idx]), + np.array(self.all_subtokens_mask[idx]), + self.all_intents[idx], + np.array(self.all_slots[idx]), + ) + + +class BertJointIntentSlotInferDataset(Dataset): + """ + Creates dataset to use for the task of joint intent + and slot classification with pretrained model. 
+ + Converts from raw data to an instance that can be used by + NMDataLayer. + + This is to be used during inference only. + For dataset to use during training with labels, see + BertJointIntentSlotDataset. + + Args: + queries (list): list of queries to run inference on + max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] + tokenizer (Tokenizer): such as BertTokenizer + pad_label (int): pad value use for slot labels. + by default, it's the neutral label. + + """ + + def __init__(self, queries, max_seq_length, tokenizer): + features = get_features(queries, max_seq_length, tokenizer) + + self.all_input_ids = features[0] + self.all_segment_ids = features[1] + self.all_input_mask = features[2] + self.all_loss_mask = features[3] + self.all_subtokens_mask = features[4] + + def __len__(self): + return len(self.all_input_ids) + + def __getitem__(self, idx): + return ( + np.array(self.all_input_ids[idx]), + np.array(self.all_segment_ids[idx]), + np.array(self.all_input_mask[idx], dtype=np.long), + np.array(self.all_loss_mask[idx]), + np.array(self.all_subtokens_mask[idx]), + ) + + diff --git a/nemo/collections/nlp/data/datasets/lm_bert_dataset.py b/nemo/collections/nlp/data/datasets/lm_bert_dataset.py index f068e614afa3..2d50f4e1e4f7 100644 --- a/nemo/collections/nlp/data/datasets/lm_bert_dataset.py +++ b/nemo/collections/nlp/data/datasets/lm_bert_dataset.py @@ -29,10 +29,12 @@ from tqdm import tqdm from nemo import logging -from nemo.collections.nlp.data.datasets.lm_transformer_dataset import create_vocab_mlm __all__ = ['BertPretrainingDataset', 'BertPretrainingPreprocessedDataset'] +from nemo.collections.nlp.data import DATABASE_EXISTS_TMP +from nemo.collections.nlp.utils import if_exist + class BertPretrainingDataset(Dataset): def __init__( @@ -394,3 +396,57 @@ def __init__(self, dataset_name, data_dir, vocab_size, sample_size, special_toke self.train_file = f'{data_dir}/train.txt' self.eval_file = f'{data_dir}/valid.txt' self.test_file = 
f'{data_dir}/test.txt' + + + def create_vocab_mlm( + data_dir, vocab_size, sample_size, special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], train_file='' + ): + vocab = special_tokens[:] + bert_dir = f'{data_dir}/bert' + if if_exist(bert_dir, ['tokenizer.model']): + logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir)) + return data_dir, f'{bert_dir}/tokenizer.model' + logging.info(f'Processing WikiText dataset and store at {bert_dir}') + os.makedirs(bert_dir, exist_ok=True) + + if not train_file: + files = glob.glob(f'{data_dir}/*.txt') + train_file = f'{bert_dir}/merged.txt' + logging.info(f"Merging {len(files)} txt files into {train_file}") + + with open(train_file, "w") as merged: + for file in tqdm(files): + with open(file, 'r') as inf: + content = inf.read().strip() + merged.write(content + '\n\n\n') + else: + train_file = f'{data_dir}/{train_file}' + + cmd = ( + f"--input={train_file} --model_prefix={bert_dir}/tokenizer " + f"--vocab_size={vocab_size - len(vocab)} " + f"--input_sentence_size={sample_size} " + f"--shuffle_input_sentence=true --hard_vocab_limit=false " + f"--bos_id=-1 --eos_id=-1" + ) + SPT.Train(cmd) + + # Add BERT control symbols + tokens = [] + + with open(f"{bert_dir}/tokenizer.vocab", "r") as f: + f.readline() # skip first token + + # Read tokens from each line and parse for vocab + for line in f: + piece = line.split("\t")[0] + token = piece[1:] if piece.startswith("▁") else f"##{piece}" + tokens.append(token) + + vocab.extend(tokens) + + # Save vocabulary to output file + with open(f'{bert_dir}/vocab.txt', "w") as f: + for token in vocab: + f.write(f"{token}\n".format()) + return data_dir, f'{bert_dir}/tokenizer.model' \ No newline at end of file diff --git a/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py b/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py index 5d8f20723c6e..7d2075f82fea 100644 --- a/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py +++ 
b/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py @@ -15,18 +15,15 @@ # ============================================================================= """Pytorch Dataset for training Neural Machine Translation.""" -import glob import os -import pickle import re import numpy as np -from sentencepiece import SentencePieceTrainer as SPT from torch.utils.data import Dataset -from tqdm import tqdm from nemo import logging -from nemo.collections.nlp.data.datasets.datasets_utils import DATABASE_EXISTS_TMP, download_wkt2 +from nemo.collections.nlp.data.datasets.datasets_utils import download_wkt2 +from nemo.collections.nlp.data.datasets.datasets_utils.datasets_processing import dataset_to_ids from nemo.collections.nlp.utils.common_nlp_utils import if_exist __all__ = ['LanguageModelingDataset'] @@ -66,122 +63,33 @@ def __init__(self, dataset_name, data_dir, do_lower_case): "you build the preprocessing method for it." ) - -def create_vocab_mlm( - data_dir, vocab_size, sample_size, special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], train_file='' -): - vocab = special_tokens[:] - bert_dir = f'{data_dir}/bert' - if if_exist(bert_dir, ['tokenizer.model']): - logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir)) - return data_dir, f'{bert_dir}/tokenizer.model' - logging.info(f'Processing WikiText dataset and store at {bert_dir}') - os.makedirs(bert_dir, exist_ok=True) - - if not train_file: - files = glob.glob(f'{data_dir}/*.txt') - train_file = f'{bert_dir}/merged.txt' - logging.info(f"Merging {len(files)} txt files into {train_file}") - - with open(train_file, "w") as merged: - for file in tqdm(files): - with open(file, 'r') as inf: - content = inf.read().strip() - merged.write(content + '\n\n\n') - else: - train_file = f'{data_dir}/{train_file}' - - cmd = ( - f"--input={train_file} --model_prefix={bert_dir}/tokenizer " - f"--vocab_size={vocab_size - len(vocab)} " - f"--input_sentence_size={sample_size} " - f"--shuffle_input_sentence=true 
--hard_vocab_limit=false " - f"--bos_id=-1 --eos_id=-1" - ) - SPT.Train(cmd) - - # Add BERT control symbols - tokens = [] - - with open(f"{bert_dir}/tokenizer.vocab", "r") as f: - f.readline() # skip first token - - # Read tokens from each line and parse for vocab - for line in f: - piece = line.split("\t")[0] - token = piece[1:] if piece.startswith("▁") else f"##{piece}" - tokens.append(token) - - vocab.extend(tokens) - - # Save vocabulary to output file - with open(f'{bert_dir}/vocab.txt', "w") as f: - for token in vocab: - f.write(f"{token}\n".format()) - return data_dir, f'{bert_dir}/tokenizer.model' - - -def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True): - """ - Reads dataset from file line by line, tokenizes each line with tokenizer, - and returns list of lists which corresponds to ids of tokenized strings. - - Args: - dataset: path to dataset - tokenizer: tokenizer to convert text into ids - cache_ids: if True, ids are saved to disk as pickle file - with similar name (e.g., data.txt --> data.txt.pkl) - add_bos_eos: bool, whether to add and symbols (e.g., for NMT) - Returns: - ids: list of ids which correspond to tokenized strings of the dataset - """ - - cached_ids_dataset = dataset + str(".pkl") - if os.path.isfile(cached_ids_dataset): - logging.info("Loading cached tokenized dataset ...") - ids = pickle.load(open(cached_ids_dataset, "rb")) - else: - logging.info("Tokenizing dataset ...") - data = open(dataset, "rb").readlines() - ids = [] - for sentence in data: - sent_ids = tokenizer.text_to_ids(sentence.decode("utf-8")) - if add_bos_eos: - sent_ids = [tokenizer.bos_id] + sent_ids + [tokenizer.eos_id] - ids.append(sent_ids) - if cache_ids: - logging.info("Caching tokenized dataset ...") - pickle.dump(ids, open(cached_ids_dataset, "wb")) - return ids - - -def create_vocab_lm(data_dir, do_lower_case): - if if_exist(data_dir, ['train.txt', 'vocab.txt']): - logging.info("Vocabulary has been created.") - with open(os.path.join(data_dir, 
'vocab.txt'), 'r') as f: - vocab_size = len(f.readlines()) - return vocab_size - - logging.info(f'Creating vocabulary from training data at {data_dir}') - - with open(f'{data_dir}/train.txt', 'r') as f: - txt = f.read() - if do_lower_case: - txt = txt.lower() - lines = re.split(r'[\n]', txt) - sentences = [line.strip().split() for line in lines if line.strip()] - - vocab = {"[PAD]": 0, "[SEP]": 1, "[CLS]": 2, "[MASK]": 3} - idx = 4 - for sentence in sentences: - for word in sentence: - if word not in vocab: - vocab[word] = idx - idx += 1 - - with open(f'{data_dir}/vocab.txt', 'w') as f: - for word in sorted(vocab.keys()): - f.write(word + '\n') - logging.info(f"Created vocabulary of size {len(vocab)}") - - return len(vocab) + def create_vocab_lm(data_dir, do_lower_case): + if if_exist(data_dir, ['train.txt', 'vocab.txt']): + logging.info("Vocabulary has been created.") + with open(os.path.join(data_dir, 'vocab.txt'), 'r') as f: + vocab_size = len(f.readlines()) + return vocab_size + + logging.info(f'Creating vocabulary from training data at {data_dir}') + + with open(f'{data_dir}/train.txt', 'r') as f: + txt = f.read() + if do_lower_case: + txt = txt.lower() + lines = re.split(r'[\n]', txt) + sentences = [line.strip().split() for line in lines if line.strip()] + + vocab = {"[PAD]": 0, "[SEP]": 1, "[CLS]": 2, "[MASK]": 3} + idx = 4 + for sentence in sentences: + for word in sentence: + if word not in vocab: + vocab[word] = idx + idx += 1 + + with open(f'{data_dir}/vocab.txt', 'w') as f: + for word in sorted(vocab.keys()): + f.write(word + '\n') + logging.info(f"Created vocabulary of size {len(vocab)}") + + return len(vocab) diff --git a/nemo/collections/nlp/data/datasets/machine_translation_dataset.py b/nemo/collections/nlp/data/datasets/machine_translation_dataset.py index db8e6b7ace2d..e3f2bfd8e102 100644 --- a/nemo/collections/nlp/data/datasets/machine_translation_dataset.py +++ b/nemo/collections/nlp/data/datasets/machine_translation_dataset.py @@ -21,7 +21,7 @@ 
import numpy as np from torch.utils.data import Dataset -from nemo.collections.nlp.data.datasets.lm_transformer_dataset import dataset_to_ids +from nemo.collections.nlp.data.datasets.datasets_utils.datasets_processing import dataset_to_ids __all__ = ['TranslationDataset'] @@ -157,34 +157,34 @@ def pack_data_into_batches(self, src_ids, tgt_ids): return batches -def clean_src_and_target(src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5): - """ - Cleans source and target sentences to get rid of noisy data. - Specifically, a pair of sentences is removed if - -- either source or target is longer than *max_tokens* - -- either source or target is shorter than *min_tokens* - -- absolute difference between source and target is larger than - *max_tokens_diff* - -- one sentence is *max_tokens_ratio* times longer than the other - """ - - if len(src_ids) != len(tgt_ids): - raise ValueError("Source and target corpora have different lengths!") - src_ids_, tgt_ids_ = [], [] - for i in range(len(src_ids)): - src_len, tgt_len = len(src_ids[i]), len(tgt_ids[i]) - if ( - src_len > max_tokens - or tgt_len > max_tokens - or src_len < min_tokens - or tgt_len < min_tokens - or (src_ids[i] == tgt_ids[i]) - or np.abs(src_len - tgt_len) > max_tokens_diff - ): - continue - ratio = max(src_len - 2, 1) / max(tgt_len - 2, 1) - if ratio > max_tokens_ratio or ratio < (1 / max_tokens_ratio): - continue - src_ids_.append(src_ids[i]) - tgt_ids_.append(tgt_ids[i]) - return src_ids_, tgt_ids_ + def clean_src_and_target(src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5): + """ + Cleans source and target sentences to get rid of noisy data. 
+ Specifically, a pair of sentences is removed if + -- either source or target is longer than *max_tokens* + -- either source or target is shorter than *min_tokens* + -- absolute difference between source and target is larger than + *max_tokens_diff* + -- one sentence is *max_tokens_ratio* times longer than the other + """ + + if len(src_ids) != len(tgt_ids): + raise ValueError("Source and target corpora have different lengths!") + src_ids_, tgt_ids_ = [], [] + for i in range(len(src_ids)): + src_len, tgt_len = len(src_ids[i]), len(tgt_ids[i]) + if ( + src_len > max_tokens + or tgt_len > max_tokens + or src_len < min_tokens + or tgt_len < min_tokens + or (src_ids[i] == tgt_ids[i]) + or np.abs(src_len - tgt_len) > max_tokens_diff + ): + continue + ratio = max(src_len - 2, 1) / max(tgt_len - 2, 1) + if ratio > max_tokens_ratio or ratio < (1 / max_tokens_ratio): + continue + src_ids_.append(src_ids[i]) + tgt_ids_.append(tgt_ids[i]) + return src_ids_, tgt_ids_ diff --git a/nemo/collections/nlp/data/datasets/qa_squad_dataset.py b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_dataset.py similarity index 67% rename from nemo/collections/nlp/data/datasets/qa_squad_dataset.py rename to nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_dataset.py index b927f83ead38..0eaeba4528fe 100644 --- a/nemo/collections/nlp/data/datasets/qa_squad_dataset.py +++ b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_dataset.py @@ -26,7 +26,8 @@ from tqdm import tqdm from nemo import logging -from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import DataProcessor +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.data_processors import DataProcessor +from nemo.collections.nlp.data.datasets.qa_squad_dataset.qa_squad_processing import convert_examples_to_features from nemo.collections.nlp.metrics.squad_metrics import ( _get_best_indexes, apply_no_ans_threshold, @@ -403,177 +404,6 @@ def evaluate( return exact_match, f1, 
all_predictions -def convert_examples_to_features( - examples, tokenizer, max_seq_length, doc_stride, max_query_length, has_groundtruth, -): - """Loads a data file into a list of `InputBatch`s.""" - - unique_id = 1000000000 - - features = [] - for (example_index, example) in enumerate(examples): - query_tokens = tokenizer.text_to_tokens(example.question_text) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - # context: index of token -> index of word - tok_to_orig_index = [] - # context: index of word -> index of first token in token list - orig_to_tok_index = [] - # context without white spaces after tokenization - all_doc_tokens = [] - # doc tokens is word separated context - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.text_to_tokens(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - # idx of query token start and end in context - tok_start_position = None - tok_end_position = None - if has_groundtruth and example.is_impossible: - tok_start_position = -1 - tok_end_position = -1 - if has_groundtruth and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text - ) - - # The -3 accounts for tokenizer.cls_token, tokenizer.sep_token and tokenizer.eos_token - # doc_spans contains all possible contexts options of given length - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < 
len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - # maps context tokens idx in final input -> word idx in context - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - tokens.append(tokenizer.bos_token) - segment_ids.append(0) - for token in query_tokens: - tokens.append(token) - segment_ids.append(0) - tokens.append(tokenizer.sep_token) - segment_ids.append(0) - - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - segment_ids.append(1) - tokens.append(tokenizer.eos_token) - segment_ids.append(1) - - input_ids = tokenizer.tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. - # Only real tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. 
- while len(input_ids) < max_seq_length: - input_ids.append(tokenizer.pad_id) - input_mask.append(0) - segment_ids.append(0) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - # calculate start and end position in final array - # of tokens in answer if no answer, - # 0 for both pointing to tokenizer.cls_token - start_position = None - end_position = None - if has_groundtruth and not example.is_impossible: - doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - out_of_span = False - if not (tok_start_position >= doc_start and tok_end_position <= doc_end): - out_of_span = True - if out_of_span: - start_position = 0 - end_position = 0 - else: - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - if has_groundtruth and example.is_impossible: - # if our document chunk does not contain - # an annotation we throw it out, since there is nothing - # to predict. 
- start_position = 0 - end_position = 0 - - if example_index < 1: - logging.info("*** Example ***") - logging.info("unique_id: %s" % (unique_id)) - logging.info("example_index: %s" % (example_index)) - logging.info("doc_span_index: %s" % (doc_span_index)) - logging.info("tokens: %s" % " ".join(tokens)) - logging.info( - "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]) - ) - logging.info( - "token_is_max_context: %s" - % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]) - ) - logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - if has_groundtruth and example.is_impossible: - logging.info("impossible example") - if has_groundtruth and not example.is_impossible: - answer_text = " ".join(tokens[start_position : (end_position + 1)]) - logging.info("start_position: %d" % (start_position)) - logging.info("end_position: %d" % (end_position)) - logging.info("answer: %s" % (answer_text)) - - features.append( - InputFeatures( - unique_id=unique_id, - example_index=example_index, - doc_span_index=doc_span_index, - tokens=tokens, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - start_position=start_position, - end_position=end_position, - is_impossible=example.is_impossible, - ) - ) - unique_id += 1 - - return features - - class InputFeatures(object): """A single set of features of data.""" @@ -738,77 +568,3 @@ def __init__( ] -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): - """Returns tokenized answer spans that - better match the annotated answer.""" - tok_answer_text = " ".join(tokenizer.text_to_tokens(orig_answer_text)) - - for new_start in range(input_start, input_end + 1): - for 
new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) - - return (input_start, input_end) - - -def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - -def check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token. - - Because of the sliding window approach taken to scoring documents, - a single token can appear in multiple documents. - - Example: - Doc: the man went to the store and bought a gallon of milk - Span A: the man went to the - Span B: to the store and bought - Span C: and bought a gallon of - ... - - Now the word 'bought' will have two scores from spans B and C. We only - want to consider the score with "maximum context", which we define as - the *minimum* of its left and right context (the *sum* of left and - right context will always be the same, of course). - - In the example the maximum context for 'bought' would be span C since - it has 1 left context and 3 right context, while span B has 4 left context - and 0 right context. - - Code adapted from the code by the Google AI and HuggingFace. 
- """ - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index diff --git a/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_processing.py b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_processing.py new file mode 100644 index 000000000000..7f8a84fbff2d --- /dev/null +++ b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_processing.py @@ -0,0 +1,231 @@ +import collections + +from nemo import logging +from nemo.collections.nlp.data.datasets.qa_squad_dataset.qa_squad_dataset import InputFeatures + + +def convert_examples_to_features( + examples, tokenizer, max_seq_length, doc_stride, max_query_length, has_groundtruth, +): + """Loads a data file into a list of `InputBatch`s.""" + + unique_id = 1000000000 + + features = [] + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.text_to_tokens(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + # context: index of token -> index of word + tok_to_orig_index = [] + # context: index of word -> index of first token in token list + orig_to_tok_index = [] + # context without white spaces after tokenization + all_doc_tokens = [] + # doc tokens is word separated context + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.text_to_tokens(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + # idx of query token start 
and end in context + tok_start_position = None + tok_end_position = None + if has_groundtruth and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if has_groundtruth and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text + ) + + # The -3 accounts for tokenizer.cls_token, tokenizer.sep_token and tokenizer.eos_token + # doc_spans contains all possible contexts options of given length + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + # maps context tokens idx in final input -> word idx in context + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append(tokenizer.bos_token) + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append(tokenizer.sep_token) + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) + token_is_max_context[len(tokens)] = is_max_context + 
tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append(tokenizer.eos_token) + segment_ids.append(1) + + input_ids = tokenizer.tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. + # Only real tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(tokenizer.pad_id) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + # calculate start and end position in final array + # of tokens in answer if no answer, + # 0 for both pointing to tokenizer.cls_token + start_position = None + end_position = None + if has_groundtruth and not example.is_impossible: + doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + start_position = 0 + end_position = 0 + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + if has_groundtruth and example.is_impossible: + # if our document chunk does not contain + # an annotation we throw it out, since there is nothing + # to predict. 
+ start_position = 0 + end_position = 0 + + if example_index < 1: + logging.info("*** Example ***") + logging.info("unique_id: %s" % (unique_id)) + logging.info("example_index: %s" % (example_index)) + logging.info("doc_span_index: %s" % (doc_span_index)) + logging.info("tokens: %s" % " ".join(tokens)) + logging.info( + "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]) + ) + logging.info( + "token_is_max_context: %s" + % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]) + ) + logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + if has_groundtruth and example.is_impossible: + logging.info("impossible example") + if has_groundtruth and not example.is_impossible: + answer_text = " ".join(tokens[start_position : (end_position + 1)]) + logging.info("start_position: %d" % (start_position)) + logging.info("end_position: %d" % (end_position)) + logging.info("answer: %s" % (answer_text)) + + features.append( + InputFeatures( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + start_position=start_position, + end_position=end_position, + is_impossible=example.is_impossible, + ) + ) + unique_id += 1 + + return features + + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): + """Returns tokenized answer spans that + better match the annotated answer.""" + tok_answer_text = " ".join(tokenizer.text_to_tokens(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) + 
if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token. + + Because of the sliding window approach taken to scoring documents, + a single token can appear in multiple documents. + + Example: + Doc: the man went to the store and bought a gallon of milk + Span A: the man went to the + Span B: to the store and bought + Span C: and bought a gallon of + ... + + Now the word 'bought' will have two scores from spans B and C. We only + want to consider the score with "maximum context", which we define as + the *minimum* of its left and right context (the *sum* of left and + right context will always be the same, of course). + + In the example the maximum context for 'bought' would be span C since + it has 1 left context and 3 right context, while span B has 4 left context + and 0 right context. + + Code adapted from the code by the Google AI and HuggingFace. + """ + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index \ No newline at end of file