refactor datasets
Signed-off-by: Evelina Bakhturina <ebakhturina@nvidia.com>
ekmb committed Feb 13, 2020
1 parent 8be0691 commit ce70f26
Showing 19 changed files with 1,473 additions and 1,484 deletions.
2 changes: 1 addition & 1 deletion examples/nlp/glue_benchmark/glue_benchmark_with_bert.py
@@ -70,7 +70,7 @@
from nemo.backends.pytorch.common import CrossEntropyLoss, MSELoss
from nemo.collections.nlp.callbacks.glue_benchmark_callback import eval_epochs_done_callback, eval_iter_callback
from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer
-from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import output_modes, processors
+from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset import output_modes, processors
from nemo.collections.nlp.nm.data_layers import GlueClassificationDataLayer, GlueRegressionDataLayer
from nemo.collections.nlp.nm.trainables import SequenceClassifier, SequenceRegression
from nemo.utils.lr_policies import get_lr_policy
@@ -23,7 +23,7 @@

import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm
from nemo import logging
-from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc
+from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc

# Parsing arguments
parser = argparse.ArgumentParser(description='Joint-intent BERT')
@@ -21,7 +21,7 @@

import nemo.collections.nlp as nemo_nlp
import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm
-from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc
+from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc
from nemo.collections.nlp.utils.common_nlp_utils import read_intent_slot_outputs

# Parsing arguments
@@ -26,7 +26,7 @@
import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm
from nemo import logging
from nemo.collections.nlp.callbacks.joint_intent_slot_callback import eval_epochs_done_callback, eval_iter_callback
-from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc
+from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc
from nemo.utils.lr_policies import get_lr_policy

# Parsing arguments
6 changes: 3 additions & 3 deletions nemo/collections/nlp/data/datasets/__init__.py
@@ -15,8 +15,8 @@
# =============================================================================

from nemo.collections.nlp.data.datasets.datasets_utils import *
-from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import GLUEDataset
-from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import (
+from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset import GLUEDataset
+from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.joint_intent_slot_dataset import (
    BertJointIntentSlotDataset,
    BertJointIntentSlotInferDataset,
)
@@ -31,7 +31,7 @@
    BertPunctuationCapitalizationDataset,
    BertPunctuationCapitalizationInferDataset,
)
-from nemo.collections.nlp.data.datasets.qa_squad_dataset import SquadDataset
+from nemo.collections.nlp.data.datasets.qa_squad_dataset.qa_squad_dataset import SquadDataset
from nemo.collections.nlp.data.datasets.text_classification_dataset import BertTextClassificationDataset
from nemo.collections.nlp.data.datasets.token_classification_dataset import (
    BertTokenClassificationDataset,
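The hunk above shows the pattern behind this refactor: each dataset module moves into its own package, while the package-level __init__.py re-exports the public classes. A minimal sketch of the consequence, assuming NeMo is installed at this commit:

```python
# Sketch only: imports written against the package root keep resolving,
# because __init__.py re-exports each class from its new submodule.
from nemo.collections.nlp.data.datasets import GLUEDataset, SquadDataset

# The public name is unchanged; only the defining module moved.
print(GLUEDataset.__module__)
# nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset
```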
@@ -1,6 +1,7 @@
import glob
import json
import os
+import pickle
import shutil

from nemo import logging
@@ -379,3 +380,37 @@ def process_nlu(filename, uncased, modes=['train', 'test'], dataset_name='nlu-ub
    for mode in modes:
        outfiles[mode].close()
    return outfold


+def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True):
+    """
+    Reads dataset from file line by line, tokenizes each line with tokenizer,
+    and returns list of lists which corresponds to ids of tokenized strings.
+    Args:
+        dataset: path to dataset
+        tokenizer: tokenizer to convert text into ids
+        cache_ids: if True, ids are saved to disk as pickle file
+            with similar name (e.g., data.txt --> data.txt.pkl)
+        add_bos_eos: bool, whether to add <s> and </s> symbols (e.g., for NMT)
+    Returns:
+        ids: list of ids which correspond to tokenized strings of the dataset
+    """
+
+    cached_ids_dataset = dataset + str(".pkl")
+    if os.path.isfile(cached_ids_dataset):
+        logging.info("Loading cached tokenized dataset ...")
+        ids = pickle.load(open(cached_ids_dataset, "rb"))
+    else:
+        logging.info("Tokenizing dataset ...")
+        data = open(dataset, "rb").readlines()
+        ids = []
+        for sentence in data:
+            sent_ids = tokenizer.text_to_ids(sentence.decode("utf-8"))
+            if add_bos_eos:
+                sent_ids = [tokenizer.bos_id] + sent_ids + [tokenizer.eos_id]
+            ids.append(sent_ids)
+        if cache_ids:
+            logging.info("Caching tokenized dataset ...")
+            pickle.dump(ids, open(cached_ids_dataset, "wb"))
+    return ids
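For orientation, here is a minimal usage sketch for the new dataset_to_ids helper. The WhitespaceTokenizer below is a hypothetical stand-in, not part of this commit, and the import path is an assumption based on this diff; the helper only requires a tokenizer exposing text_to_ids, bos_id, and eos_id.

```python
# Hypothetical usage sketch (not part of this commit).
# Import path assumed; the hunk above appears to live in datasets_utils.
from nemo.collections.nlp.data.datasets.datasets_utils import dataset_to_ids


class WhitespaceTokenizer:
    """Toy tokenizer with the minimal interface dataset_to_ids touches."""

    def __init__(self):
        self.vocab = {"<s>": 0, "</s>": 1}

    @property
    def bos_id(self):
        return self.vocab["<s>"]

    @property
    def eos_id(self):
        return self.vocab["</s>"]

    def text_to_ids(self, text):
        # Assign a fresh id to each previously unseen whitespace token.
        return [self.vocab.setdefault(tok, len(self.vocab)) for tok in text.split()]


with open("data.txt", "w", encoding="utf-8") as f:
    f.write("hello world\nhello again\n")

# First call tokenizes; with cache_ids=True it also writes data.txt.pkl,
# so later calls load the pickle instead of re-tokenizing.
ids = dataset_to_ids("data.txt", WhitespaceTokenizer(), cache_ids=True)
print(ids)  # [[0, 2, 3, 1], [0, 2, 4, 1]]
```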
@@ -43,6 +43,7 @@
    'get_intent_labels',
    'normalize_answer',
    'get_tokens',
+    'get_stats'
]

DATABASE_EXISTS_TMP = '{} dataset has already been processed and stored at {}'
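Since the datasets package __init__.py (shown earlier) pulls this module in with a wildcard import, adding 'get_stats' to __all__ is what actually publishes the new name. A hedged sketch, with the module path assumed as above:

```python
# __all__ controls the wildcard-import surface of the module (path assumed).
from nemo.collections.nlp.data.datasets.datasets_utils import *

print(get_stats)  # resolvable via * only because 'get_stats' is in __all__
```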
