
Commit

import fixed
Signed-off-by: Evelina Bakhturina <ebakhturina@nvidia.com>
ekmb committed Feb 14, 2020
1 parent 4428f37 commit 8b1d72d
Showing 6 changed files with 91 additions and 76 deletions.
@@ -3,6 +3,13 @@

from nemo.collections.nlp.data.datasets.datasets_utils.preprocessing import partition_data, write_files

+__all__ = [
+    'get_intent_query_files_dialogflow',
+    'get_intents_slots_dialogflow',
+    'get_slots_dialogflow',
+    'process_dialogflow',
+]


def get_intent_query_files_dialogflow(path):
fileslist = []
@@ -10,6 +10,8 @@
)
from nemo.collections.nlp.utils import if_exist

+__all__ = ['process_mturk', 'process_intent_slot_mturk', 'get_intents_mturk', 'get_slot_labels']


def process_mturk(data_dir, uncased, modes=['train', 'test'], dev_split=0.1):
if not os.path.exists(data_dir):
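
The `__all__` lists added above make each module's public interface explicit, so wildcard imports pull in only the intended names rather than every helper and transitively imported module. A minimal, self-contained sketch of that behaviour (the in-memory `demo_utils` module below is a hypothetical stand-in, not part of NeMo):

# Hypothetical illustration of what the added __all__ lists control.
import sys
import types

demo = types.ModuleType('demo_utils')
exec(
    "import os\n"
    "__all__ = ['process_dialogflow']\n"
    "def process_dialogflow(path):\n"
    "    return f'processed {path}'\n"
    "def internal_helper():\n"
    "    return 'not exported'\n",
    demo.__dict__,
)
sys.modules['demo_utils'] = demo  # register so it can be imported by name

ns = {}
exec('from demo_utils import *', ns)

print('process_dialogflow' in ns)  # True: listed in __all__
print('internal_helper' in ns)     # False: hidden from the wildcard import
print('os' in ns)                  # False: transitive imports are not re-exported
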
@@ -63,7 +63,7 @@ def __init__(self, data_dir, do_lower_case=False, dataset_name='default', none_s
if dataset_name == 'atis':
self.data_dir = process_atis(data_dir, do_lower_case)
elif dataset_name == 'snips-atis':
-self.data_dir, self.pad_label = merge(
+self.data_dir, self.pad_label = self.merge(
data_dir, ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'], dataset_name
)
elif dataset_name == 'dialogflow':
@@ -156,74 +156,73 @@ def __init__(self, data_dir, do_lower_case=False, dataset_name='default', none_s
raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.')
self.pad_label = slots[none_slot_label]


-def merge(data_dir, subdirs, dataset_name, modes=['train', 'test']):
-    outfold = f'{data_dir}/{dataset_name}'
-    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
-        logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold))
-        slots = get_vocab(f'{outfold}/dict.slots.csv')
-        none_slot = 0
-        for key in slots:
-            if slots[key] == 'O':
-                none_slot = key
-                break
-        return outfold, int(none_slot)
-
-    os.makedirs(outfold, exist_ok=True)
-
-    data_files, slot_files = {}, {}
-    for mode in modes:
-        data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w')
-        data_files[mode].write('sentence\tlabel\n')
-        slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w')
-
-    intents, slots = {}, {}
-    intent_shift, slot_shift = 0, 0
-    none_intent, none_slot = -1, -1
-
-    for subdir in subdirs:
-        curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv')
-        curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv')
-
-        for key in curr_intents:
-            if intent_shift > 0 and curr_intents[key] == 'O':
-                continue
-            if curr_intents[key] == 'O' and intent_shift == 0:
-                none_intent = int(key)
-            intents[int(key) + intent_shift] = curr_intents[key]
-
-        for key in curr_slots:
-            if slot_shift > 0 and curr_slots[key] == 'O':
-                continue
-            if slot_shift == 0 and curr_slots[key] == 'O':
-                none_slot = int(key)
-            slots[int(key) + slot_shift] = curr_slots[key]
-
-        for mode in modes:
-            with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f:
-                for line in f.readlines()[1:]:
-                    text, label = line.strip().split('\t')
-                    label = int(label)
-                    if curr_intents[label] == 'O':
-                        label = none_intent
-                    else:
-                        label = label + intent_shift
-                    data_files[mode].write(f'{text}\t{label}\n')
-
-            with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f:
-                for line in f.readlines():
-                    labels = [int(label) for label in line.strip().split()]
-                    shifted_labels = []
-                    for label in labels:
-                        if curr_slots[label] == 'O':
-                            shifted_labels.append(none_slot)
-                        else:
-                            shifted_labels.append(label + slot_shift)
-                    slot_files[mode].write(list2str(shifted_labels) + '\n')
-
-        intent_shift += len(curr_intents)
-        slot_shift += len(curr_slots)
-
-    write_vocab_in_order(intents, f'{outfold}/dict.intents.csv')
-    write_vocab_in_order(slots, f'{outfold}/dict.slots.csv')
-    return outfold, none_slot
+    def merge(self, data_dir, subdirs, dataset_name, modes=['train', 'test']):
+        outfold = f'{data_dir}/{dataset_name}'
+        if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
+            logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold))
+            slots = get_vocab(f'{outfold}/dict.slots.csv')
+            none_slot = 0
+            for key in slots:
+                if slots[key] == 'O':
+                    none_slot = key
+                    break
+            return outfold, int(none_slot)
+
+        os.makedirs(outfold, exist_ok=True)
+
+        data_files, slot_files = {}, {}
+        for mode in modes:
+            data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w')
+            data_files[mode].write('sentence\tlabel\n')
+            slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w')
+
+        intents, slots = {}, {}
+        intent_shift, slot_shift = 0, 0
+        none_intent, none_slot = -1, -1
+
+        for subdir in subdirs:
+            curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv')
+            curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv')
+
+            for key in curr_intents:
+                if intent_shift > 0 and curr_intents[key] == 'O':
+                    continue
+                if curr_intents[key] == 'O' and intent_shift == 0:
+                    none_intent = int(key)
+                intents[int(key) + intent_shift] = curr_intents[key]
+
+            for key in curr_slots:
+                if slot_shift > 0 and curr_slots[key] == 'O':
+                    continue
+                if slot_shift == 0 and curr_slots[key] == 'O':
+                    none_slot = int(key)
+                slots[int(key) + slot_shift] = curr_slots[key]
+
+            for mode in modes:
+                with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f:
+                    for line in f.readlines()[1:]:
+                        text, label = line.strip().split('\t')
+                        label = int(label)
+                        if curr_intents[label] == 'O':
+                            label = none_intent
+                        else:
+                            label = label + intent_shift
+                        data_files[mode].write(f'{text}\t{label}\n')
+
+                with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f:
+                    for line in f.readlines():
+                        labels = [int(label) for label in line.strip().split()]
+                        shifted_labels = []
+                        for label in labels:
+                            if curr_slots[label] == 'O':
+                                shifted_labels.append(none_slot)
+                            else:
+                                shifted_labels.append(label + slot_shift)
+                        slot_files[mode].write(list2str(shifted_labels) + '\n')
+
+            intent_shift += len(curr_intents)
+            slot_shift += len(curr_slots)
+
+        write_vocab_in_order(intents, f'{outfold}/dict.intents.csv')
+        write_vocab_in_order(slots, f'{outfold}/dict.slots.csv')
+        return outfold, none_slot
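
The change above follows the pattern this commit applies across the remaining files: `merge` is now a method, so the call site in `__init__` must go through `self.merge(...)` instead of an unqualified `merge(...)` that no longer resolves. A minimal sketch of that pattern (the class name, paths, and merge body are stand-ins, not NeMo's actual code):

# Hypothetical sketch of the function-to-method call fix in this commit.
class DataDesc:
    """Stand-in for the dataset descriptor class edited above."""

    def __init__(self, data_dir, dataset_name):
        if dataset_name == 'snips-atis':
            # An unqualified merge(...) would raise NameError here, since the
            # helper no longer exists at module level; self.merge(...) works.
            self.data_dir, self.pad_label = self.merge(data_dir, dataset_name)

    def merge(self, data_dir, dataset_name):
        # Trivial stand-in for the real merge logic shown above.
        outfold = f'{data_dir}/{dataset_name}'
        pad_label = 0
        return outfold, pad_label


desc = DataDesc('/tmp/nlp_data', 'snips-atis')
print(desc.data_dir, desc.pad_label)  # /tmp/nlp_data/snips-atis 0
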
9 changes: 7 additions & 2 deletions nemo/collections/nlp/data/datasets/lm_bert_dataset.py
@@ -382,7 +382,7 @@ def __init__(self, dataset_name, data_dir, vocab_size, sample_size, special_toke
if dataset_name == 'wikitext-2':
if not os.path.exists(data_dir):
data_dir = download_wkt2(data_dir)
-self.data_dir, self.tokenizer_model = create_vocab_mlm(
+self.data_dir, self.tokenizer_model = self.create_vocab_mlm(
data_dir, vocab_size, sample_size, special_tokens, train_file
)
else:
@@ -397,7 +397,12 @@ def __init__(self, dataset_name, data_dir, vocab_size, sample_size, special_toke
self.test_file = f'{data_dir}/test.txt'

def create_vocab_mlm(
-    data_dir, vocab_size, sample_size, special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], train_file=''
+    self,
+    data_dir,
+    vocab_size,
+    sample_size,
+    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
+    train_file='',
):
vocab = special_tokens[:]
bert_dir = f'{data_dir}/bert'
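
The hunk above shows only the start of `create_vocab_mlm`: it seeds `vocab` from `special_tokens`, works under `{data_dir}/bert`, and, per the call site, also returns a tokenizer model. As a rough, assumed sketch of the seeding-plus-frequency idea only, not NeMo's actual implementation:

# Assumed illustration: seed a vocab with the special tokens, then fill it
# with the most frequent corpus tokens up to vocab_size and write vocab.txt.
import collections
import os


def build_vocab_sketch(data_dir, vocab_size, train_file,
                       special_tokens=('[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]')):
    vocab = list(special_tokens)  # mirrors `vocab = special_tokens[:]` above
    counts = collections.Counter()
    with open(train_file, 'r', encoding='utf-8') as f:
        for line in f:
            counts.update(line.split())
    for token, _ in counts.most_common():
        if len(vocab) >= vocab_size:
            break
        if token not in special_tokens:
            vocab.append(token)
    bert_dir = f'{data_dir}/bert'
    os.makedirs(bert_dir, exist_ok=True)
    with open(f'{bert_dir}/vocab.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(vocab) + '\n')
    return bert_dir
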
4 changes: 2 additions & 2 deletions nemo/collections/nlp/data/datasets/lm_transformer_dataset.py
@@ -54,7 +54,7 @@ def __init__(self, dataset_name, data_dir, do_lower_case):
if dataset_name == 'wikitext-2':
if not os.path.exists(data_dir):
data_dir = download_wkt2(data_dir)
-self.vocab_size = create_vocab_lm(data_dir, do_lower_case)
+self.vocab_size = self.create_vocab_lm(data_dir, do_lower_case)
self.data_dir = data_dir
else:
logging.warning(
@@ -63,7 +63,7 @@ def __init__(self, dataset_name, data_dir, do_lower_case):
"you build the preprocessing method for it."
)

-def create_vocab_lm(data_dir, do_lower_case):
+def create_vocab_lm(self, data_dir, do_lower_case):
if if_exist(data_dir, ['train.txt', 'vocab.txt']):
logging.info("Vocabulary has been created.")
with open(os.path.join(data_dir, 'vocab.txt'), 'r') as f:
@@ -36,7 +36,7 @@ def __init__(self, tokenizer_src, tokenizer_tgt, dataset_src, dataset_tgt, token
src_ids = dataset_to_ids(dataset_src, tokenizer_src)
tgt_ids = dataset_to_ids(dataset_tgt, tokenizer_tgt)
if clean:
-src_ids, tgt_ids = clean_src_and_target(src_ids, tgt_ids)
+src_ids, tgt_ids = self.clean_src_and_target(src_ids, tgt_ids)
self.batch_indices = self.pack_data_into_batches(src_ids, tgt_ids)
self.batches = self.pad_batches(src_ids, tgt_ids, self.batch_indices)

@@ -156,7 +156,9 @@ def pack_data_into_batches(self, src_ids, tgt_ids):

return batches

-def clean_src_and_target(src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5):
+def clean_src_and_target(
+    self, src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5
+):
"""
Cleans source and target sentences to get rid of noisy data.
Specifically, a pair of sentences is removed if
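
The docstring's list of removal criteria is cut off in this view. As a hedged, standalone sketch of a typical length/ratio filter built from the thresholds named in the signature (assumed behaviour, not the exact NeMo logic):

# Assumed sketch of sentence-pair cleaning with the thresholds shown above.
def clean_pairs_sketch(src_ids, tgt_ids, max_tokens=128, min_tokens=3,
                       max_tokens_diff=25, max_tokens_ratio=2.5):
    src_out, tgt_out = [], []
    for src, tgt in zip(src_ids, tgt_ids):
        n_src, n_tgt = len(src), len(tgt)
        if not (min_tokens <= n_src <= max_tokens):   # source too short or too long
            continue
        if not (min_tokens <= n_tgt <= max_tokens):   # target too short or too long
            continue
        if abs(n_src - n_tgt) > max_tokens_diff:      # lengths differ too much
            continue
        if max(n_src, n_tgt) / min(n_src, n_tgt) > max_tokens_ratio:
            continue                                  # lengths too disproportionate
        src_out.append(src)
        tgt_out.append(tgt)
    return src_out, tgt_out


src = [[1, 2, 3, 4], [5] * 200, [6, 7, 8]]
tgt = [[1, 2, 3], [5] * 10, [6] * 60]
print(clean_pairs_sketch(src, tgt))  # keeps only the first pair
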
