
Commit

import fixed
Signed-off-by: Evelina Bakhturina <ebakhturina@nvidia.com>
ekmb committed Feb 14, 2020
1 parent 4428f37 commit 8b1d72d
Showing 6 changed files with 91 additions and 76 deletions.
@@ -3,6 +3,13 @@

from nemo.collections.nlp.data.datasets.datasets_utils.preprocessing import partition_data, write_files

+__all__ = [
+    'get_intent_query_files_dialogflow',
+    'get_intents_slots_dialogflow',
+    'get_slots_dialogflow',
+    'process_dialogflow',
+]


def get_intent_query_files_dialogflow(path):
fileslist = []
@@ -10,6 +10,8 @@
)
from nemo.collections.nlp.utils import if_exist

+__all__ = ['process_mturk', 'process_intent_slot_mturk', 'get_intents_mturk', 'get_slot_labels']


def process_mturk(data_dir, uncased, modes=['train', 'test'], dev_split=0.1):
if not os.path.exists(data_dir):
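
The `__all__` lists added above make each module's public interface explicit, so wildcard imports pull in only the intended names rather than every helper and transitively imported module. A minimal, self-contained sketch of that behaviour (the in-memory `demo_utils` module below is a hypothetical stand-in, not part of NeMo):

# Hypothetical illustration of what the added __all__ lists control.
import sys
import types

demo = types.ModuleType('demo_utils')
exec(
    "import os\n"
    "__all__ = ['process_dialogflow']\n"
    "def process_dialogflow(path):\n"
    "    return f'processed {path}'\n"
    "def internal_helper():\n"
    "    return 'not exported'\n",
    demo.__dict__,
)
sys.modules['demo_utils'] = demo  # register so it can be imported by name

ns = {}
exec('from demo_utils import *', ns)

print('process_dialogflow' in ns)  # True: listed in __all__
print('internal_helper' in ns)     # False: hidden from the wildcard import
print('os' in ns)                  # False: transitive imports are not re-exported
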
@@ -63,7 +63,7 @@ def __init__(self, data_dir, do_lower_case=False, dataset_name='default', none_s
if dataset_name == 'atis':
self.data_dir = process_atis(data_dir, do_lower_case)
elif dataset_name == 'snips-atis':
-self.data_dir, self.pad_label = merge(
+self.data_dir, self.pad_label = self.merge(
data_dir, ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'], dataset_name
)
elif dataset_name == 'dialogflow':
@@ -156,74 +156,73 @@ def __init__(self, data_dir, do_lower_case=False, dataset_name='default', none_s
raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.')
self.pad_label = slots[none_slot_label]


-def merge(data_dir, subdirs, dataset_name, modes=['train', 'test']):
-    outfold = f'{data_dir}/{dataset_name}'
-    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
-        logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold))
-        slots = get_vocab(f'{outfold}/dict.slots.csv')
-        none_slot = 0
-        for key in slots:
-            if slots[key] == 'O':
-                none_slot = key
-                break
-        return outfold, int(none_slot)
-
-    os.makedirs(outfold, exist_ok=True)
-
-    data_files, slot_files = {}, {}
-    for mode in modes:
-        data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w')
-        data_files[mode].write('sentence\tlabel\n')
-        slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w')
-
-    intents, slots = {}, {}
-    intent_shift, slot_shift = 0, 0
-    none_intent, none_slot = -1, -1
-
-    for subdir in subdirs:
-        curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv')
-        curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv')
-
-        for key in curr_intents:
-            if intent_shift > 0 and curr_intents[key] == 'O':
-                continue
-            if curr_intents[key] == 'O' and intent_shift == 0:
-                none_intent = int(key)
-            intents[int(key) + intent_shift] = curr_intents[key]
-
-        for key in curr_slots:
-            if slot_shift > 0 and curr_slots[key] == 'O':
-                continue
-            if slot_shift == 0 and curr_slots[key] == 'O':
-                none_slot = int(key)
-            slots[int(key) + slot_shift] = curr_slots[key]
-
-        for mode in modes:
-            with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f:
-                for line in f.readlines()[1:]:
-                    text, label = line.strip().split('\t')
-                    label = int(label)
-                    if curr_intents[label] == 'O':
-                        label = none_intent
-                    else:
-                        label = label + intent_shift
-                    data_files[mode].write(f'{text}\t{label}\n')
-
-            with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f:
-                for line in f.readlines():
-                    labels = [int(label) for label in line.strip().split()]
-                    shifted_labels = []
-                    for label in labels:
-                        if curr_slots[label] == 'O':
-                            shifted_labels.append(none_slot)
-                        else:
-                            shifted_labels.append(label + slot_shift)
-                    slot_files[mode].write(list2str(shifted_labels) + '\n')
-
-        intent_shift += len(curr_intents)
-        slot_shift += len(curr_slots)
-
-    write_vocab_in_order(intents, f'{outfold}/dict.intents.csv')
-    write_vocab_in_order(slots, f'{outfold}/dict.slots.csv')
-    return outfold, none_slot
+    def merge(self, data_dir, subdirs, dataset_name, modes=['train', 'test']):
+        outfold = f'{data_dir}/{dataset_name}'
+        if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
+            logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold))
+            slots = get_vocab(f'{outfold}/dict.slots.csv')
+            none_slot = 0
+            for key in slots:
+                if slots[key] == 'O':
+                    none_slot = key
+                    break
+            return outfold, int(none_slot)
+
+        os.makedirs(outfold, exist_ok=True)
+
+        data_files, slot_files = {}, {}
+        for mode in modes:
+            data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w')
+            data_files[mode].write('sentence\tlabel\n')
+            slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w')
+
+        intents, slots = {}, {}
+        intent_shift, slot_shift = 0, 0
+        none_intent, none_slot = -1, -1
+
+        for subdir in subdirs:
+            curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv')
+            curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv')
+
+            for key in curr_intents:
+                if intent_shift > 0 and curr_intents[key] == 'O':
+                    continue
+                if curr_intents[key] == 'O' and intent_shift == 0:
+                    none_intent = int(key)
+                intents[int(key) + intent_shift] = curr_intents[key]
+
+            for key in curr_slots:
+                if slot_shift > 0 and curr_slots[key] == 'O':
+                    continue
+                if slot_shift == 0 and curr_slots[key] == 'O':
+                    none_slot = int(key)
+                slots[int(key) + slot_shift] = curr_slots[key]
+
+            for mode in modes:
+                with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f:
+                    for line in f.readlines()[1:]:
+                        text, label = line.strip().split('\t')
+                        label = int(label)
+                        if curr_intents[label] == 'O':
+                            label = none_intent
+                        else:
+                            label = label + intent_shift
+                        data_files[mode].write(f'{text}\t{label}\n')
+
+                with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f:
+                    for line in f.readlines():
+                        labels = [int(label) for label in line.strip().split()]
+                        shifted_labels = []
+                        for label in labels:
+                            if curr_slots[label] == 'O':
+                                shifted_labels.append(none_slot)
+                            else:
+                                shifted_labels.append(label + slot_shift)
+                        slot_files[mode].write(list2str(shifted_labels) + '\n')
+
+            intent_shift += len(curr_intents)
+            slot_shift += len(curr_slots)
+
+        write_vocab_in_order(intents, f'{outfold}/dict.intents.csv')
+        write_vocab_in_order(slots, f'{outfold}/dict.slots.csv')
+        return outfold, none_slot
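
The change above follows the pattern this commit applies across the remaining files: `merge` is now a method, so the call site in `__init__` must go through `self.merge(...)` instead of an unqualified `merge(...)` that no longer resolves. A minimal sketch of that pattern (the class name, paths, and merge body are stand-ins, not NeMo's actual code):

# Hypothetical sketch of the function-to-method call fix in this commit.
class DataDesc:
    """Stand-in for the dataset descriptor class edited above."""

    def __init__(self, data_dir, dataset_name):
        if dataset_name == 'snips-atis':
            # An unqualified merge(...) would raise NameError here, since the
            # helper no longer exists at module level; self.merge(...) works.
            self.data_dir, self.pad_label = self.merge(data_dir, dataset_name)

    def merge(self, data_dir, dataset_name):
        # Trivial stand-in for the real merge logic shown above.
        outfold = f'{data_dir}/{dataset_name}'
        pad_label = 0
        return outfold, pad_label


desc = DataDesc('/tmp/nlp_data', 'snips-atis')
print(desc.data_dir, desc.pad_label)  # /tmp/nlp_data/snips-atis 0
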
9 changes: 7 additions & 2 deletions nemo/collections/nlp/data/datasets/lm_bert_dataset.py
@@ -382,7 +382,7 @@ def __init__(self, dataset_name, data_dir, vocab_size, sample_size, special_toke
if dataset_name == 'wikitext-2':
if not os.path.exists(data_dir):
data_dir = download_wkt2(data_dir)
-self.data_dir, self.tokenizer_model = create_vocab_mlm(
+self.data_dir, self.tokenizer_model = self.create_vocab_mlm(
data_dir, vocab_size, sample_size, special_tokens, train_file
)
else:
@@ -397,7 +397,12 @@ def __init__(self, dataset_name, data_dir, vocab_size, sample_size, special_toke
self.test_file = f'{data_dir}/test.txt'

def create_vocab_mlm(
-    data_dir, vocab_size, sample_size, special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], train_file=''
+    self,
+    data_dir,
+    vocab_size,
+    sample_size,
+    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
+    train_file='',
):
vocab = special_tokens[:]
bert_dir = f'{data_dir}/bert'
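
The hunk above shows only the start of `create_vocab_mlm`: it seeds `vocab` from `special_tokens`, works under `{data_dir}/bert`, and, per the call site, also returns a tokenizer model. As a rough, assumed sketch of the seeding-plus-frequency idea only, not NeMo's actual implementation:

# Assumed illustration: seed a vocab with the special tokens, then fill it
# with the most frequent corpus tokens up to vocab_size and write vocab.txt.
import collections
import os


def build_vocab_sketch(data_dir, vocab_size, train_file,
                       special_tokens=('[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]')):
    vocab = list(special_tokens)  # mirrors `vocab = special_tokens[:]` above
    counts = collections.Counter()
    with open(train_file, 'r', encoding='utf-8') as f:
        for line in f:
            counts.update(line.split())
    for token, _ in counts.most_common():
        if len(vocab) >= vocab_size:
            break
        if token not in special_tokens:
            vocab.append(token)
    bert_dir = f'{data_dir}/bert'
    os.makedirs(bert_dir, exist_ok=True)
    with open(f'{bert_dir}/vocab.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(vocab) + '\n')
    return bert_dir
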
4 changes: 2 additions & 2 deletions nemo/collections/nlp/data/datasets/lm_transformer_dataset.py
@@ -54,7 +54,7 @@ def __init__(self, dataset_name, data_dir, do_lower_case):
if dataset_name == 'wikitext-2':
if not os.path.exists(data_dir):
data_dir = download_wkt2(data_dir)
-self.vocab_size = create_vocab_lm(data_dir, do_lower_case)
+self.vocab_size = self.create_vocab_lm(data_dir, do_lower_case)
self.data_dir = data_dir
else:
logging.warning(
@@ -63,7 +63,7 @@ def __init__(self, dataset_name, data_dir, do_lower_case):
"you build the preprocessing method for it."
)

-def create_vocab_lm(data_dir, do_lower_case):
+def create_vocab_lm(self, data_dir, do_lower_case):
if if_exist(data_dir, ['train.txt', 'vocab.txt']):
logging.info("Vocabulary has been created.")
with open(os.path.join(data_dir, 'vocab.txt'), 'r') as f:
@@ -36,7 +36,7 @@ def __init__(self, tokenizer_src, tokenizer_tgt, dataset_src, dataset_tgt, token
src_ids = dataset_to_ids(dataset_src, tokenizer_src)
tgt_ids = dataset_to_ids(dataset_tgt, tokenizer_tgt)
if clean:
-src_ids, tgt_ids = clean_src_and_target(src_ids, tgt_ids)
+src_ids, tgt_ids = self.clean_src_and_target(src_ids, tgt_ids)
self.batch_indices = self.pack_data_into_batches(src_ids, tgt_ids)
self.batches = self.pad_batches(src_ids, tgt_ids, self.batch_indices)

@@ -156,7 +156,9 @@ def pack_data_into_batches(self, src_ids, tgt_ids):

return batches

-def clean_src_and_target(src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5):
+def clean_src_and_target(
+    self, src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5
+):
"""
Cleans source and target sentences to get rid of noisy data.
Specifically, a pair of sentences is removed if
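
The docstring's list of removal criteria is cut off in this view. As a hedged, standalone sketch of a typical length/ratio filter built from the thresholds named in the signature (assumed behaviour, not the exact NeMo logic):

# Assumed sketch of sentence-pair cleaning with the thresholds shown above.
def clean_pairs_sketch(src_ids, tgt_ids, max_tokens=128, min_tokens=3,
                       max_tokens_diff=25, max_tokens_ratio=2.5):
    src_out, tgt_out = [], []
    for src, tgt in zip(src_ids, tgt_ids):
        n_src, n_tgt = len(src), len(tgt)
        if not (min_tokens <= n_src <= max_tokens):   # source too short or too long
            continue
        if not (min_tokens <= n_tgt <= max_tokens):   # target too short or too long
            continue
        if abs(n_src - n_tgt) > max_tokens_diff:      # lengths differ too much
            continue
        if max(n_src, n_tgt) / min(n_src, n_tgt) > max_tokens_ratio:
            continue                                  # lengths too disproportionate
        src_out.append(src)
        tgt_out.append(tgt)
    return src_out, tgt_out


src = [[1, 2, 3, 4], [5] * 200, [6, 7, 8]]
tgt = [[1, 2, 3], [5] * 10, [6] * 60]
print(clean_pairs_sketch(src, tgt))  # keeps only the first pair
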
