refactor datasets
Signed-off-by: Evelina Bakhturina <ebakhturina@nvidia.com>
ekmb committed Feb 13, 2020
1 parent 8be0691 commit ce70f26
Showing 19 changed files with 1,473 additions and 1,484 deletions.
2 changes: 1 addition & 1 deletion examples/nlp/glue_benchmark/glue_benchmark_with_bert.py
@@ -70,7 +70,7 @@
from nemo.backends.pytorch.common import CrossEntropyLoss, MSELoss
from nemo.collections.nlp.callbacks.glue_benchmark_callback import eval_epochs_done_callback, eval_iter_callback
from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer
-from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import output_modes, processors
+from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset import output_modes, processors
from nemo.collections.nlp.nm.data_layers import GlueClassificationDataLayer, GlueRegressionDataLayer
from nemo.collections.nlp.nm.trainables import SequenceClassifier, SequenceRegression
from nemo.utils.lr_policies import get_lr_policy
@@ -23,7 +23,7 @@

import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm
from nemo import logging
-from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc
+from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc

# Parsing arguments
parser = argparse.ArgumentParser(description='Joint-intent BERT')
@@ -21,7 +21,7 @@

import nemo.collections.nlp as nemo_nlp
import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm
-from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc
+from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc
from nemo.collections.nlp.utils.common_nlp_utils import read_intent_slot_outputs

# Parsing arguments
@@ -26,7 +26,7 @@
import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm
from nemo import logging
from nemo.collections.nlp.callbacks.joint_intent_slot_callback import eval_epochs_done_callback, eval_iter_callback
-from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc
+from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc
from nemo.utils.lr_policies import get_lr_policy

# Parsing arguments
6 changes: 3 additions & 3 deletions nemo/collections/nlp/data/datasets/__init__.py
@@ -15,8 +15,8 @@
# =============================================================================

from nemo.collections.nlp.data.datasets.datasets_utils import *
-from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import GLUEDataset
-from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import (
+from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset import GLUEDataset
+from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.joint_intent_slot_dataset import (
    BertJointIntentSlotDataset,
    BertJointIntentSlotInferDataset,
)
@@ -31,7 +31,7 @@
    BertPunctuationCapitalizationDataset,
    BertPunctuationCapitalizationInferDataset,
)
-from nemo.collections.nlp.data.datasets.qa_squad_dataset import SquadDataset
+from nemo.collections.nlp.data.datasets.qa_squad_dataset.qa_squad_dataset import SquadDataset
from nemo.collections.nlp.data.datasets.text_classification_dataset import BertTextClassificationDataset
from nemo.collections.nlp.data.datasets.token_classification_dataset import (
    BertTokenClassificationDataset,
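The hunk above shows the pattern behind this refactor: each dataset module moves into its own package, while the package-level __init__.py re-exports the public classes. A minimal sketch of the consequence, assuming NeMo is installed at this commit:

```python
# Sketch only: imports written against the package root keep resolving,
# because __init__.py re-exports each class from its new submodule.
from nemo.collections.nlp.data.datasets import GLUEDataset, SquadDataset

# The public name is unchanged; only the defining module moved.
print(GLUEDataset.__module__)
# nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset
```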
@@ -1,6 +1,7 @@
import glob
import json
import os
+import pickle
import shutil

from nemo import logging
@@ -379,3 +380,37 @@ def process_nlu(filename, uncased, modes=['train', 'test'], dataset_name='nlu-ub
    for mode in modes:
        outfiles[mode].close()
    return outfold


+def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True):
+    """
+    Reads dataset from file line by line, tokenizes each line with tokenizer,
+    and returns list of lists which corresponds to ids of tokenized strings.
+    Args:
+        dataset: path to dataset
+        tokenizer: tokenizer to convert text into ids
+        cache_ids: if True, ids are saved to disk as pickle file
+            with similar name (e.g., data.txt --> data.txt.pkl)
+        add_bos_eos: bool, whether to add <s> and </s> symbols (e.g., for NMT)
+    Returns:
+        ids: list of ids which correspond to tokenized strings of the dataset
+    """
+
+    cached_ids_dataset = dataset + str(".pkl")
+    if os.path.isfile(cached_ids_dataset):
+        logging.info("Loading cached tokenized dataset ...")
+        ids = pickle.load(open(cached_ids_dataset, "rb"))
+    else:
+        logging.info("Tokenizing dataset ...")
+        data = open(dataset, "rb").readlines()
+        ids = []
+        for sentence in data:
+            sent_ids = tokenizer.text_to_ids(sentence.decode("utf-8"))
+            if add_bos_eos:
+                sent_ids = [tokenizer.bos_id] + sent_ids + [tokenizer.eos_id]
+            ids.append(sent_ids)
+        if cache_ids:
+            logging.info("Caching tokenized dataset ...")
+            pickle.dump(ids, open(cached_ids_dataset, "wb"))
+    return ids
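For orientation, here is a minimal usage sketch for the new dataset_to_ids helper. The WhitespaceTokenizer below is a hypothetical stand-in, not part of this commit, and the import path is an assumption based on this diff; the helper only requires a tokenizer exposing text_to_ids, bos_id, and eos_id.

```python
# Hypothetical usage sketch (not part of this commit).
# Import path assumed; the hunk above appears to live in datasets_utils.
from nemo.collections.nlp.data.datasets.datasets_utils import dataset_to_ids


class WhitespaceTokenizer:
    """Toy tokenizer with the minimal interface dataset_to_ids touches."""

    def __init__(self):
        self.vocab = {"<s>": 0, "</s>": 1}

    @property
    def bos_id(self):
        return self.vocab["<s>"]

    @property
    def eos_id(self):
        return self.vocab["</s>"]

    def text_to_ids(self, text):
        # Assign a fresh id to each previously unseen whitespace token.
        return [self.vocab.setdefault(tok, len(self.vocab)) for tok in text.split()]


with open("data.txt", "w", encoding="utf-8") as f:
    f.write("hello world\nhello again\n")

# First call tokenizes; with cache_ids=True it also writes data.txt.pkl,
# so later calls load the pickle instead of re-tokenizing.
ids = dataset_to_ids("data.txt", WhitespaceTokenizer(), cache_ids=True)
print(ids)  # [[0, 2, 3, 1], [0, 2, 4, 1]]
```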
@@ -43,6 +43,7 @@
    'get_intent_labels',
    'normalize_answer',
    'get_tokens',
+    'get_stats'
]

DATABASE_EXISTS_TMP = '{} dataset has already been processed and stored at {}'
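Since the datasets package __init__.py (shown earlier) pulls this module in with a wildcard import, adding 'get_stats' to __all__ is what actually publishes the new name. A hedged sketch, with the module path assumed as above:

```python
# __all__ controls the wildcard-import surface of the module (path assumed).
from nemo.collections.nlp.data.datasets.datasets_utils import *

print(get_stats)  # resolvable via * only because 'get_stats' is in __all__
```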
