From ce70f2667f17e7c188d4a07cb0f5a72671247bb8 Mon Sep 17 00:00:00 2001
From: Evelina Bakhturina <ebakhturina@nvidia.com>
Date: Thu, 13 Feb 2020 14:46:39 -0800
Subject: [PATCH] refactor datasets

Signed-off-by: Evelina Bakhturina <ebakhturina@nvidia.com>
---
 .../glue_benchmark_with_bert.py               |   2 +-
 .../joint_intent_slot_infer.py                |   2 +-
 .../joint_intent_slot_infer_b1.py             |   2 +-
 .../joint_intent_slot_with_bert.py            |   2 +-
 .../collections/nlp/data/datasets/__init__.py |   6 +-
 .../datasets_utils/datasets_processing.py     |  35 ++
 .../datasets/datasets_utils/preprocessing.py  |   1 +
 .../data/datasets/glue_benchmark_dataset.py   | 593 ------------------
 .../glue_benchmark_dataset/__init__.py        |   0
 .../glue_benchmark_dataset/data_processors.py | 302 +++++++++
 .../glue_benchmark_dataset.py                 | 295 +++++++++
 .../datasets/joint_intent_slot_dataset.py     | 481 --------------
 .../data_descriptor.py                        | 217 +++++++
 .../joint_intent_slot_dataset.py              | 262 ++++++++
 .../nlp/data/datasets/lm_bert_dataset.py      |  58 +-
 .../data/datasets/lm_transformer_dataset.py   | 156 +----
 .../datasets/machine_translation_dataset.py   |  64 +-
 .../qa_squad_dataset.py                       | 248 +-------
 .../qa_squad_dataset/qa_squad_processing.py   | 231 +++++++
 19 files changed, 1473 insertions(+), 1484 deletions(-)
 delete mode 100644 nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py
 create mode 100644 nemo/collections/nlp/data/datasets/glue_benchmark_dataset/__init__.py
 create mode 100644 nemo/collections/nlp/data/datasets/glue_benchmark_dataset/data_processors.py
 create mode 100644 nemo/collections/nlp/data/datasets/glue_benchmark_dataset/glue_benchmark_dataset.py
 delete mode 100644 nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py
 create mode 100644 nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/data_descriptor.py
 create mode 100644 nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/joint_intent_slot_dataset.py
 rename nemo/collections/nlp/data/datasets/{ => qa_squad_dataset}/qa_squad_dataset.py (67%)
 create mode 100644 nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_processing.py

diff --git a/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py b/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py
index 6c23618a7329..efe38affe4bf 100644
--- a/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py
+++ b/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py
@@ -70,7 +70,7 @@
 from nemo.backends.pytorch.common import CrossEntropyLoss, MSELoss
 from nemo.collections.nlp.callbacks.glue_benchmark_callback import eval_epochs_done_callback, eval_iter_callback
 from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer
-from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import output_modes, processors
+from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset import output_modes, processors
 from nemo.collections.nlp.nm.data_layers import GlueClassificationDataLayer, GlueRegressionDataLayer
 from nemo.collections.nlp.nm.trainables import SequenceClassifier, SequenceRegression
 from nemo.utils.lr_policies import get_lr_policy
diff --git a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py
index 196a0e492055..81fdfad719a3 100644
--- a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py
+++ b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py
@@ -23,7 +23,7 @@
 
 import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm
 from nemo import logging
-from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc
+from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc
 
 # Parsing arguments
 parser = argparse.ArgumentParser(description='Joint-intent BERT')
diff --git a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py
index 84ab723c94a8..6c44b1a58042 100644
--- a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py
+++ b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py
@@ -21,7 +21,7 @@
 
 import nemo.collections.nlp as nemo_nlp
 import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm
-from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc
+from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc
 from nemo.collections.nlp.utils.common_nlp_utils import read_intent_slot_outputs
 
 # Parsing arguments
diff --git a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py
index 0cbdb08f72cc..f321d955c8df 100644
--- a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py
+++ b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py
@@ -26,7 +26,7 @@
 import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm
 from nemo import logging
 from nemo.collections.nlp.callbacks.joint_intent_slot_callback import eval_epochs_done_callback, eval_iter_callback
-from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc
+from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc
 from nemo.utils.lr_policies import get_lr_policy
 
 # Parsing arguments
diff --git a/nemo/collections/nlp/data/datasets/__init__.py b/nemo/collections/nlp/data/datasets/__init__.py
index 2ca68b4f6991..67561e31959f 100644
--- a/nemo/collections/nlp/data/datasets/__init__.py
+++ b/nemo/collections/nlp/data/datasets/__init__.py
@@ -15,8 +15,8 @@
 # =============================================================================
 
 from nemo.collections.nlp.data.datasets.datasets_utils import *
-from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import GLUEDataset
-from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import (
+from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset import GLUEDataset
+from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.joint_intent_slot_dataset import (
     BertJointIntentSlotDataset,
     BertJointIntentSlotInferDataset,
 )
@@ -31,7 +31,7 @@
     BertPunctuationCapitalizationDataset,
     BertPunctuationCapitalizationInferDataset,
 )
-from nemo.collections.nlp.data.datasets.qa_squad_dataset import SquadDataset
+from nemo.collections.nlp.data.datasets.qa_squad_dataset.qa_squad_dataset import SquadDataset
 from nemo.collections.nlp.data.datasets.text_classification_dataset import BertTextClassificationDataset
 from nemo.collections.nlp.data.datasets.token_classification_dataset import (
     BertTokenClassificationDataset,
diff --git a/nemo/collections/nlp/data/datasets/datasets_utils/datasets_processing.py b/nemo/collections/nlp/data/datasets/datasets_utils/datasets_processing.py
index ea14c8716a4e..d1e8ee764719 100644
--- a/nemo/collections/nlp/data/datasets/datasets_utils/datasets_processing.py
+++ b/nemo/collections/nlp/data/datasets/datasets_utils/datasets_processing.py
@@ -1,6 +1,7 @@
 import glob
 import json
 import os
+import pickle
 import shutil
 
 from nemo import logging
@@ -379,3 +380,47 @@ def process_nlu(filename, uncased, modes=['train', 'test'], dataset_name='nlu-ub
     for mode in modes:
         outfiles[mode].close()
     return outfold
+
+
+def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True):
+    """
+    Reads dataset from file line by line, tokenizes each line with tokenizer,
+    and returns list of lists which corresponds to ids of tokenized strings.
+
+    If a pickle file named `dataset + ".pkl"` already exists, it is loaded
+    and returned as is, without tokenizing anything (in that case
+    `tokenizer` and `add_bos_eos` are not consulted).
+
+    Args:
+        dataset: path to dataset
+        tokenizer: tokenizer to convert text into ids
+        cache_ids: if True, ids are saved to disk as pickle file
+            with similar name (e.g., data.txt --> data.txt.pkl)
+        add_bos_eos: bool, whether to add <s> and </s> symbols (e.g., for NMT)
+    Returns:
+        ids: list of ids which correspond to tokenized strings of the dataset
+    """
+
+    cached_ids_dataset = dataset + ".pkl"
+    if os.path.isfile(cached_ids_dataset):
+        logging.info("Loading cached tokenized dataset ...")
+        # NOTE: unpickling executes arbitrary code; only point this at cache
+        # files that were produced by this function.
+        with open(cached_ids_dataset, "rb") as f:
+            ids = pickle.load(f)
+    else:
+        logging.info("Tokenizing dataset ...")
+        ids = []
+        # Read as bytes and decode explicitly so the result does not depend
+        # on the platform's default text encoding.
+        with open(dataset, "rb") as f:
+            for sentence in f:
+                sent_ids = tokenizer.text_to_ids(sentence.decode("utf-8"))
+                if add_bos_eos:
+                    sent_ids = [tokenizer.bos_id] + sent_ids + [tokenizer.eos_id]
+                ids.append(sent_ids)
+        if cache_ids:
+            logging.info("Caching tokenized dataset ...")
+            with open(cached_ids_dataset, "wb") as f:
+                pickle.dump(ids, f)
+    return ids
diff --git a/nemo/collections/nlp/data/datasets/datasets_utils/preprocessing.py b/nemo/collections/nlp/data/datasets/datasets_utils/preprocessing.py
index b137305c9353..5fa6ae68c810 100644
--- a/nemo/collections/nlp/data/datasets/datasets_utils/preprocessing.py
+++ b/nemo/collections/nlp/data/datasets/datasets_utils/preprocessing.py
@@ -43,6 +43,7 @@
     'get_intent_labels',
     'normalize_answer',
     'get_tokens',
+    'get_stats',
 ]
 
 DATABASE_EXISTS_TMP = '{} dataset has already been processed and stored at {}'
diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py
deleted file mode 100644
index 26423c3aa549..000000000000
--- a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py
+++ /dev/null
@@ -1,593 +0,0 @@
-"""
-Copyright 2018 The Google AI Language Team Authors and
-The HuggingFace Inc. team.
-Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
-Utility functions for GLUE tasks
-Some transformer of this code were adapted from the HuggingFace library at
-https://github.com/huggingface/transformers
-"""
-import csv
-import os
-
-import numpy as np
-from torch.utils.data import Dataset
-
-from nemo import logging
-
-__all__ = ['GLUEDataset']
-
-
-class GLUEDataset(Dataset):
-    def __init__(self, data_dir, tokenizer, max_seq_length, processor, output_mode, evaluate, token_params):
-        self.tokenizer = tokenizer
-        self.label_list = processor.get_labels()
-        self.examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir)
-        self.features = convert_examples_to_features(
-            self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params
-        )
-
-    def __len__(self):
-        return len(self.features)
-
-    def __getitem__(self, idx):
-        feature = self.features[idx]
-        return (
-            np.array(feature.input_ids),
-            np.array(feature.segment_ids),
-            np.array(feature.input_mask, dtype=np.long),
-            np.array(feature.label_id),
-        )
-
-
-def convert_examples_to_features(
-    examples,
-    label_list,
-    max_seq_length,
-    tokenizer,
-    output_mode,
-    bos_token=None,
-    eos_token='[SEP]',
-    pad_token='[PAD]',
-    cls_token='[CLS]',
-    sep_token_extra=None,
-    cls_token_at_end=False,
-    cls_token_segment_id=0,
-    pad_token_segment_id=0,
-    pad_on_left=False,
-    mask_padding_with_zero=True,
-    sequence_a_segment_id=0,
-    sequence_b_segment_id=1,
-):
-    """ Loads a data file into a list of `InputBatch`s
-        `cls_token_at_end` define the location of the CLS token:
-            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
-            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
-        `cls_token_segment_id` define the segment id associated to the CLS
-        token (0 for BERT, 2 for XLNet)
-         The convention in BERT is:
-         (a) For sequence pairs:
-          tokens:   [CLS] is this jack ##ville ? [SEP] no it is not . [SEP]
-          type_ids:   0   0  0    0    0       0   0   1  1  1  1   1   1
-         (b) For single sequences:
-          tokens:   [CLS] the dog is hairy . [SEP]
-          type_ids:   0   0   0   0  0     0   0
-         Where "type_ids" are used to indicate whether this is the first
-         sequence or the second sequence. The embedding vectors for `type=0`
-         and `type=1` were learned during pre-training and are added to the
-         wordpiece embedding vector (and position vector). This is
-         not *strictly* necessarysince the [SEP] token unambiguously separates
-         the sequences, but it makes it easier for the model to learn
-         the concept of sequences.
-         For classification tasks, the first vector (corresponding to [CLS])
-         is used as as the "sentence vector". Note that this only makes sense
-         because the entire model is fine-tuned.
-         For NMT:
-         (a) For sequence pairs:
-          tokens:<BOS> is this jack ##ville ? <EOS> <BOS> no it is not . <EOS>
-          type_ids:0   0  0    0    0       0   0     1   1  1  1  1   1   1
-         (b) For single sequences:
-          tokens:   <BOS> the dog is hairy . <EOS>
-          type_ids:   0   0   0   0  0     0   0
-    """
-    label_map = {label: i for i, label in enumerate(label_list)}
-
-    features = []
-    for ex_index, example in enumerate(examples):
-        if ex_index % 10000 == 0:
-            logging.info("Writing example %d of %d" % (ex_index, len(examples)))
-
-        tokens_a = tokenizer.text_to_tokens(example.text_a)
-
-        tokens_b = None
-        if example.text_b:
-            tokens_b = tokenizer.text_to_tokens(example.text_b)
-
-            special_tokens_count = 2 if eos_token else 0
-            special_tokens_count += 1 if sep_token_extra else 0
-            special_tokens_count += 2 if bos_token else 0
-            special_tokens_count += 1 if cls_token else 0
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
-        else:
-            special_tokens_count = 1 if eos_token else 0
-            special_tokens_count += 1 if sep_token_extra else 0
-            special_tokens_count += 1 if bos_token else 0
-            if len(tokens_a) > max_seq_length - special_tokens_count:
-                tokens_a = tokens_a[: max_seq_length - special_tokens_count]
-        # Add special tokens to sequence_a
-        tokens = tokens_a
-        if bos_token:
-            tokens = [bos_token] + tokens
-        if eos_token:
-            tokens += [eos_token]
-        segment_ids = [sequence_a_segment_id] * len(tokens)
-
-        # Add sequence separator between sequences
-        if tokens_b and sep_token_extra:
-            tokens += [sep_token_extra]
-            segment_ids += [sequence_a_segment_id]
-
-        # Add special tokens to sequence_b
-        if tokens_b:
-            if bos_token:
-                tokens += [bos_token]
-                segment_ids += [sequence_b_segment_id]
-            tokens += tokens_b
-            segment_ids += [sequence_b_segment_id] * (len(tokens_b))
-            if eos_token:
-                tokens += [eos_token]
-                segment_ids += [sequence_b_segment_id]
-
-        # Add classification token - for BERT models
-        if cls_token:
-            if cls_token_at_end:
-                tokens += [cls_token]
-                segment_ids += [cls_token_segment_id]
-            else:
-                tokens = [cls_token] + tokens
-                segment_ids = [cls_token_segment_id] + segment_ids
-        input_ids = tokenizer.tokens_to_ids(tokens)
-
-        # The mask has 1 for real tokens and 0 for padding tokens. Only real
-        # tokens are attended to.
-        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-        # Zero-pad up to the sequence length.
-        padding_length = max_seq_length - len(input_ids)
-        pad_token_id = tokenizer.tokens_to_ids([pad_token])[0]
-        if pad_on_left:
-            input_ids = ([pad_token_id] * padding_length) + input_ids
-            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
-            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
-        else:
-            input_ids = input_ids + ([pad_token_id] * padding_length)
-            input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
-            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
-        if len(input_ids) != max_seq_length:
-            raise ValueError("input_ids must be of length max_seq_length")
-        if len(input_mask) != max_seq_length:
-            raise ValueError("input_mask must be of length max_seq_length")
-        if len(segment_ids) != max_seq_length:
-            raise ValueError("segment_ids must be of length max_seq_length")
-        if output_mode == "classification":
-            label_id = label_map[example.label]
-        elif output_mode == "regression":
-            label_id = np.float32(example.label)
-        else:
-            raise KeyError(output_mode)
-
-        if ex_index < 5:
-            logging.info("*** Example ***")
-            logging.info("guid: %s" % (example.guid))
-            logging.info("tokens: %s" % " ".join(list(map(str, tokens))))
-            logging.info("input_ids: %s" % " ".join(list(map(str, input_ids))))
-            logging.info("input_mask: %s" % " ".join(list(map(str, input_mask))))
-            logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids))))
-            logging.info("label: %s (id = %d)" % (example.label, label_id))
-
-        features.append(
-            InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id)
-        )
-    return features
-
-
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length.
-
-     This will always truncate the longer sequence one token at a time.
-     This makes more sense than truncating an equal percent
-     of tokens from each, since if one sequence is very short then each token
-     that's truncated likely contains more information than a longer sequence.
-    """
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-
-
-"""
-Utility functions for GLUE tasks
-This code was adapted from the HuggingFace library at
-https://github.com/huggingface/transformers
-"""
-
-
-class InputFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(self, input_ids, input_mask, segment_ids, label_id):
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.segment_ids = segment_ids
-        self.label_id = label_id
-
-
-class InputExample(object):
-    """A single training/test example for simple sequence classification."""
-
-    def __init__(self, guid, text_a, text_b=None, label=None):
-        """Constructs a InputExample.
-
-        Args:
-            guid: Unique id for the example.
-            text_a: string. The untokenized text of the first sequence.
-            For single sequence tasks, only this sequence must be specified.
-            text_b: (Optional) string. The untokenized text of the second
-            sequence. Only must be specified for sequence pair tasks.
-            label: (Optional) string. The label of the example. This should be
-            specified for train and dev examples, but not for test examples.
-        """
-        self.guid = guid
-        self.text_a = text_a
-        self.text_b = text_b
-        self.label = label
-
-
-class DataProcessor(object):
-    """Base class for data converters for sequence classification data sets."""
-
-    def get_train_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
-
-    def get_dev_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
-
-    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
-
-    @classmethod
-    def _read_tsv(cls, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, "r", encoding="utf-8-sig") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
-            lines = []
-            for line in reader:
-                # if sys.version_info[0] == 2:
-                #     line = list(unicode(cell, 'utf-8') for cell in line)
-                lines.append(line)
-            return lines
-
-
-class MrpcProcessor(DataProcessor):
-    """Processor for the MRPC data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}')
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, i)
-            text_a = line[3]
-            text_b = line[4]
-            label = line[0]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class MnliProcessor(DataProcessor):
-    """Processor for the MultiNLI data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched")
-
-    def get_labels(self):
-        """See base class."""
-        return ["contradiction", "entailment", "neutral"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[8]
-            text_b = line[9]
-            label = line[-1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class MnliMismatchedProcessor(MnliProcessor):
-    """Processor for the MultiNLI Mismatched data set (GLUE version)."""
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched")
-
-
-class ColaProcessor(DataProcessor):
-    """Processor for the CoLA data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            guid = "%s-%s" % (set_type, i)
-            text_a = line[3]
-            label = line[1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
-        return examples
-
-
-class Sst2Processor(DataProcessor):
-    """Processor for the SST-2 data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, i)
-            text_a = line[0]
-            label = line[1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
-        return examples
-
-
-class StsbProcessor(DataProcessor):
-    """Processor for the STS-B data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return [None]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[7]
-            text_b = line[8]
-            label = line[-1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class QqpProcessor(DataProcessor):
-    """Processor for the QQP data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            try:
-                text_a = line[3]
-                text_b = line[4]
-                label = line[5]
-            except IndexError:
-                continue
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class QnliProcessor(DataProcessor):
-    """Processor for the QNLI data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched")
-
-    def get_labels(self):
-        """See base class."""
-        return ["entailment", "not_entailment"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[1]
-            text_b = line[2]
-            label = line[-1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class RteProcessor(DataProcessor):
-    """Processor for the RTE data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return ["entailment", "not_entailment"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[1]
-            text_b = line[2]
-            label = line[-1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-class WnliProcessor(DataProcessor):
-    """Processor for the WNLI data set (GLUE version)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[1]
-            text_b = line[2]
-            label = line[-1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-processors = {
-    "cola": ColaProcessor,
-    "mnli": MnliProcessor,
-    "mnli-mm": MnliMismatchedProcessor,
-    "mrpc": MrpcProcessor,
-    "sst-2": Sst2Processor,
-    "sts-b": StsbProcessor,
-    "qqp": QqpProcessor,
-    "qnli": QnliProcessor,
-    "rte": RteProcessor,
-    "wnli": WnliProcessor,
-}
-output_modes = {
-    "cola": "classification",
-    "mnli": "classification",
-    "mnli-mm": "classification",
-    "mrpc": "classification",
-    "sst-2": "classification",
-    "sts-b": "regression",
-    "qqp": "classification",
-    "qnli": "classification",
-    "rte": "classification",
-    "wnli": "classification",
-}
-GLUE_TASKS_NUM_LABELS = {
-    "cola": 2,
-    "mnli": 3,
-    "mrpc": 2,
-    "sst-2": 2,
-    "sts-b": 1,
-    "qqp": 2,
-    "qnli": 2,
-    "rte": 2,
-    "wnli": 2,
-}
diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/__init__.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/data_processors.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/data_processors.py
new file mode 100644
index 000000000000..b9d2a6bc4451
--- /dev/null
+++ b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/data_processors.py
@@ -0,0 +1,302 @@
+import csv
+import os
+
+from nemo import logging
+from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset import InputExample
+
+
+class DataProcessor(object):
+    """Base class for converters that turn the files of a data set into `InputExample`s."""
+
+    def get_train_examples(self, data_dir):
+        """Return the `InputExample`s of the train set; subclasses must override this."""
+        raise NotImplementedError()
+
+    def get_dev_examples(self, data_dir):
+        """Return the `InputExample`s of the dev set; subclasses must override this."""
+        raise NotImplementedError()
+
+    def get_labels(self):
+        """Return the list of labels for this data set; subclasses must override this."""
+        raise NotImplementedError()
+
+    @classmethod
+    def _read_tsv(cls, input_file, quotechar=None):
+        """Read a tab-separated file and return all of its rows, first row included, as lists of strings."""
+        with open(input_file, "r", encoding="utf-8-sig") as f:
+            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            lines = []
+            for line in reader:
+                # Rows are collected eagerly so that the complete list is
+                # available once the `with` block has closed the file.
+                lines.append(line)
+            return lines
+
+
+class MrpcProcessor(DataProcessor):
+    """Processor for the MRPC data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """Return the train examples read from `train.tsv` in `data_dir`."""
+        logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}')
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """Return the dev examples read from `dev.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """Return the two class labels as strings."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Build pair examples: column 0 is the label, columns 3 and 4 are the texts; guid uses the row index."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:  # skip the first row (assumed to be the header)
+                continue
+            guid = "%s-%s" % (set_type, i)
+            text_a = line[3]
+            text_b = line[4]
+            label = line[0]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class MnliProcessor(DataProcessor):
+    """Processor for the MultiNLI data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """Return the train examples read from `train.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """Return the dev examples read from `dev_matched.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched")
+
+    def get_labels(self):
+        """Return the three class labels."""
+        return ["contradiction", "entailment", "neutral"]
+
+    def _create_examples(self, lines, set_type):
+        """Build pair examples: column 0 is the id, columns 8 and 9 are the texts, the last column the label."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:  # skip the first row (assumed to be the header)
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[8]
+            text_b = line[9]
+            label = line[-1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class MnliMismatchedProcessor(MnliProcessor):
+    """Processor for the MultiNLI Mismatched data set (GLUE version)."""
+
+    def get_dev_examples(self, data_dir):
+        """Return the dev examples read from `dev_mismatched.tsv`; guids carry the matching prefix."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_mismatched")
+
+
+class ColaProcessor(DataProcessor):
+    """Processor for the CoLA data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """Return the train examples read from `train.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """Return the dev examples read from `dev.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """Return the two class labels as strings."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Build single-sentence examples from every row (none is skipped): column 3 text, column 1 label."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            guid = "%s-%s" % (set_type, i)
+            text_a = line[3]
+            label = line[1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
+        return examples
+
+
+class Sst2Processor(DataProcessor):
+    """Processor for the SST-2 data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """Return the train examples read from `train.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """Return the dev examples read from `dev.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """Return the two class labels as strings."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Build single-sentence examples: column 0 is the text, column 1 the label; guid uses the row index."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:  # skip the first row (assumed to be the header)
+                continue
+            guid = "%s-%s" % (set_type, i)
+            text_a = line[0]
+            label = line[1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
+        return examples
+
+
+class StsbProcessor(DataProcessor):
+    """Processor for the STS-B data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """Return the train examples read from `train.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """Return the dev examples read from `dev.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """Return `[None]`: a single placeholder entry instead of class names."""
+        return [None]
+
+    def _create_examples(self, lines, set_type):
+        """Build pair examples: column 0 is the id, columns 7 and 8 are the texts, the last column the label."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:  # skip the first row (assumed to be the header)
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[7]
+            text_b = line[8]
+            label = line[-1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class QqpProcessor(DataProcessor):
+    """Processor for the QQP data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """Return the train examples read from `train.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """Return the dev examples read from `dev.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """Return the two class labels as strings."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Build pair examples: column 0 is the id, columns 3 and 4 are the texts, column 5 the label."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:  # skip the first row (assumed to be the header)
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            try:
+                text_a = line[3]
+                text_b = line[4]
+                label = line[5]
+            except IndexError:  # rows with fewer than six columns are dropped without a message
+                continue
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class QnliProcessor(DataProcessor):
+    """Processor for the QNLI data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """Return the train examples read from `train.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """Return the dev examples read from `dev.tsv` in `data_dir`; guids are prefixed with "dev"."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """Return the two class labels."""
+        return ["entailment", "not_entailment"]
+
+    def _create_examples(self, lines, set_type):
+        """Build pair examples: column 0 is the id, columns 1 and 2 are the texts, the last column the label."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:  # skip the first row (assumed to be the header)
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[1]
+            text_b = line[2]
+            label = line[-1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class RteProcessor(DataProcessor):
+    """Processor for the RTE data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """Return the train examples read from `train.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """Return the dev examples read from `dev.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """Return the two class labels."""
+        return ["entailment", "not_entailment"]
+
+    def _create_examples(self, lines, set_type):
+        """Build pair examples: column 0 is the id, columns 1 and 2 are the texts, the last column the label."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:  # skip the first row (assumed to be the header)
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[1]
+            text_b = line[2]
+            label = line[-1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class WnliProcessor(DataProcessor):
+    """Processor for the WNLI data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """Return the train examples read from `train.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """Return the dev examples read from `dev.tsv` in `data_dir`."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """Return the two class labels as strings."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Build pair examples: column 0 is the id, columns 1 and 2 are the texts, the last column the label."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:  # skip the first row (assumed to be the header)
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[1]
+            text_b = line[2]
+            label = line[-1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
\ No newline at end of file
diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/glue_benchmark_dataset.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/glue_benchmark_dataset.py
new file mode 100644
index 000000000000..6530a26dd508
--- /dev/null
+++ b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/glue_benchmark_dataset.py
@@ -0,0 +1,295 @@
+"""
+Copyright 2018 The Google AI Language Team Authors and
+The HuggingFace Inc. team.
+Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Utility functions for GLUE tasks
+Some parts of this code were adapted from the HuggingFace library at
+https://github.com/huggingface/transformers
+"""
+
+import numpy as np
+from torch.utils.data import Dataset
+
+from nemo import logging
+
+__all__ = ['GLUEDataset']
+
+processors = {  # NOTE(review): none of the *Processor names is imported in this module — confirm how they resolve
+    "cola": ColaProcessor,
+    "mnli": MnliProcessor,
+    "mnli-mm": MnliMismatchedProcessor,
+    "mrpc": MrpcProcessor,
+    "sst-2": Sst2Processor,
+    "sts-b": StsbProcessor,
+    "qqp": QqpProcessor,
+    "qnli": QnliProcessor,
+    "rte": RteProcessor,
+    "wnli": WnliProcessor,
+}
+output_modes = {  # task name -> "classification" or "regression"
+    "cola": "classification",
+    "mnli": "classification",
+    "mnli-mm": "classification",
+    "mrpc": "classification",
+    "sst-2": "classification",
+    "sts-b": "regression",
+    "qqp": "classification",
+    "qnli": "classification",
+    "rte": "classification",
+    "wnli": "classification",
+}
+GLUE_TASKS_NUM_LABELS = {  # task name -> label count; NOTE(review): has no "mnli-mm" key, unlike the two dicts above
+    "cola": 2,
+    "mnli": 3,
+    "mrpc": 2,
+    "sst-2": 2,
+    "sts-b": 1,
+    "qqp": 2,
+    "qnli": 2,
+    "rte": 2,
+    "wnli": 2,
+}
+
+class GLUEDataset(Dataset):
+    """Torch dataset of padded model inputs built from the train or dev examples of a GLUE processor."""
+
+    def __init__(self, data_dir, tokenizer, max_seq_length, processor, output_mode, evaluate, token_params):
+        self.tokenizer = tokenizer
+        self.label_list = processor.get_labels()
+        self.examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir)
+        self.features = convert_examples_to_features(
+            self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params
+        )
+
+    def __len__(self):
+        return len(self.features)
+
+    def __getitem__(self, idx):
+        feature = self.features[idx]
+        return (
+            np.array(feature.input_ids),
+            np.array(feature.segment_ids),
+            np.array(feature.input_mask, dtype=np.int64),  # explicit width; `np.long` is absent from some NumPy releases
+            np.array(feature.label_id),
+        )
+
+
+def convert_examples_to_features(
+    examples,
+    label_list,
+    max_seq_length,
+    tokenizer,
+    output_mode,
+    bos_token=None,
+    eos_token='[SEP]',
+    pad_token='[PAD]',
+    cls_token='[CLS]',
+    sep_token_extra=None,
+    cls_token_at_end=False,
+    cls_token_segment_id=0,
+    pad_token_segment_id=0,
+    pad_on_left=False,
+    mask_padding_with_zero=True,
+    sequence_a_segment_id=0,
+    sequence_b_segment_id=1,
+):
+    """Converts `InputExample`s into a list with one padded `InputFeatures` per example.
+
+    Raises ValueError when a padded sequence is not `max_seq_length` long, and KeyError when
+    `output_mode` is neither "classification" nor "regression" or a label is not in `label_list`.
+
+    `cls_token_at_end` defines the location of the CLS token:
+        - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
+        - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
+    `cls_token_segment_id` defines the segment id associated to the CLS
+    token (0 for BERT, 2 for XLNet)
+    The convention in BERT is:
+    (a) For sequence pairs:
+      tokens:   [CLS] is this jack ##ville ? [SEP] no it is not . [SEP]
+      type_ids:   0   0  0    0    0       0   0   1  1  1  1   1   1
+    (b) For single sequences:
+      tokens:   [CLS] the dog is hairy . [SEP]
+      type_ids:   0   0   0   0  0     0   0
+    Where "type_ids" are used to indicate whether this is the first sequence or the second sequence.
+    The embedding vectors for `type=0` and `type=1` were learned during pre-training and are added to
+    the wordpiece embedding vector (and position vector). This is not *strictly* necessary since the
+    [SEP] token unambiguously separates the sequences, but it makes it easier for the model to learn
+    the concept of sequences. For classification tasks, the first vector (corresponding to [CLS]) is
+    used as the "sentence vector". Note that this only makes sense because the entire model is fine-tuned.
+    For NMT:
+    (a) For sequence pairs:
+      tokens:<BOS> is this jack ##ville ? <EOS> <BOS> no it is not . <EOS>
+      type_ids:0   0  0    0    0       0   0     1   1  1  1  1   1   1
+    (b) For single sequences:
+      tokens:   <BOS> the dog is hairy . <EOS>
+      type_ids:   0   0   0   0  0     0   0
+    """
+    label_map = {label: i for i, label in enumerate(label_list)}
+
+    features = []
+    for ex_index, example in enumerate(examples):
+        if ex_index % 10000 == 0:
+            logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+        tokens_a = tokenizer.text_to_tokens(example.text_a)
+
+        tokens_b = None
+        if example.text_b:
+            tokens_b = tokenizer.text_to_tokens(example.text_b)
+
+            special_tokens_count = 2 if eos_token else 0
+            special_tokens_count += 1 if sep_token_extra else 0
+            special_tokens_count += 2 if bos_token else 0
+            special_tokens_count += 1 if cls_token else 0
+            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
+        else:
+            special_tokens_count = 1 if eos_token else 0
+            special_tokens_count += 1 if sep_token_extra else 0
+            special_tokens_count += 1 if bos_token else 0
+            special_tokens_count += 1 if cls_token else 0  # the CLS token is added below for single sentences too
+            if len(tokens_a) > max_seq_length - special_tokens_count:
+                tokens_a = tokens_a[: max_seq_length - special_tokens_count]
+        # Add special tokens to sequence_a
+        tokens = tokens_a
+        if bos_token:
+            tokens = [bos_token] + tokens
+        if eos_token:
+            tokens += [eos_token]
+        segment_ids = [sequence_a_segment_id] * len(tokens)
+
+        # Add sequence separator between sequences
+        if tokens_b and sep_token_extra:
+            tokens += [sep_token_extra]
+            segment_ids += [sequence_a_segment_id]
+
+        # Add special tokens to sequence_b
+        if tokens_b:
+            if bos_token:
+                tokens += [bos_token]
+                segment_ids += [sequence_b_segment_id]
+            tokens += tokens_b
+            segment_ids += [sequence_b_segment_id] * (len(tokens_b))
+            if eos_token:
+                tokens += [eos_token]
+                segment_ids += [sequence_b_segment_id]
+
+        # Add classification token - for BERT models
+        if cls_token:
+            if cls_token_at_end:
+                tokens += [cls_token]
+                segment_ids += [cls_token_segment_id]
+            else:
+                tokens = [cls_token] + tokens
+                segment_ids = [cls_token_segment_id] + segment_ids
+        input_ids = tokenizer.tokens_to_ids(tokens)
+
+        # The mask has 1 for real tokens and 0 for padding tokens. Only real
+        # tokens are attended to.
+        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+        # Zero-pad up to the sequence length.
+        padding_length = max_seq_length - len(input_ids)
+        pad_token_id = tokenizer.tokens_to_ids([pad_token])[0]
+        if pad_on_left:
+            input_ids = ([pad_token_id] * padding_length) + input_ids
+            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
+            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
+        else:
+            input_ids = input_ids + ([pad_token_id] * padding_length)
+            input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
+        if len(input_ids) != max_seq_length:
+            raise ValueError("input_ids must be of length max_seq_length")
+        if len(input_mask) != max_seq_length:
+            raise ValueError("input_mask must be of length max_seq_length")
+        if len(segment_ids) != max_seq_length:
+            raise ValueError("segment_ids must be of length max_seq_length")
+        if output_mode == "classification":
+            label_id = label_map[example.label]
+        elif output_mode == "regression":
+            label_id = np.float32(example.label)
+        else:
+            raise KeyError(output_mode)
+
+        if ex_index < 5:
+            logging.info("*** Example ***")
+            logging.info("guid: %s" % (example.guid))
+            logging.info("tokens: %s" % " ".join(list(map(str, tokens))))
+            logging.info("input_ids: %s" % " ".join(list(map(str, input_ids))))
+            logging.info("input_mask: %s" % " ".join(list(map(str, input_mask))))
+            logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids))))
+            logging.info("label: %s (id = %d)" % (example.label, label_id))
+
+        features.append(
+            InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id)
+        )
+    return features
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Truncates a sequence pair in place to the maximum length.
+
+     This will always truncate the longer sequence one token at a time.
+     This makes more sense than truncating an equal percent
+     of tokens from each, since if one sequence is very short then each token
+     that's truncated likely contains more information than a longer sequence.
+    """
+    while True:
+        total_length = len(tokens_a) + len(tokens_b)
+        if total_length <= max_length:
+            break
+        if len(tokens_a) > len(tokens_b):
+            tokens_a.pop()
+        else:
+            tokens_b.pop()
+
+
+# Utility functions for GLUE tasks. This code was adapted from the HuggingFace library at
+# https://github.com/huggingface/transformers
+
+
+class InputFeatures(object):
+    """Plain container for the input ids, input mask, segment ids and label id of one example."""
+
+    def __init__(self, input_ids, input_mask, segment_ids, label_id):
+        self.input_ids = input_ids
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+        self.label_id = label_id
+
+
+class InputExample(object):
+    """A single training/test example for simple sequence classification."""
+
+    def __init__(self, guid, text_a, text_b=None, label=None):
+        """Constructs an InputExample.
+
+        Args:
+            guid: Unique id for the example.
+            text_a: string. The untokenized text of the first sequence.
+                For single sequence tasks, only this sequence must be specified.
+            text_b: (Optional) string. The untokenized text of the second
+                sequence. Only must be specified for sequence pair tasks.
+            label: (Optional) string. The label of the example. This should be
+                specified for train and dev examples, but not for test examples.
+        """
+        self.guid = guid
+        self.text_a = text_a
+        self.text_b = text_b
+        self.label = label
+
+
+
diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py
deleted file mode 100644
index 7dc25bba3848..000000000000
--- a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py
+++ /dev/null
@@ -1,481 +0,0 @@
-# Copyright 2018 The Google AI Language Team Authors and
-# The HuggingFace Inc. team.
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Utility functions for Token Classification NLP tasks
-Some parts of this code were adapted from the HuggingFace library at
-https://github.com/huggingface/pytorch-pretrained-BERT
-"""
-import itertools
-import os
-import random
-
-import numpy as np
-from torch.utils.data import Dataset
-
-from nemo import logging
-from nemo.collections.nlp.data.datasets.datasets_utils.datasets_processing import (
-    process_atis,
-    process_jarvis_datasets,
-    process_snips,
-)
-from nemo.collections.nlp.data.datasets.datasets_utils.dialogflow_utils import process_dialogflow
-from nemo.collections.nlp.data.datasets.datasets_utils.mturk_utils import process_mturk
-from nemo.collections.nlp.data.datasets.datasets_utils.preprocessing import (
-    DATABASE_EXISTS_TMP,
-    get_label_stats,
-    get_stats,
-)
-from nemo.collections.nlp.utils import list2str, write_vocab_in_order
-from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights, get_vocab, if_exist, label2idx
-
-__all__ = ['BertJointIntentSlotDataset', 'BertJointIntentSlotInferDataset', 'JointIntentSlotDataDesc']
-
-
-def get_features(
-    queries,
-    max_seq_length,
-    tokenizer,
-    pad_label=128,
-    raw_slots=None,
-    ignore_extra_tokens=False,
-    ignore_start_end=False,
-):
-    all_subtokens = []
-    all_loss_mask = []
-    all_subtokens_mask = []
-    all_segment_ids = []
-    all_input_ids = []
-    all_input_mask = []
-    sent_lengths = []
-    all_slots = []
-
-    with_label = False
-    if raw_slots is not None:
-        with_label = True
-
-    for i, query in enumerate(queries):
-        words = query.strip().split()
-        subtokens = ['[CLS]']
-        loss_mask = [1 - ignore_start_end]
-        subtokens_mask = [0]
-        if with_label:
-            slots = [pad_label]
-
-        for j, word in enumerate(words):
-            word_tokens = tokenizer.tokenize(word)
-            subtokens.extend(word_tokens)
-
-            loss_mask.append(1)
-            loss_mask.extend([not ignore_extra_tokens] * (len(word_tokens) - 1))
-
-            subtokens_mask.append(1)
-            subtokens_mask.extend([0] * (len(word_tokens) - 1))
-
-            if with_label:
-                slots.extend([raw_slots[i][j]] * len(word_tokens))
-
-        subtokens.append('[SEP]')
-        loss_mask.append(not ignore_start_end)
-        subtokens_mask.append(0)
-        sent_lengths.append(len(subtokens))
-        all_subtokens.append(subtokens)
-        all_loss_mask.append(loss_mask)
-        all_subtokens_mask.append(subtokens_mask)
-        all_input_mask.append([1] * len(subtokens))
-        if with_label:
-            slots.append(pad_label)
-            all_slots.append(slots)
-
-    max_seq_length = min(max_seq_length, max(sent_lengths))
-    logging.info(f'Max length: {max_seq_length}')
-    get_stats(sent_lengths)
-    too_long_count = 0
-
-    for i, subtokens in enumerate(all_subtokens):
-        if len(subtokens) > max_seq_length:
-            subtokens = ['[CLS]'] + subtokens[-max_seq_length + 1 :]
-            all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :]
-            all_loss_mask[i] = [1 - ignore_start_end] + all_loss_mask[i][-max_seq_length + 1 :]
-            all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :]
-
-            if with_label:
-                all_slots[i] = [pad_label] + all_slots[i][-max_seq_length + 1 :]
-            too_long_count += 1
-
-        all_input_ids.append([tokenizer._convert_token_to_id(t) for t in subtokens])
-
-        if len(subtokens) < max_seq_length:
-            extra = max_seq_length - len(subtokens)
-            all_input_ids[i] = all_input_ids[i] + [0] * extra
-            all_loss_mask[i] = all_loss_mask[i] + [0] * extra
-            all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra
-            all_input_mask[i] = all_input_mask[i] + [0] * extra
-
-            if with_label:
-                all_slots[i] = all_slots[i] + [pad_label] * extra
-
-        all_segment_ids.append([0] * max_seq_length)
-
-    logging.info(f'{too_long_count} are longer than {max_seq_length}')
-
-    return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_slots)
-
-
-class BertJointIntentSlotDataset(Dataset):
-    """
-    Creates dataset to use for the task of joint intent
-    and slot classification with pretrained model.
-
-    Converts from raw data to an instance that can be used by
-    NMDataLayer.
-
-    For dataset to use during inference without labels, see
-    BertJointIntentSlotInferDataset.
-
-    Args:
-        input_file (str): file to sequence + label.
-            the first line is header (sentence [tab] label)
-            each line should be [sentence][tab][label]
-        slot_file (str): file to slot labels, each line corresponding to
-            slot labels for a sentence in input_file. No header.
-        max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP]
-        tokenizer (Tokenizer): such as BertTokenizer
-        num_samples (int): number of samples you want to use for the dataset.
-            If -1, use all dataset. Useful for testing.
-        shuffle (bool): whether to shuffle your data.
-        pad_label (int): pad value use for slot labels.
-            by default, it's the neutral label.
-
-    """
-
-    def __init__(
-        self,
-        input_file,
-        slot_file,
-        max_seq_length,
-        tokenizer,
-        num_samples=-1,
-        shuffle=True,
-        pad_label=128,
-        ignore_extra_tokens=False,
-        ignore_start_end=False,
-    ):
-        if num_samples == 0:
-            raise ValueError("num_samples has to be positive", num_samples)
-
-        with open(slot_file, 'r') as f:
-            slot_lines = f.readlines()
-
-        with open(input_file, 'r') as f:
-            input_lines = f.readlines()[1:]
-
-        assert len(slot_lines) == len(input_lines)
-
-        dataset = list(zip(slot_lines, input_lines))
-
-        if shuffle or num_samples > 0:
-            random.shuffle(dataset)
-        if num_samples > 0:
-            dataset = dataset[:num_samples]
-
-        raw_slots, queries, raw_intents = [], [], []
-        for slot_line, input_line in dataset:
-            raw_slots.append([int(slot) for slot in slot_line.strip().split()])
-            parts = input_line.strip().split()
-            raw_intents.append(int(parts[-1]))
-            queries.append(' '.join(parts[:-1]))
-
-        features = get_features(
-            queries,
-            max_seq_length,
-            tokenizer,
-            pad_label=pad_label,
-            raw_slots=raw_slots,
-            ignore_extra_tokens=ignore_extra_tokens,
-            ignore_start_end=ignore_start_end,
-        )
-        self.all_input_ids = features[0]
-        self.all_segment_ids = features[1]
-        self.all_input_mask = features[2]
-        self.all_loss_mask = features[3]
-        self.all_subtokens_mask = features[4]
-        self.all_slots = features[5]
-        self.all_intents = raw_intents
-
-    def __len__(self):
-        return len(self.all_input_ids)
-
-    def __getitem__(self, idx):
-        return (
-            np.array(self.all_input_ids[idx]),
-            np.array(self.all_segment_ids[idx]),
-            np.array(self.all_input_mask[idx], dtype=np.long),
-            np.array(self.all_loss_mask[idx]),
-            np.array(self.all_subtokens_mask[idx]),
-            self.all_intents[idx],
-            np.array(self.all_slots[idx]),
-        )
-
-
-class BertJointIntentSlotInferDataset(Dataset):
-    """
-    Creates dataset to use for the task of joint intent
-    and slot classification with pretrained model.
-
-    Converts from raw data to an instance that can be used by
-    NMDataLayer.
-
-    This is to be used during inference only.
-    For dataset to use during training with labels, see
-    BertJointIntentSlotDataset.
-
-    Args:
-        queries (list): list of queries to run inference on
-        max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP]
-        tokenizer (Tokenizer): such as BertTokenizer
-        pad_label (int): pad value use for slot labels.
-            by default, it's the neutral label.
-
-    """
-
-    def __init__(self, queries, max_seq_length, tokenizer):
-        features = get_features(queries, max_seq_length, tokenizer)
-
-        self.all_input_ids = features[0]
-        self.all_segment_ids = features[1]
-        self.all_input_mask = features[2]
-        self.all_loss_mask = features[3]
-        self.all_subtokens_mask = features[4]
-
-    def __len__(self):
-        return len(self.all_input_ids)
-
-    def __getitem__(self, idx):
-        return (
-            np.array(self.all_input_ids[idx]),
-            np.array(self.all_segment_ids[idx]),
-            np.array(self.all_input_mask[idx], dtype=np.long),
-            np.array(self.all_loss_mask[idx]),
-            np.array(self.all_subtokens_mask[idx]),
-        )
-
-
-class JointIntentSlotDataDesc:
-    """ Convert the raw data to the standard format supported by
-    JointIntentSlotDataset.
-
-    By default, the None label for slots is 'O'.
-
-    JointIntentSlotDataset requires two files:
-
-        input_file: file to sequence + label.
-            the first line is header (sentence [tab] label)
-            each line should be [sentence][tab][label]
-
-        slot_file: file to slot labels, each line corresponding to
-            slot labels for a sentence in input_file. No header.
-
-    To keep the mapping from label index to label consistent during
-    training and inferencing, we require the following files:
-        dicts.intents.csv: each line is an intent. The first line
-            corresponding to the 0 intent label, the second line
-            corresponding to the 1 intent label, and so on.
-
-        dicts.slots.csv: each line is a slot. The first line
-            corresponding to the 0 slot label, the second line
-            corresponding to the 1 slot label, and so on.
-
-    Args:
-        data_dir (str): the directory of the dataset
-        do_lower_case (bool): whether to set your dataset to lowercase
-        dataset_name (str): the name of the dataset. If it's a dataset
-            that follows the standard JointIntentSlotDataset format,
-            you can set the name as 'default'.
-        none_slot_label (str): the label for slots that aren't indentified
-            defaulted to 'O'
-        pad_label (int): the int used for padding. If set to -1,
-             it'll be set to the whatever the None label is.
-
-    """
-
-    def __init__(self, data_dir, do_lower_case=False, dataset_name='default', none_slot_label='O', pad_label=-1):
-        if dataset_name == 'atis':
-            self.data_dir = process_atis(data_dir, do_lower_case)
-        elif dataset_name == 'snips-atis':
-            self.data_dir, self.pad_label = merge(
-                data_dir, ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'], dataset_name
-            )
-        elif dataset_name == 'dialogflow':
-            self.data_dir = process_dialogflow(data_dir, do_lower_case)
-        elif dataset_name == 'mturk-processed':
-            self.data_dir = process_mturk(data_dir, do_lower_case)
-        elif dataset_name in set(['snips-light', 'snips-speak', 'snips-all']):
-            self.data_dir = process_snips(data_dir, do_lower_case)
-            if dataset_name.endswith('light'):
-                self.data_dir = f'{self.data_dir}/light'
-            elif dataset_name.endswith('speak'):
-                self.data_dir = f'{self.data_dir}/speak'
-            elif dataset_name.endswith('all'):
-                self.data_dir = f'{self.data_dir}/all'
-        elif dataset_name.startswith('jarvis'):
-            self.data_dir = process_jarvis_datasets(
-                data_dir, do_lower_case, dataset_name, modes=["train", "test", "eval"], ignore_prev_intent=False
-            )
-        else:
-            if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
-                raise FileNotFoundError(
-                    "Make sure that your data follows the standard format "
-                    "supported by JointIntentSlotDataset. Your data must "
-                    "contain dict.intents.csv and dict.slots.csv."
-                )
-            self.data_dir = data_dir
-
-        self.intent_dict_file = self.data_dir + '/dict.intents.csv'
-        self.slot_dict_file = self.data_dir + '/dict.slots.csv'
-        self.num_intents = len(get_vocab(self.intent_dict_file))
-        slots = label2idx(self.slot_dict_file)
-        self.num_slots = len(slots)
-
-        for mode in ['train', 'test', 'eval']:
-
-            if not if_exist(self.data_dir, [f'{mode}.tsv']):
-                logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.')
-                continue
-
-            slot_file = f'{self.data_dir}/{mode}_slots.tsv'
-            with open(slot_file, 'r') as f:
-                slot_lines = f.readlines()
-
-            input_file = f'{self.data_dir}/{mode}.tsv'
-            with open(input_file, 'r') as f:
-                input_lines = f.readlines()[1:]  # Skipping headers at index 0
-
-            if len(slot_lines) != len(input_lines):
-                raise ValueError(
-                    "Make sure that the number of slot lines match the "
-                    "number of intent lines. There should be a 1-1 "
-                    "correspondence between every slot and intent lines."
-                )
-
-            dataset = list(zip(slot_lines, input_lines))
-
-            raw_slots, queries, raw_intents = [], [], []
-            for slot_line, input_line in dataset:
-                slot_list = [int(slot) for slot in slot_line.strip().split()]
-                raw_slots.append(slot_list)
-                parts = input_line.strip().split()
-                raw_intents.append(int(parts[-1]))
-                queries.append(' '.join(parts[:-1]))
-
-            infold = input_file[: input_file.rfind('/')]
-
-            logging.info(f'Three most popular intents during {mode}ing')
-            total_intents, intent_label_freq = get_label_stats(raw_intents, infold + f'/{mode}_intent_stats.tsv')
-            merged_slots = itertools.chain.from_iterable(raw_slots)
-
-            logging.info(f'Three most popular slots during {mode}ing')
-            slots_total, slots_label_freq = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv')
-
-            if mode == 'train':
-                self.slot_weights = calc_class_weights(slots_label_freq)
-                logging.info(f'Slot weights are - {self.slot_weights}')
-
-                self.intent_weights = calc_class_weights(intent_label_freq)
-                logging.info(f'Intent weights are - {self.intent_weights}')
-
-            logging.info(f'Total intents - {total_intents}')
-            logging.info(f'Intent label frequency - {intent_label_freq}')
-            logging.info(f'Total Slots - {slots_total}')
-            logging.info(f'Slots label frequency - {slots_label_freq}')
-
-        if pad_label != -1:
-            self.pad_label = pad_label
-        else:
-            if none_slot_label not in slots:
-                raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.')
-            self.pad_label = slots[none_slot_label]
-
-
-def merge(data_dir, subdirs, dataset_name, modes=['train', 'test']):
-    outfold = f'{data_dir}/{dataset_name}'
-    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
-        logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold))
-        slots = get_vocab(f'{outfold}/dict.slots.csv')
-        none_slot = 0
-        for key in slots:
-            if slots[key] == 'O':
-                none_slot = key
-                break
-        return outfold, int(none_slot)
-
-    os.makedirs(outfold, exist_ok=True)
-
-    data_files, slot_files = {}, {}
-    for mode in modes:
-        data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w')
-        data_files[mode].write('sentence\tlabel\n')
-        slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w')
-
-    intents, slots = {}, {}
-    intent_shift, slot_shift = 0, 0
-    none_intent, none_slot = -1, -1
-
-    for subdir in subdirs:
-        curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv')
-        curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv')
-
-        for key in curr_intents:
-            if intent_shift > 0 and curr_intents[key] == 'O':
-                continue
-            if curr_intents[key] == 'O' and intent_shift == 0:
-                none_intent = int(key)
-            intents[int(key) + intent_shift] = curr_intents[key]
-
-        for key in curr_slots:
-            if slot_shift > 0 and curr_slots[key] == 'O':
-                continue
-            if slot_shift == 0 and curr_slots[key] == 'O':
-                none_slot = int(key)
-            slots[int(key) + slot_shift] = curr_slots[key]
-
-        for mode in modes:
-            with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f:
-                for line in f.readlines()[1:]:
-                    text, label = line.strip().split('\t')
-                    label = int(label)
-                    if curr_intents[label] == 'O':
-                        label = none_intent
-                    else:
-                        label = label + intent_shift
-                    data_files[mode].write(f'{text}\t{label}\n')
-
-            with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f:
-                for line in f.readlines():
-                    labels = [int(label) for label in line.strip().split()]
-                    shifted_labels = []
-                    for label in labels:
-                        if curr_slots[label] == 'O':
-                            shifted_labels.append(none_slot)
-                        else:
-                            shifted_labels.append(label + slot_shift)
-                    slot_files[mode].write(list2str(shifted_labels) + '\n')
-
-        intent_shift += len(curr_intents)
-        slot_shift += len(curr_slots)
-
-    write_vocab_in_order(intents, f'{outfold}/dict.intents.csv')
-    write_vocab_in_order(slots, f'{outfold}/dict.slots.csv')
-    return outfold, none_slot
diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/data_descriptor.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/data_descriptor.py
new file mode 100644
index 000000000000..15bde0d4b4cb
--- /dev/null
+++ b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/data_descriptor.py
@@ -0,0 +1,217 @@
+import itertools
+import os
+
+from nemo import logging
+from nemo.collections.nlp.data import process_atis, process_dialogflow, process_mturk, process_snips, \
+    process_jarvis_datasets, DATABASE_EXISTS_TMP
+from nemo.collections.nlp.data.datasets.datasets_utils.preprocessing import get_label_stats
+from nemo.collections.nlp.utils import if_exist, get_vocab, label2idx, calc_class_weights, list2str, \
+    write_vocab_in_order
+
+
+class JointIntentSlotDataDesc:
+    """ Convert the raw data to the standard format supported by
+    JointIntentSlotDataset.
+
+    By default, the None label for slots is 'O'.
+
+    JointIntentSlotDataset requires two files:
+
+        input_file: file to sequence + label.
+            the first line is header (sentence [tab] label)
+            each line should be [sentence][tab][label]
+
+        slot_file: file to slot labels, each line corresponding to
+            slot labels for a sentence in input_file. No header.
+
+    To keep the mapping from label index to label consistent during
+    training and inferencing, we require the following files:
+        dict.intents.csv: each line is an intent. The first line
+            corresponding to the 0 intent label, the second line
+            corresponding to the 1 intent label, and so on.
+
+        dict.slots.csv: each line is a slot. The first line
+            corresponding to the 0 slot label, the second line
+            corresponding to the 1 slot label, and so on.
+
+    Args:
+        data_dir (str): the directory of the dataset
+        do_lower_case (bool): whether to set your dataset to lowercase
+        dataset_name (str): the name of the dataset. If it's a dataset
+            that follows the standard JointIntentSlotDataset format,
+            you can set the name as 'default'.
+        none_slot_label (str): the label for slots that aren't identified;
+            defaults to 'O'
+        pad_label (int): the int used for padding. If set to -1,
+             it'll be set to whatever the None label is.
+
+    """
+
+    def __init__(self, data_dir, do_lower_case=False, dataset_name='default', none_slot_label='O', pad_label=-1):
+        if dataset_name == 'atis':
+            self.data_dir = process_atis(data_dir, do_lower_case)
+        elif dataset_name == 'snips-atis':
+            self.data_dir, self.pad_label = merge(
+                data_dir, ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'], dataset_name
+            )
+        elif dataset_name == 'dialogflow':
+            self.data_dir = process_dialogflow(data_dir, do_lower_case)
+        elif dataset_name == 'mturk-processed':
+            self.data_dir = process_mturk(data_dir, do_lower_case)
+        elif dataset_name in set(['snips-light', 'snips-speak', 'snips-all']):
+            self.data_dir = process_snips(data_dir, do_lower_case)
+            if dataset_name.endswith('light'):
+                self.data_dir = f'{self.data_dir}/light'
+            elif dataset_name.endswith('speak'):
+                self.data_dir = f'{self.data_dir}/speak'
+            elif dataset_name.endswith('all'):
+                self.data_dir = f'{self.data_dir}/all'
+        elif dataset_name.startswith('jarvis'):
+            self.data_dir = process_jarvis_datasets(
+                data_dir, do_lower_case, dataset_name, modes=["train", "test", "eval"], ignore_prev_intent=False
+            )
+        else:
+            if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
+                raise FileNotFoundError(
+                    "Make sure that your data follows the standard format "
+                    "supported by JointIntentSlotDataset. Your data must "
+                    "contain dict.intents.csv and dict.slots.csv."
+                )
+            self.data_dir = data_dir
+
+        self.intent_dict_file = self.data_dir + '/dict.intents.csv'
+        self.slot_dict_file = self.data_dir + '/dict.slots.csv'
+        self.num_intents = len(get_vocab(self.intent_dict_file))
+        slots = label2idx(self.slot_dict_file)
+        self.num_slots = len(slots)
+
+        for mode in ['train', 'test', 'eval']:
+
+            if not if_exist(self.data_dir, [f'{mode}.tsv']):
+                logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.')
+                continue
+
+            slot_file = f'{self.data_dir}/{mode}_slots.tsv'
+            with open(slot_file, 'r') as f:
+                slot_lines = f.readlines()
+
+            input_file = f'{self.data_dir}/{mode}.tsv'
+            with open(input_file, 'r') as f:
+                input_lines = f.readlines()[1:]  # Skipping headers at index 0
+
+            if len(slot_lines) != len(input_lines):
+                raise ValueError(
+                    "Make sure that the number of slot lines match the "
+                    "number of intent lines. There should be a 1-1 "
+                    "correspondence between every slot and intent lines."
+                )
+
+            dataset = list(zip(slot_lines, input_lines))
+
+            raw_slots, queries, raw_intents = [], [], []
+            for slot_line, input_line in dataset:
+                slot_list = [int(slot) for slot in slot_line.strip().split()]
+                raw_slots.append(slot_list)
+                parts = input_line.strip().split()
+                raw_intents.append(int(parts[-1]))
+                queries.append(' '.join(parts[:-1]))
+
+            infold = input_file[: input_file.rfind('/')]
+
+            logging.info(f'Three most popular intents during {mode}ing')
+            total_intents, intent_label_freq = get_label_stats(raw_intents, infold + f'/{mode}_intent_stats.tsv')
+            merged_slots = itertools.chain.from_iterable(raw_slots)
+
+            logging.info(f'Three most popular slots during {mode}ing')
+            slots_total, slots_label_freq = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv')
+
+            if mode == 'train':
+                self.slot_weights = calc_class_weights(slots_label_freq)
+                logging.info(f'Slot weights are - {self.slot_weights}')
+
+                self.intent_weights = calc_class_weights(intent_label_freq)
+                logging.info(f'Intent weights are - {self.intent_weights}')
+
+            logging.info(f'Total intents - {total_intents}')
+            logging.info(f'Intent label frequency - {intent_label_freq}')
+            logging.info(f'Total Slots - {slots_total}')
+            logging.info(f'Slots label frequency - {slots_label_freq}')
+
+        if pad_label != -1:
+            self.pad_label = pad_label
+        else:
+            if none_slot_label not in slots:
+                raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.')
+            self.pad_label = slots[none_slot_label]
+
+
+def merge(data_dir, subdirs, dataset_name, modes=['train', 'test']):
+    """Merge the processed datasets in `subdirs` into `data_dir/dataset_name`.
+
+    Returns a tuple (output folder, index of the 'O' slot label).
+    """
+    outfold = f'{data_dir}/{dataset_name}'
+    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
+        logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold))
+        slots = get_vocab(f'{outfold}/dict.slots.csv')
+        none_slot = 0
+        for key in slots:
+            if slots[key] == 'O':
+                none_slot = key
+                break
+        return outfold, int(none_slot)
+
+    os.makedirs(outfold, exist_ok=True)
+    data_files, slot_files = {}, {}
+    intents, slots = {}, {}
+    intent_shift, slot_shift = 0, 0
+    none_intent, none_slot = -1, -1
+    try:
+        for mode in modes:
+            data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w')
+            data_files[mode].write('sentence\tlabel\n')
+            slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w')
+        for subdir in subdirs:
+            curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv')
+            curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv')
+            for key in curr_intents:
+                if intent_shift > 0 and curr_intents[key] == 'O':
+                    continue
+                if curr_intents[key] == 'O' and intent_shift == 0:
+                    none_intent = int(key)
+                intents[int(key) + intent_shift] = curr_intents[key]
+            for key in curr_slots:
+                if slot_shift > 0 and curr_slots[key] == 'O':
+                    continue
+                if slot_shift == 0 and curr_slots[key] == 'O':
+                    none_slot = int(key)
+                slots[int(key) + slot_shift] = curr_slots[key]
+            for mode in modes:
+                with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f:
+                    for line in f.readlines()[1:]:
+                        text, label = line.strip().split('\t')
+                        label = int(label)
+                        if curr_intents[label] == 'O':
+                            label = none_intent
+                        else:
+                            label = label + intent_shift
+                        data_files[mode].write(f'{text}\t{label}\n')
+                with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f:
+                    for line in f.readlines():
+                        labels = [int(label) for label in line.strip().split()]
+                        shifted_labels = []
+                        for label in labels:
+                            if curr_slots[label] == 'O':
+                                shifted_labels.append(none_slot)
+                            else:
+                                shifted_labels.append(label + slot_shift)
+                        slot_files[mode].write(list2str(shifted_labels) + '\n')
+            intent_shift += len(curr_intents)
+            slot_shift += len(curr_slots)
+    finally:
+        # Always close the merged output files so their contents are flushed to disk.
+        for handle in itertools.chain(data_files.values(), slot_files.values()):
+            handle.close()
+    write_vocab_in_order(intents, f'{outfold}/dict.intents.csv')
+    write_vocab_in_order(slots, f'{outfold}/dict.slots.csv')
+    return outfold, none_slot
\ No newline at end of file
diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/joint_intent_slot_dataset.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/joint_intent_slot_dataset.py
new file mode 100644
index 000000000000..2aad0f79b732
--- /dev/null
+++ b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/joint_intent_slot_dataset.py
@@ -0,0 +1,262 @@
+# Copyright 2018 The Google AI Language Team Authors and
+# The HuggingFace Inc. team.
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Datasets and feature extraction for joint intent and slot classification
+Some parts of this code were adapted from the HuggingFace library at
+https://github.com/huggingface/pytorch-pretrained-BERT
+"""
+import random
+
+import numpy as np
+from torch.utils.data import Dataset
+
+from nemo import logging
+from nemo.collections.nlp.data.datasets.datasets_utils.preprocessing import (
+    get_stats,
+)
+
+__all__ = ['BertJointIntentSlotDataset', 'BertJointIntentSlotInferDataset']
+
+
+def get_features(
+    queries,
+    max_seq_length,
+    tokenizer,
+    pad_label=128,
+    raw_slots=None,
+    ignore_extra_tokens=False,
+    ignore_start_end=False,
+):
+    all_subtokens = []
+    all_loss_mask = []
+    all_subtokens_mask = []
+    all_segment_ids = []
+    all_input_ids = []
+    all_input_mask = []
+    sent_lengths = []
+    all_slots = []
+
+    with_label = False
+    if raw_slots is not None:
+        with_label = True
+
+    for i, query in enumerate(queries):
+        words = query.strip().split()
+        subtokens = ['[CLS]']
+        loss_mask = [1 - ignore_start_end]
+        subtokens_mask = [0]
+        if with_label:
+            slots = [pad_label]
+
+        for j, word in enumerate(words):
+            word_tokens = tokenizer.tokenize(word)
+            subtokens.extend(word_tokens)
+
+            loss_mask.append(1)
+            loss_mask.extend([not ignore_extra_tokens] * (len(word_tokens) - 1))
+
+            subtokens_mask.append(1)
+            subtokens_mask.extend([0] * (len(word_tokens) - 1))
+
+            if with_label:
+                slots.extend([raw_slots[i][j]] * len(word_tokens))
+
+        subtokens.append('[SEP]')
+        loss_mask.append(not ignore_start_end)
+        subtokens_mask.append(0)
+        sent_lengths.append(len(subtokens))
+        all_subtokens.append(subtokens)
+        all_loss_mask.append(loss_mask)
+        all_subtokens_mask.append(subtokens_mask)
+        all_input_mask.append([1] * len(subtokens))
+        if with_label:
+            slots.append(pad_label)
+            all_slots.append(slots)
+
+    max_seq_length = min(max_seq_length, max(sent_lengths))
+    logging.info(f'Max length: {max_seq_length}')
+    get_stats(sent_lengths)
+    too_long_count = 0
+
+    for i, subtokens in enumerate(all_subtokens):
+        if len(subtokens) > max_seq_length:
+            subtokens = ['[CLS]'] + subtokens[-max_seq_length + 1 :]
+            all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :]
+            all_loss_mask[i] = [1 - ignore_start_end] + all_loss_mask[i][-max_seq_length + 1 :]
+            all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :]
+
+            if with_label:
+                all_slots[i] = [pad_label] + all_slots[i][-max_seq_length + 1 :]
+            too_long_count += 1
+
+        all_input_ids.append([tokenizer._convert_token_to_id(t) for t in subtokens])
+
+        if len(subtokens) < max_seq_length:
+            extra = max_seq_length - len(subtokens)
+            all_input_ids[i] = all_input_ids[i] + [0] * extra
+            all_loss_mask[i] = all_loss_mask[i] + [0] * extra
+            all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra
+            all_input_mask[i] = all_input_mask[i] + [0] * extra
+
+            if with_label:
+                all_slots[i] = all_slots[i] + [pad_label] * extra
+
+        all_segment_ids.append([0] * max_seq_length)
+
+    logging.info(f'{too_long_count} are longer than {max_seq_length}')
+
+    return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_slots)
+
+
+class BertJointIntentSlotDataset(Dataset):
+    """
+    Creates dataset to use for the task of joint intent
+    and slot classification with pretrained model.
+
+    Converts from raw data to an instance that can be used by
+    NMDataLayer.
+
+    For dataset to use during inference without labels, see
+    BertJointIntentSlotInferDataset.
+
+    Args:
+        input_file (str): file to sequence + label.
+            the first line is header (sentence [tab] label)
+            each line should be [sentence][tab][label]
+        slot_file (str): file to slot labels, each line corresponding to
+            slot labels for a sentence in input_file. No header.
+        max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP]
+        tokenizer (Tokenizer): such as BertTokenizer
+        num_samples (int): number of samples you want to use for the dataset.
+            If -1, use all dataset. Useful for testing.
+        shuffle (bool): whether to shuffle your data.
+        pad_label (int): pad value use for slot labels.
+            by default, it's the neutral label.
+
+    """
+
+    def __init__(
+        self,
+        input_file,
+        slot_file,
+        max_seq_length,
+        tokenizer,
+        num_samples=-1,
+        shuffle=True,
+        pad_label=128,
+        ignore_extra_tokens=False,
+        ignore_start_end=False,
+    ):
+        if num_samples == 0:
+            raise ValueError("num_samples has to be positive", num_samples)
+
+        with open(slot_file, 'r') as f:
+            slot_lines = f.readlines()
+
+        with open(input_file, 'r') as f:
+            input_lines = f.readlines()[1:]
+
+        assert len(slot_lines) == len(input_lines)
+
+        dataset = list(zip(slot_lines, input_lines))
+
+        if shuffle or num_samples > 0:
+            random.shuffle(dataset)
+        if num_samples > 0:
+            dataset = dataset[:num_samples]
+
+        raw_slots, queries, raw_intents = [], [], []
+        for slot_line, input_line in dataset:
+            raw_slots.append([int(slot) for slot in slot_line.strip().split()])
+            parts = input_line.strip().split()
+            raw_intents.append(int(parts[-1]))
+            queries.append(' '.join(parts[:-1]))
+
+        features = get_features(
+            queries,
+            max_seq_length,
+            tokenizer,
+            pad_label=pad_label,
+            raw_slots=raw_slots,
+            ignore_extra_tokens=ignore_extra_tokens,
+            ignore_start_end=ignore_start_end,
+        )
+        self.all_input_ids = features[0]
+        self.all_segment_ids = features[1]
+        self.all_input_mask = features[2]
+        self.all_loss_mask = features[3]
+        self.all_subtokens_mask = features[4]
+        self.all_slots = features[5]
+        self.all_intents = raw_intents
+
+    def __len__(self):
+        return len(self.all_input_ids)
+
+    def __getitem__(self, idx):
+        return (
+            np.array(self.all_input_ids[idx]),
+            np.array(self.all_segment_ids[idx]),
+            np.array(self.all_input_mask[idx], dtype=np.int64),  # np.long was removed in NumPy 1.24
+            np.array(self.all_loss_mask[idx]),
+            np.array(self.all_subtokens_mask[idx]),
+            self.all_intents[idx],
+            np.array(self.all_slots[idx]),
+        )
+
+
+class BertJointIntentSlotInferDataset(Dataset):
+    """
+    Creates dataset to use for the task of joint intent
+    and slot classification with pretrained model.
+
+    Converts from raw data to an instance that can be used by
+    NMDataLayer.
+
+    This is to be used during inference only.
+    For dataset to use during training with labels, see
+    BertJointIntentSlotDataset.
+
+    Args:
+        queries (list): list of queries to run inference on
+        max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP]
+        tokenizer (Tokenizer): such as BertTokenizer
+        pad_label (int): pad value use for slot labels.
+            by default, it's the neutral label.
+
+    """
+
+    def __init__(self, queries, max_seq_length, tokenizer):
+        features = get_features(queries, max_seq_length, tokenizer)
+
+        self.all_input_ids = features[0]
+        self.all_segment_ids = features[1]
+        self.all_input_mask = features[2]
+        self.all_loss_mask = features[3]
+        self.all_subtokens_mask = features[4]
+
+    def __len__(self):
+        return len(self.all_input_ids)
+
+    def __getitem__(self, idx):
+        return (
+            np.array(self.all_input_ids[idx]),
+            np.array(self.all_segment_ids[idx]),
+            np.array(self.all_input_mask[idx], dtype=np.int64),  # np.long was removed in NumPy 1.24
+            np.array(self.all_loss_mask[idx]),
+            np.array(self.all_subtokens_mask[idx]),
+        )
+
+
diff --git a/nemo/collections/nlp/data/datasets/lm_bert_dataset.py b/nemo/collections/nlp/data/datasets/lm_bert_dataset.py
index f068e614afa3..2d50f4e1e4f7 100644
--- a/nemo/collections/nlp/data/datasets/lm_bert_dataset.py
+++ b/nemo/collections/nlp/data/datasets/lm_bert_dataset.py
@@ -29,10 +29,12 @@
 from tqdm import tqdm
 
 from nemo import logging
-from nemo.collections.nlp.data.datasets.lm_transformer_dataset import create_vocab_mlm
 
 __all__ = ['BertPretrainingDataset', 'BertPretrainingPreprocessedDataset']
 
+from nemo.collections.nlp.data import DATABASE_EXISTS_TMP
+from nemo.collections.nlp.utils import if_exist
+
 
 class BertPretrainingDataset(Dataset):
     def __init__(
@@ -394,3 +396,57 @@ def __init__(self, dataset_name, data_dir, vocab_size, sample_size, special_toke
         self.train_file = f'{data_dir}/train.txt'
         self.eval_file = f'{data_dir}/valid.txt'
         self.test_file = f'{data_dir}/test.txt'
+
+
+def create_vocab_mlm(
+    data_dir, vocab_size, sample_size, special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], train_file=''
+):
+    """Train a SentencePiece tokenizer on the text in `data_dir` and write a BERT-style `vocab.txt`.
+
+    Both files go to `{data_dir}/bert`; if `if_exist` already finds `tokenizer.model` there, nothing is redone.
+    With an empty `train_file`, all `{data_dir}/*.txt` files are merged into one training corpus first.
+    `vocab_size` includes `special_tokens`, which head the vocabulary. Returns `(data_dir, tokenizer.model path)`.
+    """
+    # Imported locally so the function is self-contained after its move into this module.
+    import glob
+    import os
+    from sentencepiece import SentencePieceTrainer as SPT
+
+    vocab = special_tokens[:]
+    bert_dir = f'{data_dir}/bert'
+    if if_exist(bert_dir, ['tokenizer.model']):
+        logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir))
+        return data_dir, f'{bert_dir}/tokenizer.model'
+    logging.info(f'Processing WikiText dataset and store at {bert_dir}')
+    os.makedirs(bert_dir, exist_ok=True)
+
+    if not train_file:
+        files = glob.glob(f'{data_dir}/*.txt')
+        train_file = f'{bert_dir}/merged.txt'
+        logging.info(f"Merging {len(files)} txt files into {train_file}")
+        with open(train_file, "w") as merged:
+            for file in tqdm(files):
+                with open(file, 'r') as inf:
+                    merged.write(inf.read().strip() + '\n\n\n')
+    else:
+        train_file = f'{data_dir}/{train_file}'
+
+    SPT.Train(
+        f"--input={train_file} --model_prefix={bert_dir}/tokenizer "
+        f"--vocab_size={vocab_size - len(vocab)} "
+        f"--input_sentence_size={sample_size} "
+        f"--shuffle_input_sentence=true --hard_vocab_limit=false "
+        f"--bos_id=-1 --eos_id=-1"
+    )
+
+    # SentencePiece prefixes word-initial pieces with "▁"; other pieces get BERT's "##" continuation mark.
+    with open(f"{bert_dir}/tokenizer.vocab", "r", encoding="utf-8") as f:
+        f.readline()  # skip first <unk> token
+        for line in f:
+            piece = line.split("\t")[0]
+            vocab.append(piece[1:] if piece.startswith("▁") else f"##{piece}")
+
+    with open(f'{bert_dir}/vocab.txt', "w", encoding="utf-8") as f:
+        for token in vocab:
+            f.write(f"{token}\n")  # no trailing .format(): it raised on pieces containing "{" or "}"
+    return data_dir, f'{bert_dir}/tokenizer.model'
\ No newline at end of file
diff --git a/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py b/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py
index 5d8f20723c6e..7d2075f82fea 100644
--- a/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py
+++ b/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py
@@ -15,18 +15,15 @@
 # =============================================================================
 
 """Pytorch Dataset for training Neural Machine Translation."""
-import glob
 import os
-import pickle
 import re
 
 import numpy as np
-from sentencepiece import SentencePieceTrainer as SPT
 from torch.utils.data import Dataset
-from tqdm import tqdm
 
 from nemo import logging
-from nemo.collections.nlp.data.datasets.datasets_utils import DATABASE_EXISTS_TMP, download_wkt2
+from nemo.collections.nlp.data.datasets.datasets_utils import download_wkt2
+from nemo.collections.nlp.data.datasets.datasets_utils.datasets_processing import dataset_to_ids
 from nemo.collections.nlp.utils.common_nlp_utils import if_exist
 
 __all__ = ['LanguageModelingDataset']
@@ -66,122 +63,33 @@ def __init__(self, dataset_name, data_dir, do_lower_case):
                 "you build the preprocessing method for it."
             )
 
-
-def create_vocab_mlm(
-    data_dir, vocab_size, sample_size, special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], train_file=''
-):
-    vocab = special_tokens[:]
-    bert_dir = f'{data_dir}/bert'
-    if if_exist(bert_dir, ['tokenizer.model']):
-        logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir))
-        return data_dir, f'{bert_dir}/tokenizer.model'
-    logging.info(f'Processing WikiText dataset and store at {bert_dir}')
-    os.makedirs(bert_dir, exist_ok=True)
-
-    if not train_file:
-        files = glob.glob(f'{data_dir}/*.txt')
-        train_file = f'{bert_dir}/merged.txt'
-        logging.info(f"Merging {len(files)} txt files into {train_file}")
-
-        with open(train_file, "w") as merged:
-            for file in tqdm(files):
-                with open(file, 'r') as inf:
-                    content = inf.read().strip()
-                merged.write(content + '\n\n\n')
-    else:
-        train_file = f'{data_dir}/{train_file}'
-
-    cmd = (
-        f"--input={train_file} --model_prefix={bert_dir}/tokenizer "
-        f"--vocab_size={vocab_size - len(vocab)} "
-        f"--input_sentence_size={sample_size} "
-        f"--shuffle_input_sentence=true --hard_vocab_limit=false "
-        f"--bos_id=-1 --eos_id=-1"
-    )
-    SPT.Train(cmd)
-
-    # Add BERT control symbols
-    tokens = []
-
-    with open(f"{bert_dir}/tokenizer.vocab", "r") as f:
-        f.readline()  # skip first <unk> token
-
-        # Read tokens from each line and parse for vocab
-        for line in f:
-            piece = line.split("\t")[0]
-            token = piece[1:] if piece.startswith("▁") else f"##{piece}"
-            tokens.append(token)
-
-    vocab.extend(tokens)
-
-    # Save vocabulary to output file
-    with open(f'{bert_dir}/vocab.txt', "w") as f:
-        for token in vocab:
-            f.write(f"{token}\n".format())
-    return data_dir, f'{bert_dir}/tokenizer.model'
-
-
-def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True):
-    """
-    Reads dataset from file line by line, tokenizes each line with tokenizer,
-    and returns list of lists which corresponds to ids of tokenized strings.
-
-    Args:
-        dataset: path to dataset
-        tokenizer: tokenizer to convert text into ids
-        cache_ids: if True, ids are saved to disk as pickle file
-            with similar name (e.g., data.txt --> data.txt.pkl)
-        add_bos_eos: bool, whether to add <s> and </s> symbols (e.g., for NMT)
-    Returns:
-        ids: list of ids which correspond to tokenized strings of the dataset
-    """
-
-    cached_ids_dataset = dataset + str(".pkl")
-    if os.path.isfile(cached_ids_dataset):
-        logging.info("Loading cached tokenized dataset ...")
-        ids = pickle.load(open(cached_ids_dataset, "rb"))
-    else:
-        logging.info("Tokenizing dataset ...")
-        data = open(dataset, "rb").readlines()
-        ids = []
-        for sentence in data:
-            sent_ids = tokenizer.text_to_ids(sentence.decode("utf-8"))
-            if add_bos_eos:
-                sent_ids = [tokenizer.bos_id] + sent_ids + [tokenizer.eos_id]
-            ids.append(sent_ids)
-        if cache_ids:
-            logging.info("Caching tokenized dataset ...")
-            pickle.dump(ids, open(cached_ids_dataset, "wb"))
-    return ids
-
-
-def create_vocab_lm(data_dir, do_lower_case):
-    if if_exist(data_dir, ['train.txt', 'vocab.txt']):
-        logging.info("Vocabulary has been created.")
-        with open(os.path.join(data_dir, 'vocab.txt'), 'r') as f:
-            vocab_size = len(f.readlines())
-        return vocab_size
-
-    logging.info(f'Creating vocabulary from training data at {data_dir}')
-
-    with open(f'{data_dir}/train.txt', 'r') as f:
-        txt = f.read()
-    if do_lower_case:
-        txt = txt.lower()
-    lines = re.split(r'[\n]', txt)
-    sentences = [line.strip().split() for line in lines if line.strip()]
-
-    vocab = {"[PAD]": 0, "[SEP]": 1, "[CLS]": 2, "[MASK]": 3}
-    idx = 4
-    for sentence in sentences:
-        for word in sentence:
-            if word not in vocab:
-                vocab[word] = idx
-                idx += 1
-
-    with open(f'{data_dir}/vocab.txt', 'w') as f:
-        for word in sorted(vocab.keys()):
-            f.write(word + '\n')
-    logging.info(f"Created vocabulary of size {len(vocab)}")
-
-    return len(vocab)
+
+def create_vocab_lm(data_dir, do_lower_case):
+    """Create `{data_dir}/vocab.txt` from the words in `{data_dir}/train.txt` and return the vocabulary size.
+
+    When `if_exist` reports both files present, the existing vocabulary is kept and its line count returned.
+    Otherwise the training text (lower-cased if `do_lower_case`) is split on whitespace, and its distinct
+    words plus the [PAD], [SEP], [CLS] and [MASK] tokens are written one per line in sorted order.
+    """
+    if if_exist(data_dir, ['train.txt', 'vocab.txt']):
+        logging.info("Vocabulary has been created.")
+        with open(os.path.join(data_dir, 'vocab.txt'), 'r') as f:
+            vocab_size = len(f.readlines())
+        return vocab_size
+
+    logging.info(f'Creating vocabulary from training data at {data_dir}')
+
+    with open(f'{data_dir}/train.txt', 'r') as f:
+        txt = f.read()
+    if do_lower_case:
+        txt = txt.lower()
+
+    # Only membership matters: the file is written from the sorted words, so no per-word index is kept.
+    vocab = {"[PAD]", "[SEP]", "[CLS]", "[MASK]"}
+    vocab.update(txt.split())
+
+    with open(f'{data_dir}/vocab.txt', 'w') as f:
+        for word in sorted(vocab):
+            f.write(word + '\n')
+    logging.info(f"Created vocabulary of size {len(vocab)}")
+    return len(vocab)
diff --git a/nemo/collections/nlp/data/datasets/machine_translation_dataset.py b/nemo/collections/nlp/data/datasets/machine_translation_dataset.py
index db8e6b7ace2d..e3f2bfd8e102 100644
--- a/nemo/collections/nlp/data/datasets/machine_translation_dataset.py
+++ b/nemo/collections/nlp/data/datasets/machine_translation_dataset.py
@@ -21,7 +21,7 @@
 import numpy as np
 from torch.utils.data import Dataset
 
-from nemo.collections.nlp.data.datasets.lm_transformer_dataset import dataset_to_ids
+from nemo.collections.nlp.data.datasets.datasets_utils.datasets_processing import dataset_to_ids
 
 __all__ = ['TranslationDataset']
 
@@ -157,34 +157,34 @@ def pack_data_into_batches(self, src_ids, tgt_ids):
         return batches
 
 
-def clean_src_and_target(src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5):
-    """
-    Cleans source and target sentences to get rid of noisy data.
-    Specifically, a pair of sentences is removed if
-      -- either source or target is longer than *max_tokens*
-      -- either source or target is shorter than *min_tokens*
-      -- absolute difference between source and target is larger than
-         *max_tokens_diff*
-      -- one sentence is *max_tokens_ratio* times longer than the other
-    """
-
-    if len(src_ids) != len(tgt_ids):
-        raise ValueError("Source and target corpora have different lengths!")
-    src_ids_, tgt_ids_ = [], []
-    for i in range(len(src_ids)):
-        src_len, tgt_len = len(src_ids[i]), len(tgt_ids[i])
-        if (
-            src_len > max_tokens
-            or tgt_len > max_tokens
-            or src_len < min_tokens
-            or tgt_len < min_tokens
-            or (src_ids[i] == tgt_ids[i])
-            or np.abs(src_len - tgt_len) > max_tokens_diff
-        ):
-            continue
-        ratio = max(src_len - 2, 1) / max(tgt_len - 2, 1)
-        if ratio > max_tokens_ratio or ratio < (1 / max_tokens_ratio):
-            continue
-        src_ids_.append(src_ids[i])
-        tgt_ids_.append(tgt_ids[i])
-    return src_ids_, tgt_ids_
+def clean_src_and_target(src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5):
+    """
+    Cleans source and target sentences to get rid of noisy data.
+    Specifically, a pair of sentences is removed if
+      -- either source or target is longer than *max_tokens*
+      -- either source or target is shorter than *min_tokens*
+      -- source and target are identical
+      -- absolute difference between source and target is larger than
+         *max_tokens_diff*
+      -- one sentence is *max_tokens_ratio* times longer than the other
+         (after subtracting 2 tokens from each length, floored at 1)
+
+    Returns the surviving (src_ids, tgt_ids) as two new lists, in input order.
+    Raises ValueError if the two corpora differ in length.
+    """
+
+    if len(src_ids) != len(tgt_ids):
+        raise ValueError("Source and target corpora have different lengths!")
+    src_ids_, tgt_ids_ = [], []
+    for src, tgt in zip(src_ids, tgt_ids):
+        src_len, tgt_len = len(src), len(tgt)
+        if not (min_tokens <= src_len <= max_tokens and min_tokens <= tgt_len <= max_tokens):
+            continue
+        if src == tgt or abs(src_len - tgt_len) > max_tokens_diff:
+            continue
+        ratio = max(src_len - 2, 1) / max(tgt_len - 2, 1)
+        if ratio > max_tokens_ratio or ratio < (1 / max_tokens_ratio):
+            continue
+        src_ids_.append(src)
+        tgt_ids_.append(tgt)
+    return src_ids_, tgt_ids_
diff --git a/nemo/collections/nlp/data/datasets/qa_squad_dataset.py b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_dataset.py
similarity index 67%
rename from nemo/collections/nlp/data/datasets/qa_squad_dataset.py
rename to nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_dataset.py
index b927f83ead38..0eaeba4528fe 100644
--- a/nemo/collections/nlp/data/datasets/qa_squad_dataset.py
+++ b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_dataset.py
@@ -26,7 +26,8 @@
 from tqdm import tqdm
 
 from nemo import logging
-from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import DataProcessor
+from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.data_processors import DataProcessor
+from nemo.collections.nlp.data.datasets.qa_squad_dataset.qa_squad_processing import convert_examples_to_features
 from nemo.collections.nlp.metrics.squad_metrics import (
     _get_best_indexes,
     apply_no_ans_threshold,
@@ -403,177 +404,6 @@ def evaluate(
         return exact_match, f1, all_predictions
 
 
-def convert_examples_to_features(
-    examples, tokenizer, max_seq_length, doc_stride, max_query_length, has_groundtruth,
-):
-    """Loads a data file into a list of `InputBatch`s."""
-
-    unique_id = 1000000000
-
-    features = []
-    for (example_index, example) in enumerate(examples):
-        query_tokens = tokenizer.text_to_tokens(example.question_text)
-
-        if len(query_tokens) > max_query_length:
-            query_tokens = query_tokens[0:max_query_length]
-
-        # context: index of token -> index of word
-        tok_to_orig_index = []
-        # context: index of word -> index of first token in token list
-        orig_to_tok_index = []
-        # context without white spaces after tokenization
-        all_doc_tokens = []
-        # doc tokens is word separated context
-        for (i, token) in enumerate(example.doc_tokens):
-            orig_to_tok_index.append(len(all_doc_tokens))
-            sub_tokens = tokenizer.text_to_tokens(token)
-            for sub_token in sub_tokens:
-                tok_to_orig_index.append(i)
-                all_doc_tokens.append(sub_token)
-
-        # idx of query token start and end in context
-        tok_start_position = None
-        tok_end_position = None
-        if has_groundtruth and example.is_impossible:
-            tok_start_position = -1
-            tok_end_position = -1
-        if has_groundtruth and not example.is_impossible:
-            tok_start_position = orig_to_tok_index[example.start_position]
-            if example.end_position < len(example.doc_tokens) - 1:
-                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
-            else:
-                tok_end_position = len(all_doc_tokens) - 1
-
-            (tok_start_position, tok_end_position) = _improve_answer_span(
-                all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
-            )
-
-        # The -3 accounts for tokenizer.cls_token, tokenizer.sep_token and tokenizer.eos_token
-        # doc_spans contains all possible contexts options of given length
-        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-        _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
-        doc_spans = []
-        start_offset = 0
-        while start_offset < len(all_doc_tokens):
-            length = len(all_doc_tokens) - start_offset
-            if length > max_tokens_for_doc:
-                length = max_tokens_for_doc
-            doc_spans.append(_DocSpan(start=start_offset, length=length))
-            if start_offset + length == len(all_doc_tokens):
-                break
-            start_offset += min(length, doc_stride)
-
-        for (doc_span_index, doc_span) in enumerate(doc_spans):
-            tokens = []
-            # maps context tokens idx in final input -> word idx in context
-            token_to_orig_map = {}
-            token_is_max_context = {}
-            segment_ids = []
-            tokens.append(tokenizer.bos_token)
-            segment_ids.append(0)
-            for token in query_tokens:
-                tokens.append(token)
-                segment_ids.append(0)
-            tokens.append(tokenizer.sep_token)
-            segment_ids.append(0)
-
-            for i in range(doc_span.length):
-                split_token_index = doc_span.start + i
-                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
-
-                is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index)
-                token_is_max_context[len(tokens)] = is_max_context
-                tokens.append(all_doc_tokens[split_token_index])
-                segment_ids.append(1)
-            tokens.append(tokenizer.eos_token)
-            segment_ids.append(1)
-
-            input_ids = tokenizer.tokens_to_ids(tokens)
-
-            # The mask has 1 for real tokens and 0 for padding tokens.
-            # Only real tokens are attended to.
-            input_mask = [1] * len(input_ids)
-
-            # Zero-pad up to the sequence length.
-            while len(input_ids) < max_seq_length:
-                input_ids.append(tokenizer.pad_id)
-                input_mask.append(0)
-                segment_ids.append(0)
-
-            assert len(input_ids) == max_seq_length
-            assert len(input_mask) == max_seq_length
-            assert len(segment_ids) == max_seq_length
-
-            # calculate start and end position in final array
-            # of tokens in answer if no answer,
-            # 0 for both pointing to tokenizer.cls_token
-            start_position = None
-            end_position = None
-            if has_groundtruth and not example.is_impossible:
-                doc_start = doc_span.start
-                doc_end = doc_span.start + doc_span.length - 1
-                out_of_span = False
-                if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
-                    out_of_span = True
-                if out_of_span:
-                    start_position = 0
-                    end_position = 0
-                else:
-                    doc_offset = len(query_tokens) + 2
-                    start_position = tok_start_position - doc_start + doc_offset
-                    end_position = tok_end_position - doc_start + doc_offset
-            if has_groundtruth and example.is_impossible:
-                # if our document chunk does not contain
-                # an annotation we throw it out, since there is nothing
-                # to predict.
-                start_position = 0
-                end_position = 0
-
-            if example_index < 1:
-                logging.info("*** Example ***")
-                logging.info("unique_id: %s" % (unique_id))
-                logging.info("example_index: %s" % (example_index))
-                logging.info("doc_span_index: %s" % (doc_span_index))
-                logging.info("tokens: %s" % " ".join(tokens))
-                logging.info(
-                    "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])
-                )
-                logging.info(
-                    "token_is_max_context: %s"
-                    % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()])
-                )
-                logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-                logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-                logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
-                if has_groundtruth and example.is_impossible:
-                    logging.info("impossible example")
-                if has_groundtruth and not example.is_impossible:
-                    answer_text = " ".join(tokens[start_position : (end_position + 1)])
-                    logging.info("start_position: %d" % (start_position))
-                    logging.info("end_position: %d" % (end_position))
-                    logging.info("answer: %s" % (answer_text))
-
-            features.append(
-                InputFeatures(
-                    unique_id=unique_id,
-                    example_index=example_index,
-                    doc_span_index=doc_span_index,
-                    tokens=tokens,
-                    token_to_orig_map=token_to_orig_map,
-                    token_is_max_context=token_is_max_context,
-                    input_ids=input_ids,
-                    input_mask=input_mask,
-                    segment_ids=segment_ids,
-                    start_position=start_position,
-                    end_position=end_position,
-                    is_impossible=example.is_impossible,
-                )
-            )
-            unique_id += 1
-
-    return features
-
-
 class InputFeatures(object):
     """A single set of features of data."""
 
@@ -738,77 +568,3 @@ def __init__(
             ]
 
 
-def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
-    """Returns tokenized answer spans that
-    better match the annotated answer."""
-    tok_answer_text = " ".join(tokenizer.text_to_tokens(orig_answer_text))
-
-    for new_start in range(input_start, input_end + 1):
-        for new_end in range(input_end, new_start - 1, -1):
-            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
-            if text_span == tok_answer_text:
-                return (new_start, new_end)
-
-    return (input_start, input_end)
-
-
-def _check_is_max_context(doc_spans, cur_span_index, position):
-    """Check if this is the 'max context' doc span for the token."""
-    best_score = None
-    best_span_index = None
-    for (span_index, doc_span) in enumerate(doc_spans):
-        end = doc_span.start + doc_span.length - 1
-        if position < doc_span.start:
-            continue
-        if position > end:
-            continue
-        num_left_context = position - doc_span.start
-        num_right_context = end - position
-        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
-        if best_score is None or score > best_score:
-            best_score = score
-            best_span_index = span_index
-
-    return cur_span_index == best_span_index
-
-
-def check_is_max_context(doc_spans, cur_span_index, position):
-    """Check if this is the 'max context' doc span for the token.
-
-    Because of the sliding window approach taken to scoring documents,
-    a single token can appear in multiple documents.
-
-    Example:
-        Doc: the man went to the store and bought a gallon of milk
-        Span A: the man went to the
-        Span B: to the store and bought
-        Span C: and bought a gallon of
-        ...
-
-    Now the word 'bought' will have two scores from spans B and C. We only
-    want to consider the score with "maximum context", which we define as
-    the *minimum* of its left and right context (the *sum* of left and
-    right context will always be the same, of course).
-
-    In the example the maximum context for 'bought' would be span C since
-    it has 1 left context and 3 right context, while span B has 4 left context
-    and 0 right context.
-
-    Code adapted from the code by the Google AI and HuggingFace.
-    """
-    best_score = None
-    best_span_index = None
-    for (span_index, doc_span) in enumerate(doc_spans):
-        end = doc_span.start + doc_span.length - 1
-        if position < doc_span.start:
-            continue
-        if position > end:
-            continue
-        num_left_context = position - doc_span.start
-        num_right_context = end - position
-        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
-        if best_score is None or score > best_score:
-            best_score = score
-            best_span_index = span_index
-
-    return cur_span_index == best_span_index
diff --git a/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_processing.py b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_processing.py
new file mode 100644
index 000000000000..7f8a84fbff2d
--- /dev/null
+++ b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_processing.py
@@ -0,0 +1,231 @@
+import collections
+
+from nemo import logging
+from nemo.collections.nlp.data.datasets.qa_squad_dataset.qa_squad_dataset import InputFeatures
+
+
+def convert_examples_to_features(
+    examples, tokenizer, max_seq_length, doc_stride, max_query_length, has_groundtruth,
+):
+    """Convert question-answering examples into a list of `InputFeatures`.
+
+    Every context is cut into overlapping doc spans so that query plus span
+    fit into `max_seq_length`; one padded feature is built per doc span.
+    `tokenizer` must provide `text_to_tokens`, `tokens_to_ids`, `bos_token`,
+    `sep_token`, `eos_token` and `pad_id`. Queries are truncated to
+    `max_query_length` tokens and consecutive spans start `doc_stride`
+    tokens apart. Answer positions are only set when `has_groundtruth`.
+
+    Raises:
+        ValueError: if the doc-span window cannot advance (no room left for
+            context tokens, or `doc_stride` is not positive).
+    """
+    # Defined once up front; building a namedtuple class per example is wasted work.
+    _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
+    unique_id = 1000000000
+
+    features = []
+    for (example_index, example) in enumerate(examples):
+        query_tokens = tokenizer.text_to_tokens(example.question_text)[:max_query_length]
+
+        # context: index of token -> index of word
+        tok_to_orig_index = []
+        # context: index of word -> index of first token in token list
+        orig_to_tok_index = []
+        # context without white spaces after tokenization
+        all_doc_tokens = []
+        # doc tokens is word separated context
+        for (i, token) in enumerate(example.doc_tokens):
+            orig_to_tok_index.append(len(all_doc_tokens))
+            sub_tokens = tokenizer.text_to_tokens(token)
+            tok_to_orig_index.extend([i] * len(sub_tokens))
+            all_doc_tokens.extend(sub_tokens)
+
+        # idx of answer start and end token in the tokenized context
+        tok_start_position = None
+        tok_end_position = None
+        if has_groundtruth and example.is_impossible:
+            tok_start_position = -1
+            tok_end_position = -1
+        elif has_groundtruth:
+            tok_start_position = orig_to_tok_index[example.start_position]
+            if example.end_position < len(example.doc_tokens) - 1:
+                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
+            else:
+                tok_end_position = len(all_doc_tokens) - 1
+
+            (tok_start_position, tok_end_position) = _improve_answer_span(
+                all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
+            )
+
+        # The -3 accounts for tokenizer.bos_token, tokenizer.sep_token and tokenizer.eos_token
+        # doc_spans contains all possible contexts options of given length
+        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
+        doc_spans = []
+        start_offset = 0
+        while start_offset < len(all_doc_tokens):
+            length = min(len(all_doc_tokens) - start_offset, max_tokens_for_doc)
+            doc_spans.append(_DocSpan(start=start_offset, length=length))
+            if start_offset + length == len(all_doc_tokens):
+                break
+            step = min(length, doc_stride)
+            if step <= 0:
+                # A window that does not advance would loop forever while growing doc_spans.
+                raise ValueError(
+                    "Cannot split context into doc spans: max_tokens_for_doc=%s, doc_stride=%s; "
+                    "increase max_seq_length, lower max_query_length or use a positive doc_stride"
+                    % (max_tokens_for_doc, doc_stride)
+                )
+            start_offset += step
+
+        for (doc_span_index, doc_span) in enumerate(doc_spans):
+            # layout: bos_token, query tokens, sep_token, context window, eos_token
+            tokens = [tokenizer.bos_token] + list(query_tokens) + [tokenizer.sep_token]
+            segment_ids = [0] * len(tokens)
+            # maps context tokens idx in final input -> word idx in context
+            token_to_orig_map = {}
+            token_is_max_context = {}
+
+            for i in range(doc_span.length):
+                split_token_index = doc_span.start + i
+                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
+
+                is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index)
+                token_is_max_context[len(tokens)] = is_max_context
+                tokens.append(all_doc_tokens[split_token_index])
+                segment_ids.append(1)
+            tokens.append(tokenizer.eos_token)
+            segment_ids.append(1)
+
+            input_ids = tokenizer.tokens_to_ids(tokens)
+
+            # The mask has 1 for real tokens and 0 for padding tokens.
+            # Only real tokens are attended to.
+            input_mask = [1] * len(input_ids)
+
+            # Pad up to the sequence length.
+            num_padding = max_seq_length - len(input_ids)
+            input_ids += [tokenizer.pad_id] * num_padding
+            input_mask += [0] * num_padding
+            segment_ids += [0] * num_padding
+
+            assert len(input_ids) == max_seq_length
+            assert len(input_mask) == max_seq_length
+            assert len(segment_ids) == max_seq_length
+
+            # Answer positions inside this feature: None without ground truth,
+            # 0 (the leading bos_token) when the example is unanswerable or the
+            # answer is not fully contained in this doc span.
+            start_position = None
+            end_position = None
+            if has_groundtruth:
+                doc_start = doc_span.start
+                doc_end = doc_span.start + doc_span.length - 1
+                if not example.is_impossible and doc_start <= tok_start_position and tok_end_position <= doc_end:
+                    # +2 skips the bos_token and sep_token that surround the query
+                    doc_offset = len(query_tokens) + 2
+                    start_position = tok_start_position - doc_start + doc_offset
+                    end_position = tok_end_position - doc_start + doc_offset
+                else:
+                    start_position = 0
+                    end_position = 0
+
+            if example_index < 1:
+                logging.info("*** Example ***")
+                logging.info("unique_id: %s" % (unique_id))
+                logging.info("example_index: %s" % (example_index))
+                logging.info("doc_span_index: %s" % (doc_span_index))
+                logging.info("tokens: %s" % " ".join(tokens))
+                logging.info(
+                    "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])
+                )
+                logging.info(
+                    "token_is_max_context: %s"
+                    % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()])
+                )
+                logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+                logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+                logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+                if has_groundtruth and example.is_impossible:
+                    logging.info("impossible example")
+                if has_groundtruth and not example.is_impossible:
+                    answer_text = " ".join(tokens[start_position : (end_position + 1)])
+                    logging.info("start_position: %d" % (start_position))
+                    logging.info("end_position: %d" % (end_position))
+                    logging.info("answer: %s" % (answer_text))
+
+            features.append(
+                InputFeatures(
+                    unique_id=unique_id,
+                    example_index=example_index,
+                    doc_span_index=doc_span_index,
+                    tokens=tokens,
+                    token_to_orig_map=token_to_orig_map,
+                    token_is_max_context=token_is_max_context,
+                    input_ids=input_ids,
+                    input_mask=input_mask,
+                    segment_ids=segment_ids,
+                    start_position=start_position,
+                    end_position=end_position,
+                    is_impossible=example.is_impossible,
+                )
+            )
+            unique_id += 1
+
+    return features
+
+
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
+    """Shrink the token span [input_start, input_end] to the sub-span whose
+    joined tokens equal the tokenized answer text, if such a sub-span exists;
+    otherwise the input span is returned unchanged."""
+    target = " ".join(tokenizer.text_to_tokens(orig_answer_text))
+    span_stop = input_end + 1
+
+    for begin in range(input_start, span_stop):
+        for stop in reversed(range(begin, span_stop)):
+            if " ".join(doc_tokens[begin : stop + 1]) == target:
+                return (begin, stop)
+    return (input_start, input_end)
+
+
+def _check_is_max_context(doc_spans, cur_span_index, position):
+    """Tell whether span `cur_span_index` gives token `position` the most context.
+
+    Doc spans come from a sliding window, so one token may be covered by
+    several spans and would then be scored several times.
+
+    Example:
+        Doc: the man went to the store and bought a gallon of milk
+        Span A: the man went to the
+        Span B: to the store and bought
+        Span C: and bought a gallon of
+        ...
+
+    'bought' is covered by spans B and C. Only the span with "maximum
+    context" should be used for it, where the context of a span is the
+    *minimum* of the number of tokens to the left and to the right of the
+    token inside that span. Span C wins here (1 left, 3 right) over span B
+    (4 left, 0 right). A small bonus proportional to the span length is added
+    to the score; on equal scores the earliest span wins.
+
+    Returns whether `cur_span_index` is the index of that best span.
+
+    Code adapted from the code by the Google AI and HuggingFace.
+    """
+    # (score, span index) for every span that covers `position`
+    candidates = []
+    for idx, span in enumerate(doc_spans):
+        last = span.start + span.length - 1
+        if not (span.start <= position <= last):
+            continue
+        left_context = position - span.start
+        right_context = last - position
+        candidates.append((min(left_context, right_context) + 0.01 * span.length, idx))
+
+    best_span_index = None
+    if candidates:
+        # max() keeps the first of equally scored candidates, i.e. the earliest span
+        best_span_index = max(candidates, key=lambda candidate: candidate[0])[1]
+
+    return cur_span_index == best_span_index
\ No newline at end of file