diff --git a/CHANGELOG.md b/CHANGELOG.md index 105d67e44600..a9285d147fec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -85,6 +85,13 @@ files, along with unit tests, examples and tutorials ([PR #375](https://github.com/NVIDIA/NeMo/pull/375)) - @titu1994 ### Changed +- Refactoring of `nemo_nlp` collections: +([PR #368](https://github.com/NVIDIA/NeMo/pull/368)) - @VahidooX, @yzhang123, @ekmb + - renaming and restructuring of files, folder, and functions in `nemo_nlp` + - losses cleaned up. LossAggregatorNM moved to nemo/backends/pytorch/common/losses + ([PR #316](https://github.com/NVIDIA/NeMo/pull/316)) - @VahidooX, @yzhang123, @ekmb + - renaming and restructuring of files, folder, and functions in `nemo_nlp` + - Updated licenses - All collections changed to use New Neural Type System. ([PR #307](https://github.com/NVIDIA/NeMo/pull/307)) - @okuchaiev - Additional Collections Repositories merged into core `nemo_toolkit` package. @@ -95,10 +102,6 @@ files, along with unit tests, examples and tutorials ([PR #286](https://github.com/NVIDIA/NeMo/pull/286)) - @stasbel - Major cleanup of Neural Module constructors (init), aiming at increasing the framework robustness: cleanup of NeuralModule initialization logic, refactor of trainer/actions (getting rid of local_params), fixes of several examples and unit tests, extraction and storing of intial parameters (init_params). ([PR #309](https://github.com/NVIDIA/NeMo/pull/309)) - @tkornuta-nvidia -- Refactoring of `nemo_nlp` collections: -([PR #316](https://github.com/NVIDIA/NeMo/pull/316)) - @VahidooX, @yzhang123, @ekmb - - renaming of files and restructuring of folder in `nemo_nlp` - - Updated licenses - Updated nemo's use of the logging library. from nemo import logging is now the reccomended way of using the nemo logger. neural_factory.logger and all other instances of logger are now deprecated and planned for removal in the next version. Please see PR 267 for complete change information. 
([PR #267](https://github.com/NVIDIA/NeMo/pull/267), [PR #283](https://github.com/NVIDIA/NeMo/pull/283), [PR #305](https://github.com/NVIDIA/NeMo/pull/305), [PR #311](https://github.com/NVIDIA/NeMo/pull/311)) - @blisc - Changed Distributed Data Parallel from Apex to Torch diff --git a/Jenkinsfile b/Jenkinsfile index 5512971ba8d1..8e6955647162 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -91,19 +91,21 @@ pipeline { parallel { stage ('Text Classification with BERT Test') { steps { - sh 'cd examples/nlp/text_classification && CUDA_VISIBLE_DEVICES=0 python text_classification_with_bert.py --num_epochs=1 --max_seq_length=50 --dataset_name=jarvis --data_dir=/home/TestData/nlp/retail/ --eval_file_prefix=eval --batch_size=10 --num_train_samples=-1 --do_lower_case --shuffle_data --work_dir=outputs' + sh 'cd examples/nlp/text_classification && CUDA_VISIBLE_DEVICES=0 python text_classification_with_bert.py --pretrained_bert_model bert-base-uncased --num_epochs=1 --max_seq_length=50 --dataset_name=jarvis --data_dir=/home/TestData/nlp/retail/ --eval_file_prefix=eval --batch_size=10 --num_train_samples=-1 --do_lower_case --shuffle_data --work_dir=outputs' sh 'rm -rf examples/nlp/text_classification/outputs' } } stage ('Dialogue State Tracking - TRADE - Multi-GPUs') { steps { + sh 'rm -rf /home/TestData/nlp/multiwoz2.1/vocab.pkl' sh 'cd examples/nlp/dialogue_state_tracking && CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 dialogue_state_tracking_trade.py --batch_size=10 --eval_batch_size=10 --num_train_samples=-1 --num_eval_samples=-1 --num_epochs=1 --dropout=0.2 --eval_file_prefix=test --shuffle_data --num_gpus=2 --lr=0.001 --grad_norm_clip=10 --work_dir=outputs --data_dir=/home/TestData/nlp/multiwoz2.1' sh 'rm -rf examples/nlp/dialogue_state_tracking/outputs' + sh 'rm -rf /home/TestData/nlp/multiwoz2.1/vocab.pkl' } } stage ('GLUE Benchmark Test') { steps { - sh 'cd examples/nlp/glue_benchmark && CUDA_VISIBLE_DEVICES=1 python glue_benchmark_with_bert.py --data_dir /home/TestData/nlp/glue_fake/MRPC --work_dir glue_output --save_step_freq -1 --num_epochs 1 --task_name mrpc --batch_size 2' + sh 'cd examples/nlp/glue_benchmark && CUDA_VISIBLE_DEVICES=1 python glue_benchmark_with_bert.py --data_dir /home/TestData/nlp/glue_fake/MRPC --pretrained_bert_model bert-base-uncased --work_dir glue_output --save_step_freq -1 --num_epochs 1 --task_name mrpc --batch_size 2' sh 'rm -rf examples/nlp/glue_benchmark/glue_output' } } @@ -122,8 +124,8 @@ pipeline { parallel { stage('Token Classification Training/Inference Test') { steps { - sh 'cd examples/nlp/token_classification && CUDA_VISIBLE_DEVICES=0 python token_classification.py --data_dir /home/TestData/nlp/token_classification_punctuation/ --batch_size 2 --num_epochs 1 --save_epoch_freq 1 --work_dir token_classification_output --pretrained_bert_model bert-base-cased' - sh 'cd examples/nlp/token_classification && DATE_F=$(ls token_classification_output/) && CUDA_VISIBLE_DEVICES=0 python token_classification_infer.py --work_dir token_classification_output/$DATE_F/checkpoints/ --labels_dict /home/TestData/nlp/token_classification_punctuation/label_ids.csv --pretrained_bert_model bert-base-cased' + sh 'cd examples/nlp/token_classification && CUDA_VISIBLE_DEVICES=0 python token_classification.py --data_dir /home/TestData/nlp/token_classification_punctuation/ --batch_size 2 --num_epochs 1 --save_epoch_freq 1 --work_dir token_classification_output --pretrained_bert_model bert-base-uncased' + sh 'cd examples/nlp/token_classification && 
DATE_F=$(ls token_classification_output/) && CUDA_VISIBLE_DEVICES=0 python token_classification_infer.py --work_dir token_classification_output/$DATE_F/checkpoints/ --labels_dict /home/TestData/nlp/token_classification_punctuation/label_ids.csv --pretrained_bert_model bert-base-uncased' sh 'rm -rf examples/nlp/token_classification/token_classification_output' } } diff --git a/docs/sources/source/nlp/bert_pretraining.rst b/docs/sources/source/nlp/bert_pretraining.rst index 389f6a307466..4b19ec9f512c 100644 --- a/docs/sources/source/nlp/bert_pretraining.rst +++ b/docs/sources/source/nlp/bert_pretraining.rst @@ -191,7 +191,7 @@ For training from raw text use nemo_nlp.BertPretrainingDataLayer, for preprocess mlm_logits = mlm_classifier(hidden_states=hidden_states) mlm_loss = mlm_loss_fn(logits=mlm_logits, - output_ids=input_data.output_ids, + labels=input_data.output_ids, output_mask=input_data.output_mask) nsp_logits = nsp_classifier(hidden_states=hidden_states) diff --git a/docs/sources/source/nlp/joint_intent_slot_filling.rst b/docs/sources/source/nlp/joint_intent_slot_filling.rst index 57b82629b0be..29f0fd954806 100644 --- a/docs/sources/source/nlp/joint_intent_slot_filling.rst +++ b/docs/sources/source/nlp/joint_intent_slot_filling.rst @@ -3,9 +3,9 @@ Tutorial In this tutorial, we are going to implement a joint intent and slot filling system with pretrained BERT model based on `BERT for Joint Intent Classification and Slot Filling `_ :cite:`nlp-slot-chen2019bert`. -All code used in this tutorial is based on ``examples/nlp/joint_intent_slot_with_bert.py``. +All code used in this tutorial is based on ``examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py``. -There are four pre-trained BERT models that we can select from using the argument `--pretrained_bert_model`. We're currently +There are a variety of pre-trained BERT models that we can select from using the argument `--pretrained_bert_model`. We're currently using the script for loading pre-trained models from `pytorch_transformers`. See the list of available pre-trained models `here `__. 
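The joint intent/slot tutorial referenced above now builds its loss from the refactored common loss modules instead of the removed JointIntentSlotLoss. A minimal sketch of the new wiring, mirroring the updated example script below and assuming that intent_logits, slot_logits, intents, slots, loss_mask, and the intent_loss_weight hyperparameter come from the example's existing data layer, classifier, and argument parser:

from nemo.backends.pytorch.common.losses import CrossEntropyLossNM, LossAggregatorNM

# Intent loss works on (batch, num_intents) logits; slot loss on (batch, time, num_slots) logits.
intent_loss_fn = CrossEntropyLossNM(logits_dim=2)
slot_loss_fn = CrossEntropyLossNM(logits_dim=3)

# Weighted sum of the two losses, trading intent accuracy against slot accuracy.
total_loss_fn = LossAggregatorNM(num_inputs=2, weights=[intent_loss_weight, 1.0 - intent_loss_weight])

intent_loss = intent_loss_fn(logits=intent_logits, labels=intents)
slot_loss = slot_loss_fn(logits=slot_logits, labels=slots, loss_mask=loss_mask)
total_loss = total_loss_fn(loss_1=intent_loss, loss_2=slot_loss)

The aggregated total_loss then replaces the single loss tensor in the training callbacks and in the pipeline's return value.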
diff --git a/examples/nlp/asr_postprocessor/asr_postprocessor.py b/examples/nlp/asr_postprocessor/asr_postprocessor.py index 187529ddd2e4..28744c9bcc7a 100644 --- a/examples/nlp/asr_postprocessor/asr_postprocessor.py +++ b/examples/nlp/asr_postprocessor/asr_postprocessor.py @@ -113,7 +113,7 @@ args.d_model, num_classes=vocab_size, num_layers=1, log_softmax=True ) -loss_fn = nemo_nlp.nm.losses.PaddedSmoothedCrossEntropyLossNM(pad_id=tokenizer.pad_id, label_smoothing=0.1) +loss_fn = nemo_nlp.nm.losses.SmoothedCrossEntropyLoss(pad_id=tokenizer.pad_id, label_smoothing=0.1) beam_search = nemo_nlp.nm.trainables.BeamSearchTranslatorNM( decoder=decoder, @@ -174,7 +174,7 @@ def create_pipeline(dataset, tokens_in_batch, clean=False, training=True): input_ids_tgt=tgt, hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask ) log_softmax = t_log_softmax(hidden_states=tgt_hiddens) - loss = loss_fn(logits=log_softmax, target_ids=labels) + loss = loss_fn(logits=log_softmax, labels=labels) beam_results = None if not training: beam_results = beam_search(hidden_states_src=src_hiddens, input_mask_src=src_mask) diff --git a/examples/nlp/dialogue_state_tracking/dialogue_state_tracking_trade.py b/examples/nlp/dialogue_state_tracking/dialogue_state_tracking_trade.py index 9fc7a6ca7f29..b3e9b75195a8 100644 --- a/examples/nlp/dialogue_state_tracking/dialogue_state_tracking_trade.py +++ b/examples/nlp/dialogue_state_tracking/dialogue_state_tracking_trade.py @@ -25,12 +25,14 @@ import numpy as np +import nemo.backends.pytorch as nemo_backend +import nemo.backends.pytorch.common.losses import nemo.collections.nlp as nemo_nlp import nemo.core as nemo_core from nemo import logging from nemo.backends.pytorch.common import EncoderRNN from nemo.collections.nlp.callbacks.state_tracking_trade_callback import eval_epochs_done_callback, eval_iter_callback -from nemo.collections.nlp.data.datasets.state_tracking_trade_dataset import MultiWOZDataDesc +from nemo.collections.nlp.data.datasets.multiwoz_dataset import MultiWOZDataDesc from nemo.utils.lr_policies import get_lr_policy parser = argparse.ArgumentParser(description='Dialog state tracking with TRADE model on MultiWOZ dataset') @@ -97,9 +99,9 @@ teacher_forcing=args.teacher_forcing, ) -gate_loss_fn = nemo_nlp.nm.losses.CrossEntropyLoss3D(num_classes=len(data_desc.gating_dict)) -ptr_loss_fn = nemo_nlp.nm.losses.TRADEMaskedCrossEntropy() -total_loss_fn = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2) +gate_loss_fn = nemo_backend.losses.CrossEntropyLossNM(logits_dim=3) +ptr_loss_fn = nemo_nlp.nm.losses.MaskedXEntropyLoss() +total_loss_fn = nemo.backends.pytorch.common.losses.LossAggregatorNM(num_inputs=2) def create_pipeline(num_samples, batch_size, num_gpus, input_dropout, data_prefix, is_training): @@ -142,7 +144,7 @@ def create_pipeline(num_samples, batch_size, num_gpus, input_dropout, data_prefi ) gate_loss = gate_loss_fn(logits=gate_outputs, labels=gate_labels) - ptr_loss = ptr_loss_fn(logits=point_outputs, targets=tgt_ids, loss_mask=tgt_lens) + ptr_loss = ptr_loss_fn(logits=point_outputs, labels=tgt_ids, length_mask=tgt_lens) total_loss = total_loss_fn(loss_1=gate_loss, loss_2=ptr_loss) if is_training: diff --git a/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py b/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py index 7b90c132a506..15b119f060cb 100644 --- a/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py +++ b/examples/nlp/glue_benchmark/glue_benchmark_with_bert.py @@ -68,7 +68,7 @@ import nemo.collections.nlp as 
nemo_nlp import nemo.core as nemo_core from nemo import logging -from nemo.backends.pytorch.common import CrossEntropyLoss, MSELoss +from nemo.backends.pytorch.common import CrossEntropyLossNM, MSELoss from nemo.collections.nlp.callbacks.glue_benchmark_callback import eval_epochs_done_callback, eval_iter_callback from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import output_modes, processors @@ -231,7 +231,7 @@ else: model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) model.restore_from(args.bert_checkpoint) - logging.info(f"model resotred from {args.bert_checkpoint}") + logging.info(f"model restored from {args.bert_checkpoint}") hidden_size = model.hidden_size @@ -241,7 +241,7 @@ glue_loss = MSELoss() else: pooler = SequenceClassifier(hidden_size=hidden_size, num_classes=num_labels, log_softmax=False) - glue_loss = CrossEntropyLoss() + glue_loss = CrossEntropyLossNM() def create_pipeline( @@ -260,8 +260,6 @@ def create_pipeline( processor=processor, evaluate=evaluate, batch_size=batch_size, - # num_workers=0, - # local_rank=local_rank, tokenizer=tokenizer, data_dir=args.data_dir, max_seq_length=max_seq_length, diff --git a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py index 196a0e492055..81fdfad719a3 100644 --- a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py +++ b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer.py @@ -23,7 +23,7 @@ import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm from nemo import logging -from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc # Parsing arguments parser = argparse.ArgumentParser(description='Joint-intent BERT') diff --git a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py index 84ab723c94a8..3c37c04a6685 100644 --- a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py +++ b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_infer_b1.py @@ -21,8 +21,8 @@ import nemo.collections.nlp as nemo_nlp import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm -from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc -from nemo.collections.nlp.utils.common_nlp_utils import read_intent_slot_outputs +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.inference_utils import read_intent_slot_outputs # Parsing arguments parser = argparse.ArgumentParser(description='Joint-intent BERT') diff --git a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py index 0cbdb08f72cc..579d9ccc340d 100644 --- a/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py +++ b/examples/nlp/intent_detection_slot_tagging/joint_intent_slot_with_bert.py @@ -21,16 +21,18 @@ import numpy as np from transformers import BertTokenizer +import nemo import nemo.collections.nlp as nemo_nlp -import 
nemo.collections.nlp.nm.data_layers.joint_intent_slot_datalayer -import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm from nemo import logging +from nemo.backends.pytorch.common.losses import CrossEntropyLossNM, LossAggregatorNM from nemo.collections.nlp.callbacks.joint_intent_slot_callback import eval_epochs_done_callback, eval_iter_callback -from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.data_descriptor import JointIntentSlotDataDesc +from nemo.collections.nlp.nm.data_layers import BertJointIntentSlotDataLayer +from nemo.core import CheckpointCallback, SimpleLossLoggerCallback from nemo.utils.lr_policies import get_lr_policy # Parsing arguments -parser = argparse.ArgumentParser(description='Joint intent slot filling system with pretrained BERT') +parser = argparse.ArgumentParser(description='Joint intent detection and slot filling with pre-trained BERT') parser.add_argument("--local_rank", default=None, type=int) parser.add_argument("--batch_size", default=128, type=int) parser.add_argument("--max_seq_length", default=50, type=int) @@ -87,12 +89,10 @@ nemo_nlp.huggingface.BERT.list_pretrained_models() """ if args.bert_checkpoint and args.bert_config: - pretrained_bert_model = nemo.collections.nlp.nm.trainables.huggingface.BERT(config_filename=args.bert_config) + pretrained_bert_model = nemo_nlp.nm.trainables.huggingface.BERT(config_filename=args.bert_config) pretrained_bert_model.restore_from(args.bert_checkpoint) else: - pretrained_bert_model = nemo.collections.nlp.nm.trainables.huggingface.BERT( - pretrained_model_name=args.pretrained_bert_model - ) + pretrained_bert_model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) hidden_size = pretrained_bert_model.hidden_size @@ -101,31 +101,29 @@ ) # Create sentence classification loss on top -classifier = nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm.JointIntentSlotClassifier( +classifier = nemo_nlp.nm.trainables.JointIntentSlotClassifier( hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots, dropout=args.fc_dropout ) if args.class_balancing == 'weighted_loss': - # Using weighted loss will enable weighted loss for both intents and slots - # Use the intent_loss_weight hyperparameter to adjust intent loss to - # prevent overfitting or underfitting. 
- loss_fn = nemo_nlp.nm.losses.JointIntentSlotLoss( - num_slots=data_desc.num_slots, - slot_classes_loss_weights=data_desc.slot_weights, - intent_classes_loss_weights=data_desc.intent_weights, - intent_loss_weight=args.intent_loss_weight, - ) + # To tackle imbalanced classes, you may use weighted loss + intent_loss_fn = CrossEntropyLossNM(logits_dim=2, weight=data_desc.intent_weights) + slot_loss_fn = CrossEntropyLossNM(logits_dim=3, weight=data_desc.intent_weights) + else: - loss_fn = nemo_nlp.nm.losses.JointIntentSlotLoss(num_slots=data_desc.num_slots) + intent_loss_fn = CrossEntropyLossNM(logits_dim=2) + slot_loss_fn = CrossEntropyLossNM(logits_dim=3) + +total_loss_fn = LossAggregatorNM(num_inputs=2, weights=[args.intent_loss_weight, 1.0 - args.intent_loss_weight]) -def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mode='train'): +def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, mode='train'): logging.info(f"Loading {mode} data...") data_file = f'{data_desc.data_dir}/{mode}.tsv' slot_file = f'{data_desc.data_dir}/{mode}_slots.tsv' shuffle = args.shuffle_data if mode == 'train' else False - data_layer = nemo.collections.nlp.nm.data_layers.joint_intent_slot_datalayer.BertJointIntentSlotDataLayer( + data_layer = BertJointIntentSlotDataLayer( input_file=data_file, slot_file=slot_file, pad_label=data_desc.pad_label, @@ -155,35 +153,27 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mod intent_logits, slot_logits = classifier(hidden_states=hidden_states) - loss = loss_fn( - intent_logits=intent_logits, slot_logits=slot_logits, loss_mask=loss_mask, intents=intents, slots=slots - ) + intent_loss = intent_loss_fn(logits=intent_logits, labels=intents) + slot_loss = slot_loss_fn(logits=slot_logits, labels=slots, loss_mask=loss_mask) + total_loss = total_loss_fn(loss_1=intent_loss, loss_2=slot_loss) if mode == 'train': - tensors_to_evaluate = [loss, intent_logits, slot_logits] + tensors_to_evaluate = [total_loss, intent_logits, slot_logits] else: tensors_to_evaluate = [intent_logits, slot_logits, intents, slots, subtokens_mask] - return tensors_to_evaluate, loss, steps_per_epoch, data_layer + return tensors_to_evaluate, total_loss, steps_per_epoch, data_layer train_tensors, train_loss, steps_per_epoch, _ = create_pipeline( - args.num_train_samples, - batch_size=args.batch_size, - num_gpus=args.num_gpus, - local_rank=args.local_rank, - mode=args.train_file_prefix, + args.num_train_samples, batch_size=args.batch_size, num_gpus=args.num_gpus, mode=args.train_file_prefix, ) eval_tensors, _, _, data_layer = create_pipeline( - args.num_eval_samples, - batch_size=args.batch_size, - num_gpus=args.num_gpus, - local_rank=args.local_rank, - mode=args.eval_file_prefix, + args.num_eval_samples, batch_size=args.batch_size, num_gpus=args.num_gpus, mode=args.eval_file_prefix, ) # Create callbacks for train and eval modes -train_callback = nemo.core.SimpleLossLoggerCallback( +train_callback = SimpleLossLoggerCallback( tensors=train_tensors, print_func=lambda x: str(np.round(x[0].item(), 3)), tb_writer=nf.tb_writer, @@ -200,7 +190,7 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mod ) # Create callback to save checkpoints -ckpt_callback = nemo.core.CheckpointCallback( +ckpt_callback = CheckpointCallback( folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) diff --git a/examples/nlp/language_modeling/BERTPretrainingTutorial.ipynb 
b/examples/nlp/language_modeling/BERTPretrainingTutorial.ipynb index b46a87a77079..157fd50bb208 100644 --- a/examples/nlp/language_modeling/BERTPretrainingTutorial.ipynb +++ b/examples/nlp/language_modeling/BERTPretrainingTutorial.ipynb @@ -7,6 +7,15 @@ "### Step 1 Download and prepare data" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR = 'PATH_TO_THE_DATA_DIR'" + ] + }, { "cell_type": "code", "execution_count": null, @@ -15,7 +24,7 @@ "source": [ "# This example is for demonstration purposes\n", "# Please refer to the corresponding NLP tutorial on NeMo documentation\n", - "! ../scripts/get_wkt2.sh" + "! ../scripts/get_wkt2.sh $DATA_DIR" ] }, { @@ -25,7 +34,7 @@ "outputs": [], "source": [ "# verify data is there \n", - "! ls -l data/lm/wikitext-2" + "! ls -l $DATA_DIR/wikitext-2" ] }, { @@ -35,7 +44,7 @@ "outputs": [], "source": [ "# Prepare tokenization model\n", - "! python ../scripts/create_vocab.py --train_path=data/lm/wikitext-2/train.txt" + "! python ../scripts/create_vocab.py --train_path=$DATA_DIR/wikitext-2/train.txt" ] }, { @@ -155,7 +164,7 @@ " num_classes=tokenizer.vocab_size,\n", " activation=HIDDEN_ACT,\n", " log_softmax=True)\n", - "mlm_loss = nemo_nlp.nm.losses.MaskedLanguageModelingLossNM()\n", + "mlm_loss = nemo_nlp.nm.losses.SmoothedCrossEntropyLoss()\n", "\n", "# Next Sentence Prediciton Loss\n", "nsp_classifier = nemo_nlp.nm.trainables.SequenceClassifier(D_MODEL,\n", @@ -163,7 +172,7 @@ " num_layers=2,\n", " activation='tanh',\n", " log_softmax=False)\n", - "nsp_loss = nemo.backends.pytorch.common.CrossEntropyLoss()\n", + "nsp_loss = nemo.backends.pytorch.common.CrossEntropyLossNM()\n", "\n", "bert_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2)" ] @@ -174,10 +183,9 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", "train_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n", " tokenizer=tokenizer,\n", - " dataset=os.path.join(\"data/lm/wikitext-2\", \"train.txt\"),\n", + " dataset=os.path.join(DATA_DIR, \"wikitext-2\", \"train.txt\"),\n", " max_seq_length=MAX_SEQ_LENGTH,\n", " mask_probability=MASK_PROBABILITY,\n", " batch_size=BATCH_SIZE\n", @@ -185,7 +193,7 @@ "\n", "eval_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n", " tokenizer=tokenizer,\n", - " dataset=os.path.join(\"data/lm/wikitext-2\", \"valid.txt\"),\n", + " dataset=os.path.join(DATA_DIR, \"wikitext-2\", \"valid.txt\"),\n", " max_seq_length=MAX_SEQ_LENGTH,\n", " mask_probability=MASK_PROBABILITY,\n", " batch_size=BATCH_SIZE_EVAL\n", @@ -213,7 +221,7 @@ " attention_mask=input_data.input_mask)\n", "\n", "mlm_logits = mlm_classifier(hidden_states=hidden_states)\n", - "t_mlm_loss = mlm_loss(logits=mlm_logits, output_ids=input_data.output_ids, output_mask=input_data.output_mask)\n", + "t_mlm_loss = mlm_loss(logits=mlm_logits, labels=input_data.output_ids, output_mask=input_data.output_mask)\n", "\n", "nsp_logits = nsp_classifier(hidden_states=hidden_states)\n", "t_nsp_loss = nsp_loss(logits=nsp_logits, labels=input_data.labels)\n", @@ -235,7 +243,7 @@ " attention_mask=input_data_eval.input_mask)\n", "\n", "e_mlm_logits = mlm_classifier(hidden_states=e_hidden_states)\n", - "e_mlm_loss = mlm_loss(logits=e_mlm_logits, output_ids=input_data_eval.output_ids, output_mask=input_data_eval.output_mask)\n", + "e_mlm_loss = mlm_loss(logits=e_mlm_logits, labels=input_data_eval.output_ids, output_mask=input_data_eval.output_mask)\n", "\n", "e_nsp_logits = 
nsp_classifier(hidden_states=e_hidden_states)\n", "e_nsp_loss = nsp_loss(logits=e_nsp_logits, labels=input_data_eval.labels)\n", @@ -270,17 +278,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[NeMo I 2020-02-12 12:08:08 callbacks:196] Step: 300\n", - "Loss: 6.991\n", - "[NeMo I 2020-02-12 12:08:08 callbacks:211] Step time: 0.13242316246032715 seconds\n" - ] - } - ], + "outputs": [], "source": [ "lr_policy = CosineAnnealing(NUM_EPOCHS * steps_per_epoch,\n", " warmup_ratio=LR_WARMUP_PROPORTION)\n", @@ -299,6 +297,13 @@ " })" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -323,7 +328,16 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.4" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [] + } } }, "nbformat": 4, diff --git a/examples/nlp/language_modeling/bert_pretraining.py b/examples/nlp/language_modeling/bert_pretraining.py index 27bf08d1b3d1..9dd9b341eba8 100644 --- a/examples/nlp/language_modeling/bert_pretraining.py +++ b/examples/nlp/language_modeling/bert_pretraining.py @@ -86,6 +86,7 @@ from transformers import BertConfig import nemo.backends.pytorch.common as nemo_common +import nemo.backends.pytorch.common.losses import nemo.collections.nlp as nemo_nlp import nemo.core as nemo_core from nemo import logging @@ -211,14 +212,14 @@ mlm_classifier = nemo_nlp.nm.trainables.token_classification_nm.BertTokenClassifier( args.hidden_size, num_classes=args.vocab_size, activation=args.hidden_act, log_softmax=True ) -mlm_loss_fn = nemo_nlp.nm.losses.MaskedLanguageModelingLossNM() +mlm_loss_fn = nemo_nlp.nm.losses.SmoothedCrossEntropyLoss() if not args.only_mlm_loss: nsp_classifier = nemo_nlp.nm.trainables.sequence_classification_nm.SequenceClassifier( args.hidden_size, num_classes=2, num_layers=2, activation='tanh', log_softmax=False ) - nsp_loss_fn = nemo_common.CrossEntropyLoss() + nsp_loss_fn = nemo_common.CrossEntropyLossNM() - bert_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2) + bert_loss = nemo.backends.pytorch.common.losses.LossAggregatorNM(num_inputs=2) # tie weights of MLM softmax layer and embedding layer of the encoder if mlm_classifier.mlp.last_linear_layer.weight.shape != bert_model.bert.embeddings.word_embeddings.weight.shape: @@ -256,7 +257,7 @@ def create_pipeline(data_file, batch_size, preprocessed_data=False, batches_per_ input_ids=input_data.input_ids, token_type_ids=input_data.input_type_ids, attention_mask=input_data.input_mask ) mlm_logits = mlm_classifier(hidden_states=hidden_states) - mlm_loss = mlm_loss_fn(logits=mlm_logits, output_ids=input_data.output_ids, output_mask=input_data.output_mask) + mlm_loss = mlm_loss_fn(logits=mlm_logits, labels=input_data.output_ids, output_mask=input_data.output_mask) if not args.only_mlm_loss: nsp_logits = nsp_classifier(hidden_states=hidden_states) nsp_loss = nsp_loss_fn(logits=nsp_logits, labels=input_data.labels) diff --git a/examples/nlp/language_modeling/language_modeling_transformer.py b/examples/nlp/language_modeling/language_modeling_transformer.py index 2572b90af785..86299277c30e 100644 --- a/examples/nlp/language_modeling/language_modeling_transformer.py +++ b/examples/nlp/language_modeling/language_modeling_transformer.py @@ -109,9 +109,7 @@ args.d_model, 
num_classes=vocab_size, num_layers=1, log_softmax=True ) -loss = nemo_nlp.nm.losses.PaddedSmoothedCrossEntropyLossNM( - pad_id=tokenizer.pad_id, label_smoothing=args.label_smoothing -) +loss = nemo_nlp.nm.losses.SmoothedCrossEntropyLoss(pad_id=tokenizer.pad_id, label_smoothing=args.label_smoothing) # tie weight of embedding and log_softmax layers # log_softmax.mlp.last_linear_layer.weight = encoder.embedding_layer.token_embedding.weight @@ -133,7 +131,7 @@ def create_pipeline( src, src_mask, labels = data_layer() src_hiddens = encoder(input_ids=src, input_mask_src=src_mask) logits = log_softmax(hidden_states=src_hiddens) - return loss(logits=logits, target_ids=labels) + return loss(logits=logits, labels=labels) train_loss = create_pipeline( diff --git a/examples/nlp/neural_machine_translation/machine_translation_tutorial.py b/examples/nlp/neural_machine_translation/machine_translation_tutorial.py index ae05afa88e32..c2ecd19df986 100644 --- a/examples/nlp/neural_machine_translation/machine_translation_tutorial.py +++ b/examples/nlp/neural_machine_translation/machine_translation_tutorial.py @@ -161,7 +161,7 @@ eos_token=tgt_tokenizer.eos_id, ) -loss_fn = nemo_nlp.nm.losses.PaddedSmoothedCrossEntropyLossNM( +loss_fn = nemo_nlp.nm.losses.SmoothedCrossEntropyLoss( pad_id=tgt_tokenizer.pad_id, label_smoothing=args.label_smoothing ) @@ -202,7 +202,7 @@ def create_pipeline(dataset_src, dataset_tgt, tokens_in_batch, clean=False, trai input_ids_tgt=tgt, hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask ) logits = log_softmax(hidden_states=tgt_hiddens) - loss = loss_fn(logits=logits, target_ids=labels) + loss = loss_fn(logits=logits, labels=labels) beam_results = None if not training: beam_results = beam_search(hidden_states_src=src_hiddens, input_mask_src=src_mask) diff --git a/examples/nlp/question_answering/question_answering_squad.py b/examples/nlp/question_answering/question_answering_squad.py index 5997614888e7..1bd718883807 100755 --- a/examples/nlp/question_answering/question_answering_squad.py +++ b/examples/nlp/question_answering/question_answering_squad.py @@ -368,7 +368,7 @@ def create_pipeline( qa_head = nemo_nlp.nm.trainables.TokenClassifier( hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False ) - squad_loss = nemo_nlp.nm.losses.QuestionAnsweringLoss() + squad_loss = nemo_nlp.nm.losses.SpanningLoss() if args.bert_checkpoint is not None: model.restore_from(args.bert_checkpoint) diff --git a/examples/nlp/scripts/get_wkt2.sh b/examples/nlp/scripts/get_wkt2.sh index 206160bf8cd8..75efd08722e4 100755 --- a/examples/nlp/scripts/get_wkt2.sh +++ b/examples/nlp/scripts/get_wkt2.sh @@ -4,12 +4,12 @@ This file is adapted from https://github.com/salesforce/awd-lstm-lm/blob/master/getdata.sh Copyright by the AWD LSTM authors. """ - +DATA_DIR=$1 echo "- Downloading WikiText-2" -wget --continue -P data/lm/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip -unzip -q data/lm/wikitext-2-v1.zip -d data/lm -cd data/lm/wikitext-2 +wget --continue -P $DATA_DIR https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip +unzip -q $DATA_DIR/wikitext-2-v1.zip -d $DATA_DIR +cd $DATA_DIR/wikitext-2 mv wiki.train.tokens train.txt sed -i -e "s//[UNK]/g" train.txt mv wiki.valid.tokens valid.txt @@ -18,3 +18,5 @@ mv wiki.test.tokens test.txt sed -i -e "s//[UNK]/g" test.txt cd .. 
rm wikitext-2-v1.zip + +echo "- WikiText-2 saved at $DATA_DIR/wikitext-2" diff --git a/examples/nlp/text_classification/text_classification_with_bert.py b/examples/nlp/text_classification/text_classification_with_bert.py index 62048e5b4945..202e8c9f3039 100644 --- a/examples/nlp/text_classification/text_classification_with_bert.py +++ b/examples/nlp/text_classification/text_classification_with_bert.py @@ -24,7 +24,7 @@ import nemo.collections.nlp.nm.trainables.common.sequence_classification_nm from nemo import logging from nemo.collections.nlp.callbacks.text_classification_callback import eval_epochs_done_callback, eval_iter_callback -from nemo.collections.nlp.data.datasets.text_classification_dataset import SentenceClassificationDataDesc +from nemo.collections.nlp.data.datasets.text_classification_dataset import TextClassificationDataDesc from nemo.utils.lr_policies import get_lr_policy # Parsing arguments @@ -93,7 +93,7 @@ hidden_size = pretrained_bert_model.hidden_size tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model) -data_desc = SentenceClassificationDataDesc(args.dataset_name, args.data_dir, args.do_lower_case) +data_desc = TextClassificationDataDesc(args.dataset_name, args.data_dir, args.do_lower_case) # Create sentence classification loss on top classifier = nemo.collections.nlp.nm.trainables.common.sequence_classification_nm.SequenceClassifier( @@ -102,9 +102,9 @@ if args.class_balancing == 'weighted_loss': # You may need to increase the number of epochs for convergence. - loss_fn = nemo.backends.pytorch.common.CrossEntropyLoss(weight=data_desc.class_weights) + loss_fn = nemo.backends.pytorch.common.CrossEntropyLossNM(weight=data_desc.class_weights) else: - loss_fn = nemo.backends.pytorch.common.CrossEntropyLoss() + loss_fn = nemo.backends.pytorch.common.CrossEntropyLossNM() def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mode='train'): diff --git a/examples/nlp/token_classification/NERWithBERT.ipynb b/examples/nlp/token_classification/NERWithBERT.ipynb index c3a38da0e49a..85389348cb2e 100644 --- a/examples/nlp/token_classification/NERWithBERT.ipynb +++ b/examples/nlp/token_classification/NERWithBERT.ipynb @@ -16,7 +16,7 @@ "from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer\n", "from nemo.collections.nlp.callbacks.token_classification_callback import \\\n", " eval_iter_callback, eval_epochs_done_callback\n", - "from nemo.collections.nlp.nm.losses import TokenClassificationLoss\n", + "from nemo.backends.pytorch.common.losses import CrossEntropyLoss\n", "from nemo.collections.nlp.nm.trainables import TokenClassifier" ] }, @@ -106,7 +106,7 @@ " num_classes=num_classes,\n", " dropout=CLASSIFICATION_DROPOUT)\n", "\n", - "ner_loss = TokenClassificationLoss(num_classes=len(label_ids))\n", + "ner_loss = CrossEntropyLossNM()\n", "\n", "input_ids, input_type_ids, input_mask, loss_mask, _, labels = train_data_layer()\n", "\n", @@ -219,8 +219,17 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } } }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/examples/nlp/token_classification/PunctuationWithBERT.ipynb b/examples/nlp/token_classification/PunctuationWithBERT.ipynb index e4905b1d6277..2ca6b93a0b74 100644 --- a/examples/nlp/token_classification/PunctuationWithBERT.ipynb +++ 
b/examples/nlp/token_classification/PunctuationWithBERT.ipynb @@ -17,9 +17,10 @@ "import nemo.collections.nlp as nemo_nlp\n", "from nemo.collections.nlp.data import NemoBertTokenizer\n", "from nemo.collections.nlp.nm.trainables import TokenClassifier\n", - "from nemo.collections.nlp.nm.losses import TokenClassificationLoss, LossAggregatorNM\n", + "from nemo.collections.nlp.nm.losses import LossAggregatorNM\n", + "from nemo.backends.pytorch.common.losses import CrossEntropyLoss\n", "from nemo.collections.nlp.callbacks.punctuation_capitalization_callback import eval_iter_callback, eval_epochs_done_callback\n", - "from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights\n", + "from nemo.collections.nlp.data import calc_class_weights\n", "\n", "DATA_DIR = \"PATH_TO_WHERE_THE_DATA_IS\"\n", "WORK_DIR = \"PATH_TO_WHERE_TO_STORE_CHECKPOINTS_AND_LOGS\"\n", @@ -167,10 +168,8 @@ "class_weights = calc_class_weights(punct_label_freqs)\n", "\n", "# define loss\n", - "punct_loss = TokenClassificationLoss(\n", - " num_classes=len(punct_label_ids),\n", - " class_weights=class_weights)\n", - "capit_loss = TokenClassificationLoss(num_classes=len(capit_label_ids))\n", + "punct_loss = CrossEntropyLossNM(logits_dim=3, weight=class_weights)\n", + "capit_loss = CrossEntropyLossNM(logits_dim=3)\n", "task_loss = LossAggregatorNM(num_inputs=2)" ] }, @@ -480,8 +479,17 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } } }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/examples/nlp/token_classification/punctuation_capitalization.py b/examples/nlp/token_classification/punctuation_capitalization.py index b74eeff89663..d3295bdeb4e8 100644 --- a/examples/nlp/token_classification/punctuation_capitalization.py +++ b/examples/nlp/token_classification/punctuation_capitalization.py @@ -19,15 +19,16 @@ import os import nemo.collections.nlp as nemo_nlp -import nemo.collections.nlp.utils.common_nlp_utils +import nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing +import nemo.collections.nlp.utils.data_utils from nemo import logging +from nemo.backends.pytorch.common.losses import CrossEntropyLossNM from nemo.collections.nlp.callbacks.punctuation_capitalization_callback import ( eval_epochs_done_callback, eval_iter_callback, ) from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer from nemo.collections.nlp.nm.data_layers import PunctuationCapitalizationDataLayer -from nemo.collections.nlp.nm.losses.token_classification_loss import TokenClassificationLoss from nemo.collections.nlp.nm.trainables import TokenClassifier from nemo.utils.lr_policies import get_lr_policy @@ -214,7 +215,9 @@ def create_pipeline( if args.use_weighted_loss_punct: logging.info(f"Using weighted loss for punctuation task") punct_label_freqs = data_layer.dataset.punct_label_frequencies - class_weights = nemo.collections.nlp.utils.common_nlp_utils.calc_class_weights(punct_label_freqs) + class_weights = nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing.calc_class_weights( + punct_label_freqs + ) # Initialize punctuation loss punct_classifier = punct_classifier( @@ -225,15 +228,15 @@ def create_pipeline( name='Punctuation', ) - punct_loss = TokenClassificationLoss(num_classes=len(punct_label_ids), class_weights=class_weights) + punct_loss = CrossEntropyLossNM(logits_dim=3, weight=class_weights) # 
Initialize capitalization loss capit_classifier = capit_classifier( hidden_size=hidden_size, num_classes=len(capit_label_ids), dropout=dropout, name='Capitalization' ) - capit_loss = TokenClassificationLoss(num_classes=len(capit_label_ids)) + capit_loss = CrossEntropyLossNM(logits_dim=3) - task_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2) + task_loss = nemo.backends.pytorch.common.losses.LossAggregatorNM(num_inputs=2) hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) diff --git a/examples/nlp/token_classification/punctuation_capitalization_infer.py b/examples/nlp/token_classification/punctuation_capitalization_infer.py index 9c2f8bede33c..b4b7cc2e1aab 100644 --- a/examples/nlp/token_classification/punctuation_capitalization_infer.py +++ b/examples/nlp/token_classification/punctuation_capitalization_infer.py @@ -24,7 +24,7 @@ from nemo import logging from nemo.collections.nlp.data import NemoBertTokenizer from nemo.collections.nlp.nm.data_layers import BertTokenClassificationInferDataLayer -from nemo.collections.nlp.utils.common_nlp_utils import get_vocab +from nemo.collections.nlp.utils.data_utils import get_vocab # Parsing arguments parser = argparse.ArgumentParser(description='Punctuation and capitalization detection inference') diff --git a/examples/nlp/token_classification/token_classification.py b/examples/nlp/token_classification/token_classification.py index 7254929863f1..b8d16bd87d59 100644 --- a/examples/nlp/token_classification/token_classification.py +++ b/examples/nlp/token_classification/token_classification.py @@ -19,12 +19,13 @@ import os import nemo.collections.nlp as nemo_nlp -import nemo.collections.nlp.utils.common_nlp_utils +import nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing +import nemo.collections.nlp.utils.data_utils from nemo import logging +from nemo.backends.pytorch.common.losses import CrossEntropyLossNM from nemo.collections.nlp.callbacks.token_classification_callback import eval_epochs_done_callback, eval_iter_callback from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer from nemo.collections.nlp.nm.data_layers import BertTokenClassificationDataLayer -from nemo.collections.nlp.nm.losses import TokenClassificationLoss from nemo.collections.nlp.nm.trainables import TokenClassifier from nemo.utils.lr_policies import get_lr_policy @@ -197,7 +198,9 @@ def create_pipeline( if args.use_weighted_loss: logging.info(f"Using weighted loss") label_freqs = data_layer.dataset.label_frequencies - class_weights = nemo.collections.nlp.utils.common_nlp_utils.calc_class_weights(label_freqs) + class_weights = nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing.calc_class_weights( + label_freqs + ) logging.info(f"class_weights: {class_weights}") @@ -205,7 +208,7 @@ def create_pipeline( hidden_size=hidden_size, num_classes=len(label_ids), dropout=dropout, num_layers=num_layers ) - task_loss = TokenClassificationLoss(num_classes=len(label_ids), class_weights=class_weights) + task_loss = CrossEntropyLossNM(logits_dim=3, weight=class_weights) hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) logits = classifier(hidden_states=hidden_states) diff --git a/examples/nlp/token_classification/token_classification_infer.py b/examples/nlp/token_classification/token_classification_infer.py index f1d7d1bf5cdf..cc63fcb7f3c6 100644 --- a/examples/nlp/token_classification/token_classification_infer.py +++ 
b/examples/nlp/token_classification/token_classification_infer.py @@ -24,7 +24,7 @@ from nemo import logging from nemo.collections.nlp.data import NemoBertTokenizer from nemo.collections.nlp.nm.trainables import TokenClassifier -from nemo.collections.nlp.utils.common_nlp_utils import get_vocab +from nemo.collections.nlp.utils.data_utils import get_vocab # Parsing arguments parser = argparse.ArgumentParser(description='NER with pretrained BERT') diff --git a/nemo/backends/pytorch/common/losses.py b/nemo/backends/pytorch/common/losses.py index dd0d70082f9f..02b0cd78cf67 100644 --- a/nemo/backends/pytorch/common/losses.py +++ b/nemo/backends/pytorch/common/losses.py @@ -2,12 +2,10 @@ from torch import nn from nemo.backends.pytorch.nm import LossNM -from nemo.core.neural_types import LabelsType, LogitsType, LossType, NeuralType, RegressionValuesType +from nemo.core.neural_types import LabelsType, LogitsType, LossType, MaskType, NeuralType, RegressionValuesType from nemo.utils.decorators import add_port_docs -__all__ = ['SequenceLoss', 'CrossEntropyLoss', 'MSELoss'] - -EPS = 1e-5 +__all__ = ['SequenceLoss', 'CrossEntropyLossNM', 'MSELoss', 'LossAggregatorNM'] class SequenceLoss(LossNM): @@ -29,6 +27,8 @@ class SequenceLoss(LossNM): ctc_blank_id (int): ID of blank symbols to pass to mask when calculating ctc loss. Defaults to None. + eps (float): small number to prevent division by zero in loss calculation + Defaults to 1e-5. """ @@ -47,7 +47,14 @@ def output_ports(self): return {"loss": NeuralType(elements_type=LossType())} def __init__( - self, pad_id=0, smoothing_coef=0.0, sample_wise=False, aux_ctc=False, ctc_initial_coef=0.1, ctc_blank_id=None + self, + pad_id=0, + smoothing_coef=0.0, + sample_wise=False, + aux_ctc=False, + ctc_initial_coef=0.1, + ctc_blank_id=None, + eps=1e-5, ): assert (not aux_ctc) or (ctc_blank_id is not None), "Should be a blank id if using CTC loss" @@ -58,6 +65,7 @@ def __init__( self.sample_wise = sample_wise self.aux_ctc = aux_ctc self.ctc_coef = ctc_initial_coef + self.eps = eps if aux_ctc: self.ctc = nn.CTCLoss(blank=ctc_blank_id, reduction='none', zero_infinity=True) @@ -85,7 +93,7 @@ def _ce_loss(self, log_probs, targets, pad_mask): if self.sample_wise: loss /= target_log_probs.size(0) else: - loss /= pad_mask.sum() + EPS + loss /= pad_mask.sum() + self.eps return loss def _ctc_loss(self, log_probs, targets, pad_mask): @@ -95,10 +103,14 @@ def _ctc_loss(self, log_probs, targets, pad_mask): return loss -class CrossEntropyLoss(LossNM): +class CrossEntropyLossNM(LossNM): """ CrossEntropyLoss - + Args: + logits_dim (int): dimension size of the logits tensor + weight (list): list of rescaling weight given to each class + reduce (bool): controls if reduction would be done over the batch + reduction (str): type of the reduction over the batch """ @property @@ -107,8 +119,9 @@ def input_ports(self): """Returns definitions of module input ports. 
""" return { - "logits": NeuralType(axes=('B', 'D'), elements_type=LogitsType()), - "labels": NeuralType(axes=tuple('B'), elements_type=LabelsType()), + "logits": NeuralType(['B'] + ['ANY'] * (self._logits_dim - 1), LogitsType()), + "labels": NeuralType(['B'] + ['ANY'] * (self._logits_dim - 2), LabelsType()), + "loss_mask": NeuralType(['B'] + ['ANY'] * (self._logits_dim - 2), MaskType(), optional=True), } @property @@ -121,14 +134,30 @@ def output_ports(self): """ return {"loss": NeuralType(elements_type=LossType())} - def __init__(self, reduction='mean', weight=None): + def __init__(self, logits_dim=2, weight=None, reduce=True, reduction='mean'): super().__init__() + if weight: weight = torch.FloatTensor(weight).to(self._device) - self._criterion = nn.CrossEntropyLoss(weight=weight, reduction=reduction) + self._criterion = nn.CrossEntropyLoss(weight=weight, reduce=reduce, reduction=reduction) + self._logits_dim = logits_dim + + def _loss_function(self, logits, labels, loss_mask=None): + """ + Args: + logits (float): output of the classifier + labels (long): ground truth labels + loss_mask (bool/float/int): tensor to specify the masking + """ + logits_flatten = torch.flatten(logits, start_dim=0, end_dim=-2) + labels_flatten = torch.flatten(labels, start_dim=0, end_dim=-1) + + if loss_mask is not None: + loss_mask_flatten = torch.flatten(loss_mask, start_dim=0, end_dim=-1) + logits_flatten = logits_flatten[loss_mask_flatten] + labels_flatten = labels_flatten[loss_mask_flatten] - def _loss_function(self, logits, labels): - loss = self._criterion(logits, labels) + loss = self._criterion(logits_flatten, labels_flatten) return loss @@ -166,3 +195,52 @@ def __init__(self, reduction='mean'): def _loss_function(self, preds, labels): loss = self._criterion(preds, labels) return loss + + +class LossAggregatorNM(LossNM): + """ + Neural module which combines sums several losses into one. + + Args: + num_inputs (int): number of input losses + weights (list of floats): a list of coefficient for merging losses + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + """ + input_ports = {} + for i in range(self._num_losses): + input_ports["loss_" + str(i + 1)] = NeuralType(elements_type=LossType()) + + return input_ports + + @property + def output_ports(self): + """Returns definitions of module output ports. + + loss: + NeuralType(None) + """ + return {"loss": NeuralType(elements_type=LossType())} + + def __init__(self, num_inputs=2, weights=None): + # Store number of inputs/losses. + self._num_losses = num_inputs + if weights is not None and len(weights) != num_inputs: + raise ValueError("Length of weights should be equal to the number of inputs (num_inputs)") + + self._weights = weights + LossNM.__init__(self) + + def _loss_function(self, **kwargs): + values = [kwargs[x] for x in sorted(kwargs.keys())] + loss = torch.zeros_like(values[0]) + for loss_idx, loss_value in enumerate(values): + if self._weights is not None: + loss = loss.add(loss_value, alpha=self._weights[loss_idx]) + else: + loss = loss.add(loss_value) + return loss diff --git a/nemo/backends/pytorch/common/rnn.py b/nemo/backends/pytorch/common/rnn.py index eb9eb7e1e246..c1c62ac08c35 100644 --- a/nemo/backends/pytorch/common/rnn.py +++ b/nemo/backends/pytorch/common/rnn.py @@ -71,11 +71,7 @@ def input_ports(self): """Returns definitions of module input ports. 
""" return { - # 'targets': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), 'targets': NeuralType(('B', 'T'), LabelsType()), - # 'encoder_outputs': NeuralType( - # {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}, optional=True, - # ), 'encoder_outputs': NeuralType(('B', 'T', 'D'), ChannelType(), True), } @@ -85,11 +81,7 @@ def output_ports(self): """Returns definitions of module output ports. """ return { - # 'log_probs': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), 'log_probs': NeuralType(('B', 'T', 'D'), LogprobsType()), - # 'attention_weights': NeuralType( - # {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(TimeTag),}, optional=True, - # ), 'attention_weights': NeuralType(('B', 'T', 'T'), ChannelType(), True), } @@ -211,8 +203,6 @@ def input_ports(self): """Returns definitions of module input ports. """ return { - # 'inputs': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # 'input_lens': NeuralType({0: AxisType(BatchTag),}, optional=True), 'inputs': NeuralType(('B', 'T'), ChannelType()), 'input_lens': NeuralType(tuple('B'), LengthsType()), } @@ -223,8 +213,6 @@ def output_ports(self): """Returns definitions of module output ports. """ return { - # 'outputs': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # 'hidden': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), 'outputs': NeuralType(('B', 'T', 'D'), ChannelType()), 'hidden': NeuralType(('B', 'T', 'D'), ChannelType()), } diff --git a/nemo/collections/asr/__init__.py b/nemo/collections/asr/__init__.py index c8f86eb6acc1..29e1df9df347 100644 --- a/nemo/collections/asr/__init__.py +++ b/nemo/collections/asr/__init__.py @@ -19,7 +19,7 @@ from .jasper import JasperDecoderForClassification, JasperDecoderForCTC, JasperEncoder from .las.misc import JasperRNNConnector from .losses import CTCLossNM -from nemo.backends.pytorch.common.losses import CrossEntropyLoss as CrossEntropyLossNM +from nemo.backends.pytorch.common.losses import CrossEntropyLossNM from nemo.core import Backend __all__ = [ diff --git a/nemo/collections/nlp/data/datasets/__init__.py b/nemo/collections/nlp/data/datasets/__init__.py index 8e598e5655d3..4fc8431f5e33 100644 --- a/nemo/collections/nlp/data/datasets/__init__.py +++ b/nemo/collections/nlp/data/datasets/__init__.py @@ -14,9 +14,8 @@ # limitations under the License. 
# ============================================================================= -from nemo.collections.nlp.data.datasets import datasets_utils -from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import GLUEDataset -from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import ( +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset import GLUEDataset +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.joint_intent_slot_dataset import ( BertJointIntentSlotDataset, BertJointIntentSlotInferDataset, ) @@ -26,12 +25,12 @@ ) from nemo.collections.nlp.data.datasets.lm_transformer_dataset import LanguageModelingDataset from nemo.collections.nlp.data.datasets.machine_translation_dataset import TranslationDataset +from nemo.collections.nlp.data.datasets.multiwoz_dataset import * from nemo.collections.nlp.data.datasets.punctuation_capitalization_dataset import ( BertPunctuationCapitalizationDataset, BertPunctuationCapitalizationInferDataset, ) -from nemo.collections.nlp.data.datasets.qa_squad_dataset import SquadDataset -from nemo.collections.nlp.data.datasets.state_tracking_trade_dataset import * +from nemo.collections.nlp.data.datasets.qa_squad_dataset.qa_squad_dataset import SquadDataset from nemo.collections.nlp.data.datasets.text_classification_dataset import BertTextClassificationDataset from nemo.collections.nlp.data.datasets.token_classification_dataset import ( BertTokenClassificationDataset, diff --git a/nemo/collections/nlp/data/datasets/datasets_utils.py b/nemo/collections/nlp/data/datasets/datasets_utils.py deleted file mode 100644 index 8f4e0640245d..000000000000 --- a/nemo/collections/nlp/data/datasets/datasets_utils.py +++ /dev/null @@ -1,990 +0,0 @@ -# ============================================================================= -# Copyright 2020 NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================= - -import csv -import glob -import json -import os -import random -import re -import shutil -import string -import subprocess -from collections import Counter - -import numpy as np -from tqdm import tqdm - -from nemo import logging -from nemo.collections.nlp.utils.callback_utils import list2str -from nemo.collections.nlp.utils.common_nlp_utils import ( - get_vocab, - ids2text, - if_exist, - write_vocab, - write_vocab_in_order, -) - -__all__ = [ - 'get_label_stats', - 'process_sst_2', - 'process_imdb', - 'process_thucnews', - 'process_nlu', - 'process_twitter_airline', - 'process_atis', - 'process_jarvis_datasets', - 'process_mturk', - 'process_intent_slot_mturk', - 'get_intents_mturk', - 'get_slot_labels', - 'merge', - 'get_intent_query_files_dialogflow', - 'get_intents_slots_dialogflow', - 'get_slots_dialogflow', - 'partition_data', - 'write_files', - 'process_dialogflow', - 'write_data', - 'create_dataset', - 'read_csv', - 'process_snips', - 'get_dataset', - 'partition', - 'map_entities', - 'get_entities', - 'get_data', - 'reverse_dict', - 'get_intent_labels', - 'download_wkt2', - 'normalize_answer', - 'get_tokens', -] - -DATABASE_EXISTS_TMP = '{} dataset has already been processed and stored at {}' -MODE_EXISTS_TMP = '{} mode of {} dataset has already been processed and stored at {}' - - -def get_label_stats(labels, outfile='stats.tsv'): - labels = Counter(labels) - total = sum(labels.values()) - out = open(outfile, 'w') - i = 0 - label_frequencies = labels.most_common() - for k, v in label_frequencies: - out.write(f'{k}\t{v / total}\n') - if i < 3: - logging.info(f'{i} item: {k}, {v} out of {total}, {v / total}.') - i += 1 - return total, label_frequencies - - -def process_sst_2(data_dir): - if not os.path.exists(data_dir): - link = 'https://gluebenchmark.com/tasks' - raise ValueError(f'Data not found at {data_dir}. ' f'Please download SST-2 from {link}.') - logging.info('Keep in mind that SST-2 is only available in lower case.') - return data_dir - - -def process_imdb(data_dir, uncased, modes=['train', 'test']): - if not os.path.exists(data_dir): - link = 'www.kaggle.com/iarunava/imdb-movie-reviews-dataset' - raise ValueError(f'Data not found at {data_dir}. ' f'Please download IMDB from {link}.') - - outfold = f'{data_dir}/nemo-processed' - - if uncased: - outfold = f'{outfold}_uncased' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('IMDB', outfold)) - return outfold - logging.info(f'Processing IMDB dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - for sent in ['neg', 'pos']: - if sent == 'neg': - label = 0 - else: - label = 1 - files = glob.glob(f'{data_dir}/{mode}/{sent}/*.txt') - for file in files: - with open(file, 'r') as f: - review = f.read().strip() - if uncased: - review = review.lower() - review = review.replace("
", "") - outfiles[mode].write(f'{review}\t{label}\n') - for mode in modes: - outfiles[mode].close() - - return outfold - - -def process_thucnews(data_dir): - modes = ['train', 'test'] - train_size = 0.8 - if not os.path.exists(data_dir): - link = 'thuctc.thunlp.org/' - raise ValueError(f'Data not found at {data_dir}. ' f'Please download THUCNews from {link}.') - - outfold = f'{data_dir}/nemo-processed-thucnews' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('THUCNews', outfold)) - return outfold - logging.info(f'Processing THUCNews dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'a+', encoding='utf-8') - outfiles[mode].write('sentence\tlabel\n') - categories = ['体育', '娱乐', '家居', '彩票', '房产', '教育', '时尚', '时政', '星座', '游戏', '社会', '科技', '股票', '财经'] - for category in categories: - label = categories.index(category) - category_files = glob.glob(f'{data_dir}/{category}/*.txt') - test_num = int(len(category_files) * (1 - train_size)) - test_files = category_files[:test_num] - train_files = category_files[test_num:] - for mode in modes: - logging.info(f'Processing {mode} data of the category {category}') - if mode == 'test': - files = test_files - else: - files = train_files - for file in tqdm(files): - with open(file, 'r', encoding='utf-8') as f: - news = f.read().strip().replace('\r', '') - news = news.replace('\n', '').replace('\t', ' ') - outfiles[mode].write(f'{news}\t{label}\n') - for mode in modes: - outfiles[mode].close() - - return outfold - - -def process_nlu(filename, uncased, modes=['train', 'test'], dataset_name='nlu-ubuntu'): - """ Dataset has to be of: - - ubuntu - - chat - - web - """ - - if not os.path.exists(filename): - link = 'https://github.com/sebischair/NLU-Evaluation-Corpora' - raise ValueError(f'Data not found at {filename}. 
' f'Please download IMDB from {link}.') - - if dataset_name == 'nlu-ubuntu': - INTENT = {'makeupdate': 1, 'setupprinter': 2, 'shutdowncomputer': 3, 'softwarerecommendation': 4, 'none': 0} - elif dataset_name == 'nlu-chat': - INTENT = {'departuretime': 0, 'findconnection': 1} - elif dataset_name == 'nlu-web': - INTENT = { - 'changepassword': 1, - 'deleteaccount': 2, - 'downloadvideo': 3, - 'exportdata': 4, - 'filterspam': 5, - 'findalternative': 6, - 'syncaccounts': 7, - 'none': 0, - } - else: - raise ValueError(f'{dataset_name}: Invalid dataset name') - - infold = filename[: filename.rfind('/')] - outfold = f'{infold}/{dataset_name}-nemo-processed' - - if uncased: - outfold = f'{outfold}_uncased' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format(dataset_name.upper(), outfold)) - return outfold - logging.info(f'Processing data and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - - with open(filename, 'r') as f: - data = json.load(f) - - for obj in data['sentences']: - sentence = obj['text'].strip() - if uncased: - sentence = sentence.lower() - intent = obj['intent'].lower().replace(' ', '') - label = INTENT[intent] - txt = f'{sentence}\t{label}\n' - if obj['training']: - outfiles['train'].write(txt) - else: - outfiles['test'].write(txt) - for mode in modes: - outfiles[mode].close() - return outfold - - -def process_twitter_airline(filename, uncased, modes=['train', 'test']): - """ Dataset from Kaggle: - https://www.kaggle.com/crowdflower/twitter-airline-sentiment - """ - pass - - -def process_atis(infold, uncased, modes=['train', 'test'], dev_split=0): - """ MSFT's dataset, processed by Kaggle - https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk - """ - outfold = f'{infold}/nemo-processed' - vocab = get_vocab(f'{infold}/atis.dict.vocab.csv') - - if uncased: - outfold = f'{outfold}-uncased' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold)) - return outfold - logging.info(f'Processing ATIS dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') - - queries = open(f'{infold}/atis.{mode}.query.csv', 'r').readlines() - intents = open(f'{infold}/atis.{mode}.intent.csv', 'r').readlines() - slots = open(f'{infold}/atis.{mode}.slots.csv', 'r').readlines() - - for i, query in enumerate(queries): - sentence = ids2text(query.strip().split()[1:-1], vocab) - outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n') - slot = ' '.join(slots[i].strip().split()[1:-1]) - outfiles[mode + '_slots'].write(slot + '\n') - - shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv') - shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv') - for mode in modes: - outfiles[mode].close() - - return outfold - - -def process_jarvis_datasets(infold, uncased, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False): - """ process and convert Jarvis datasets into NeMo's BIO format - """ - outfold = f'{infold}/{dataset_name}-nemo-processed' - infold = f'{infold}/' - - if uncased: - outfold = f'{outfold}-uncased' - - if 
if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']): - logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold)) - return outfold - - logging.info(f'Processing {dataset_name} dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - intents_list = {} - slots_list = {} - slots_list_all = {} - - outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w') - outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w') - - outfiles['dict_slots'].write('O\n') - slots_list["O"] = 0 - slots_list_all["O"] = 0 - - for mode in modes: - if if_exist(outfold, [f'{mode}.tsv']): - logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode)) - continue - - if not if_exist(infold, [f'{mode}.tsv']): - logging.info(f'{mode} mode of {dataset_name}' f' is skipped as it was not found.') - continue - - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') - - queries = open(f'{infold}/{mode}.tsv', 'r').readlines() - - for i, query in enumerate(queries): - line_splits = query.strip().split("\t") - if len(line_splits) == 3: - intent_str, slot_tags_str, sentence = line_splits - else: - intent_str, sentence = line_splits - slot_tags_str = "" - - if intent_str not in intents_list: - intents_list[intent_str] = len(intents_list) - outfiles['dict_intents'].write(f'{intent_str}\n') - - if ignore_prev_intent: - start_token = 2 - else: - start_token = 1 - sentence_cld = " ".join(sentence.strip().split()[start_token:-1]) - outfiles[mode].write(f'{sentence_cld}\t' f'{str(intents_list[intent_str])}\n') - - slot_tags_list = [] - if slot_tags_str.strip(): - slot_tags = slot_tags_str.strip().split(",") - for st in slot_tags: - if not st.strip(): - continue - [start_i, end_i, slot_name] = st.strip().split(":") - slot_tags_list.append([int(start_i), int(end_i), slot_name]) - if slot_name not in slots_list: - slots_list[slot_name] = len(slots_list) - slots_list_all[f'B-{slot_name}'] = len(slots_list_all) - slots_list_all[f'I-{slot_name}'] = len(slots_list_all) - outfiles['dict_slots'].write(f'B-{slot_name}\n') - outfiles['dict_slots'].write(f'I-{slot_name}\n') - - slot_tags_list.sort(key=lambda x: x[0]) - slots = [] - processed_index = 0 - for tag_start, tag_end, tag_str in slot_tags_list: - if tag_start > processed_index: - words_list = sentence[processed_index:tag_start].strip().split() - slots.extend([str(slots_list_all['O'])] * len(words_list)) - words_list = sentence[tag_start:tag_end].strip().split() - slots.append(str(slots_list_all[f'B-{tag_str}'])) - slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1)) - processed_index = tag_end - - if processed_index < len(sentence): - words_list = sentence[processed_index:].strip().split() - slots.extend([str(slots_list_all['O'])] * len(words_list)) - - slots = slots[1:-1] - slot = ' '.join(slots) - outfiles[mode + '_slots'].write(slot + '\n') - - outfiles[mode + '_slots'].close() - outfiles[mode].close() - - outfiles['dict_slots'].close() - outfiles['dict_intents'].close() - - return outfold - - -def process_mturk(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): - if not os.path.exists(data_dir): - link = 'www.mturk.com' - raise ValueError( - f'Data not found at {data_dir}. ' f'Export your mturk data from' f'{link} and unzip at {data_dir}.' 
- ) - - outfold = f'{data_dir}/nemo-processed' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('mturk', outfold)) - return outfold - - logging.info(f'Processing dataset from mturk and storing at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - classification_data_file = f'{data_dir}/classification.csv' - annotation_data_file = f'{data_dir}/annotation.manifest' - - if not os.path.exists(classification_data_file): - raise FileNotFoundError(f'File not found ' f'at {classification_data_file}') - - if not os.path.exists(annotation_data_file): - raise FileNotFoundError(f'File not found at {annotation_data_file}') - - utterances = [] - utterances = read_csv(classification_data_file) - - # This function assumes that the intent classification data has been - # reviewed and cleaned and only one label per utterance is present. - agreed_all, intent_names = get_intents_mturk(utterances, outfold) - - with open(annotation_data_file, 'r') as f: - slot_annotations = f.readlines() - - # This function assumes that the preprocess step would have made - # the task_name of all the annotations generic - task_name = 'retail-combined' - - # It is assumed that every utterances will have corresponding - # slot annotation information - if len(slot_annotations) < len(agreed_all): - raise ValueError(f'Every utterance must have corresponding' f'slot annotation information') - - slot_labels, intent_queries, slot_tags = process_intent_slot_mturk( - slot_annotations, agreed_all, intent_names, task_name - ) - - assert len(slot_tags) == len(intent_queries) - - dev_split = 0.1 - - train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) - - write_files(train_queries, f'{outfold}/train.tsv') - write_files(train_slots, f'{outfold}/train_slots.tsv') - - write_files(test_queries, f'{outfold}/test.tsv') - write_files(test_slots, f'{outfold}/test_slots.tsv') - - write_files(slot_labels, f'{outfold}/dict.slots.csv') - write_files(intent_names, f'{outfold}/dict.intents.csv') - - return outfold - - -def process_intent_slot_mturk(slot_annotations, agreed_all, intent_names, task_name): - slot_tags = [] - inorder_utterances = [] - all_labels = get_slot_labels(slot_annotations, task_name) - logging.info(f'agreed_all - {len(agreed_all)}') - logging.info(f'Slot annotations - {len(slot_annotations)}') - - for annotation in slot_annotations[0:]: - an = json.loads(annotation) - utterance = an['source'] - if len(utterance) > 2 and utterance.startswith('"') and utterance.endswith('"'): - utterance = utterance[1:-1] - - if utterance in agreed_all: - entities = {} - annotated_entities = an[task_name]['annotations']['entities'] - for i, each_anno in enumerate(annotated_entities): - entities[int(each_anno['startOffset'])] = i - - lastptr = 0 - slotlist = [] - # sorting annotations by the start offset - for i in sorted(entities.keys()): - annotated_entities = an[task_name]['annotations']['entities'] - tags = annotated_entities[entities.get(i)] - untagged_words = utterance[lastptr : tags['startOffset']] - for _ in untagged_words.split(): - slotlist.append(all_labels.get('O')) - anno_words = utterance[tags['startOffset'] : tags['endOffset']] - # tagging with the IOB format. 
- for j, _ in enumerate(anno_words.split()): - if j == 0: - b_slot = 'B-' + tags['label'] - slotlist.append(all_labels.get(b_slot)) - else: - i_slot = 'I-' + tags['label'] - slotlist.append(all_labels.get(i_slot)) - lastptr = tags['endOffset'] - - untagged_words = utterance[lastptr : len(utterance)] - for _ in untagged_words.split(): - slotlist.append(all_labels.get('O')) - - slotstr = ' '.join(slotlist) - slotstr = f'{slotstr.strip()}\n' - - slot_tags.append(slotstr) - intent_num = intent_names.get(agreed_all.get(utterance)) - query_text = f'{utterance.strip()}\t{intent_num}\n' - inorder_utterances.append(query_text) - # else: - # logging.warning(utterance) - - logging.info(f'inorder utterances - {len(inorder_utterances)}') - - return all_labels, inorder_utterances, slot_tags - - -def get_intents_mturk(utterances, outfold): - intent_names = {} - intent_count = 0 - - agreed_all = {} - - logging.info('Printing all intent_labels') - intent_dict = f'{outfold}/dict.intents.csv' - if os.path.exists(intent_dict): - with open(intent_dict, 'r') as f: - for intent_name in f.readlines(): - intent_names[intent_name.strip()] = intent_count - intent_count += 1 - logging.info(intent_names) - - for i, utterance in enumerate(utterances[1:]): - - if utterance[1] not in agreed_all: - agreed_all[utterance[0]] = utterance[1] - - if utterance[1] not in intent_names: - intent_names[utterance[1]] = intent_count - intent_count += 1 - - logging.info(f'Total number of utterance samples: {len(agreed_all)}') - - return agreed_all, intent_names - - -def get_slot_labels(slot_annotations, task_name): - slot_labels = json.loads(slot_annotations[0]) - - all_labels = {} - count = 0 - # Generating labels with the IOB format. - for label in slot_labels[task_name]['annotations']['labels']: - b_slot = 'B-' + label['label'] - i_slot = 'I-' + label['label'] - all_labels[b_slot] = str(count) - count += 1 - all_labels[i_slot] = str(count) - count += 1 - all_labels['O'] = str(count) - - return all_labels - - -def merge(data_dir, subdirs, dataset_name, modes=['train', 'test']): - outfold = f'{data_dir}/{dataset_name}' - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold)) - slots = get_vocab(f'{outfold}/dict.slots.csv') - none_slot = 0 - for key in slots: - if slots[key] == 'O': - none_slot = key - break - return outfold, int(none_slot) - - os.makedirs(outfold, exist_ok=True) - - data_files, slot_files = {}, {} - for mode in modes: - data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w') - data_files[mode].write('sentence\tlabel\n') - slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w') - - intents, slots = {}, {} - intent_shift, slot_shift = 0, 0 - none_intent, none_slot = -1, -1 - - for subdir in subdirs: - curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv') - curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv') - - for key in curr_intents: - if intent_shift > 0 and curr_intents[key] == 'O': - continue - if curr_intents[key] == 'O' and intent_shift == 0: - none_intent = int(key) - intents[int(key) + intent_shift] = curr_intents[key] - - for key in curr_slots: - if slot_shift > 0 and curr_slots[key] == 'O': - continue - if slot_shift == 0 and curr_slots[key] == 'O': - none_slot = int(key) - slots[int(key) + slot_shift] = curr_slots[key] - - for mode in modes: - with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f: - for line in f.readlines()[1:]: - text, label = line.strip().split('\t') - label = int(label) - if curr_intents[label] 
== 'O': - label = none_intent - else: - label = label + intent_shift - data_files[mode].write(f'{text}\t{label}\n') - - with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f: - for line in f.readlines(): - labels = [int(label) for label in line.strip().split()] - shifted_labels = [] - for label in labels: - if curr_slots[label] == 'O': - shifted_labels.append(none_slot) - else: - shifted_labels.append(label + slot_shift) - slot_files[mode].write(list2str(shifted_labels) + '\n') - - intent_shift += len(curr_intents) - slot_shift += len(curr_slots) - - write_vocab_in_order(intents, f'{outfold}/dict.intents.csv') - write_vocab_in_order(slots, f'{outfold}/dict.slots.csv') - return outfold, none_slot - - -def get_intent_query_files_dialogflow(path): - fileslist = [] - for root, _, files in os.walk(path): - for file in files: - if '_usersays_en.json' in file: - fileslist.append(os.path.join(root, file)) - return fileslist - - -def get_intents_slots_dialogflow(files, slot_labels): - intent_names = [] - intent_queries = [] - slot_tags = [] - - for index, file in enumerate(files): - intent_names.append(os.path.basename(file).split('_usersays')[0]) - - with open(file) as json_file: - intent_data = json.load(json_file) - for query in intent_data: - query_text = "" - slots = "" - for segment in query['data']: - query_text = ''.join([query_text, segment['text']]) - if 'alias' in segment: - for _ in segment['text'].split(): - slots = ' '.join([slots, slot_labels.get(segment['alias'])]) - else: - for _ in segment['text'].split(): - slots = ' '.join([slots, slot_labels.get('O')]) - query_text = f'{query_text.strip()}\t{index}\n' - intent_queries.append(query_text) - slots = f'{slots.strip()}\n' - slot_tags.append(slots) - return intent_queries, intent_names, slot_tags - - -def get_slots_dialogflow(files): - slot_labels = {} - count = 0 - for file in files: - intent_head_file = ''.join([file.split('_usersays')[0], '.json']) - with open(intent_head_file) as json_file: - intent_meta_data = json.load(json_file) - for params in intent_meta_data['responses'][0]['parameters']: - if params['name'] not in slot_labels: - slot_labels[params['name']] = str(count) - count += 1 - slot_labels['O'] = str(count) - return slot_labels - - -def partition_data(intent_queries, slot_tags, split=0.1): - n = len(intent_queries) - n_dev = int(n * split) - dev_idx = set(random.sample(range(n), n_dev)) - dev_intents, dev_slots, train_intents, train_slots = [], [], [], [] - - dev_intents.append('sentence\tlabel\n') - train_intents.append('sentence\tlabel\n') - - for i, item in enumerate(intent_queries): - if i in dev_idx: - dev_intents.append(item) - dev_slots.append(slot_tags[i]) - else: - train_intents.append(item) - train_slots.append(slot_tags[i]) - return train_intents, train_slots, dev_intents, dev_slots - - -def write_files(data, outfile): - with open(outfile, 'w') as f: - for item in data: - item = f'{item.strip()}\n' - f.write(item) - - -def process_dialogflow(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): - if not os.path.exists(data_dir): - link = 'www.dialogflow.com' - raise ValueError( - f'Data not found at {data_dir}. ' f'Export your dialogflow data from' f'{link} and unzip at {data_dir}.' - ) - - outfold = f'{data_dir}/dialogflow/nemo-processed' - - '''TO DO - check for nemo-processed directory - already exists. If exists, skip the entire creation steps below. 
''' - - os.makedirs(outfold, exist_ok=True) - - files = get_intent_query_files_dialogflow(data_dir) - - slot_labels = get_slots_dialogflow(files) - - intent_queries, intent_names, slot_tags = get_intents_slots_dialogflow(files, slot_labels) - - train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) - - write_files(train_queries, f'{outfold}/train.tsv') - write_files(train_slots, f'{outfold}/train_slots.tsv') - - write_files(test_queries, f'{outfold}/test.tsv') - write_files(test_slots, f'{outfold}/test_slots.tsv') - - write_files(slot_labels, f'{outfold}/dict.slots.csv') - write_files(intent_names, f'{outfold}/dict.intents.csv') - - return outfold - - -def write_data(data, slot_dict, intent_dict, outfold, mode, uncased): - intent_file = open(f'{outfold}/{mode}.tsv', 'w') - intent_file.write('sentence\tlabel\n') - slot_file = open(f'{outfold}/{mode}_slots.tsv', 'w') - for tokens, slots, intent in data: - text = ' '.join(tokens) - if uncased: - text = text.lower() - intent_file.write(f'{text}\t{intent_dict[intent]}\n') - slots = [str(slot_dict[slot]) for slot in slots] - slot_file.write(' '.join(slots) + '\n') - intent_file.close() - slot_file.close() - - -def create_dataset(train, dev, slots, intents, uncased, outfold): - os.makedirs(outfold, exist_ok=True) - if 'O' in slots: - slots.remove('O') - slots = sorted(list(slots)) + ['O'] - intents = sorted(list(intents)) - slots = write_vocab(slots, f'{outfold}/dict.slots.csv') - intents = write_vocab(intents, f'{outfold}/dict.intents.csv') - write_data(train, slots, intents, outfold, 'train', uncased) - write_data(dev, slots, intents, outfold, 'test', uncased) - - -def read_csv(file_path): - rows = [] - with open(file_path, 'r') as csvfile: - read_csv = csv.reader(csvfile, delimiter=',') - for row in read_csv: - rows.append(row) - return rows - - -def process_snips(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): - if not os.path.exists(data_dir): - link = 'www.github.com/snipsco/spoken-language' - '-understanding-research-datasets' - raise ValueError(f'Data not found at {data_dir}. 
' f'Resquest to download the SNIPS dataset from {link}.') - - outfold = f'{data_dir}/nemo-processed' - - if uncased: - outfold = f'{outfold}-uncased' - - exist = True - for dataset in ['light', 'speak', 'all']: - if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]): - logging.info(DATABASE_EXISTS_TMP.format('SNIPS-' + dataset.upper(), outfold)) - else: - exist = False - if exist: - return outfold - - logging.info(f'Processing SNIPS dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - speak_dir = 'smart-speaker-en-close-field' - light_dir = 'smart-lights-en-close-field' - - light_files = [f'{data_dir}/{light_dir}/dataset.json'] - speak_files = [f'{data_dir}/{speak_dir}/training_dataset.json'] - speak_files.append(f'{data_dir}/{speak_dir}/test_dataset.json') - - light_train, light_dev, light_slots, light_intents = get_dataset(light_files, dev_split) - speak_train, speak_dev, speak_slots, speak_intents = get_dataset(speak_files) - - create_dataset(light_train, light_dev, light_slots, light_intents, uncased, f'{outfold}/light') - create_dataset(speak_train, speak_dev, speak_slots, speak_intents, uncased, f'{outfold}/speak') - create_dataset( - light_train + speak_train, - light_dev + speak_dev, - light_slots | speak_slots, - light_intents | speak_intents, - uncased, - f'{outfold}/all', - ) - - return outfold - - -def get_dataset(files, dev_split=0.1): - entity2value, value2entity = get_entities(files) - data, slots, intents = get_data(files, entity2value, value2entity) - if len(data) == 1: - train, dev = partition(data[0], split=dev_split) - else: - train, dev = data[0], data[1] - return train, dev, slots, intents - - -def partition(data, split=0.1): - n = len(data) - n_dev = int(n * split) - dev_idx = set(random.sample(range(n), n_dev)) - dev, train = [], [] - - for i, item in enumerate(data): - if i in dev_idx: - dev.append(item) - else: - train.append(item) - return train, dev - - -def map_entities(entity2value, entities): - for key in entities: - if 'data' in entities[key]: - if key not in entity2value: - entity2value[key] = set([]) - - values = [] - for value in entities[key]['data']: - values.append(value['value']) - values.extend(value['synonyms']) - entity2value[key] = entity2value[key] | set(values) - - return entity2value - - -def get_entities(files): - entity2value = {} - for file in files: - with open(file, 'r') as json_file: - data = json.load(json_file) - entity2value = map_entities(entity2value, data['entities']) - - value2entity = reverse_dict(entity2value) - return entity2value, value2entity - - -def get_data(files, entity2value, value2entity): - all_data, all_slots, all_intents = [], set(['O']), set() - for file in files: - file_data = [] - with open(file, 'r') as json_file: - data = json.load(json_file) - for intent in data['intents']: - all_intents.add(intent) - utterances = data['intents'][intent]['utterances'] - for utterance in utterances: - tokens, slots = [], [] - for frag in utterance['data']: - frag_tokens = frag['text'].strip().split() - tokens.extend(frag_tokens) - if 'slot_name' not in frag: - slot = 'O' - else: - slot = frag['slot_name'] - all_slots.add(slot) - slots.extend([slot] * len(frag_tokens)) - file_data.append((tokens, slots, intent)) - all_data.append(file_data) - return all_data, all_slots, all_intents - - -def reverse_dict(entity2value): - value2entity = {} - for entity in entity2value: - for value in entity2value[entity]: - value2entity[value] = entity - return value2entity - - -def 
get_intent_labels(intent_file): - labels = {} - label = 0 - with open(intent_file, 'r') as f: - for line in f: - intent = line.strip() - labels[intent] = label - label += 1 - return labels - - -def download_wkt2(data_dir): - if os.path.exists(data_dir): - return - os.makedirs('data/lm', exist_ok=True) - logging.warning(f'Data not found at {data_dir}. ' f'Downloading wikitext-2 to data/lm') - data_dir = 'data/lm/wikitext-2' - subprocess.call('../scripts/get_wkt2.sh') - return data_dir - - -def normalize_answer(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - - def remove_articles(text): - return re.sub(r'\b(a|an|the)\b', ' ', text) - - def white_space_fix(text): - return ' '.join(text.split()) - - def remove_punc(text): - exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) - - def lower(text): - return text.lower() - - return white_space_fix(remove_articles(remove_punc(lower(s)))) - - -def get_tokens(s): - if not s: - return [] - return normalize_answer(s).split() - - -def get_stats(lengths): - lengths = np.asarray(lengths) - logging.info( - f'Min: {np.min(lengths)} | \ - Max: {np.max(lengths)} | \ - Mean: {np.mean(lengths)} | \ - Median: {np.median(lengths)}' - ) - logging.info(f'75 percentile: {np.percentile(lengths, 75)}') - logging.info(f'99 percentile: {np.percentile(lengths, 99)}') diff --git a/nemo/collections/nlp/utils/loss_utils.py b/nemo/collections/nlp/data/datasets/datasets_utils/__init__.py similarity index 57% rename from nemo/collections/nlp/utils/loss_utils.py rename to nemo/collections/nlp/data/datasets/datasets_utils/__init__.py index a4d3da6ef10f..01b4fb116b57 100644 --- a/nemo/collections/nlp/utils/loss_utils.py +++ b/nemo/collections/nlp/data/datasets/datasets_utils/__init__.py @@ -14,29 +14,7 @@ # limitations under the License. # ============================================================================= -import math - -__all__ = ['_compute_softmax'] - - -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import * +from nemo.collections.nlp.data.datasets.datasets_utils.datasets_processing import * +from nemo.collections.nlp.data.datasets.datasets_utils.dialogflow_utils import * +from nemo.collections.nlp.data.datasets.datasets_utils.mturk_utils import * diff --git a/nemo/collections/nlp/data/datasets/datasets_utils/data_preprocessing.py b/nemo/collections/nlp/data/datasets/datasets_utils/data_preprocessing.py new file mode 100644 index 000000000000..7bc816f2bbd3 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/datasets_utils/data_preprocessing.py @@ -0,0 +1,332 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import csv +import json +import os +import pickle +import random +import re +import string +from collections import Counter + +import numpy as np + +from nemo import logging + +__all__ = [ + 'get_label_stats', + 'partition_data', + 'write_files', + 'write_data', + 'create_dataset', + 'read_csv', + 'get_dataset', + 'partition', + 'map_entities', + 'get_entities', + 'get_data', + 'reverse_dict', + 'get_intent_labels', + 'get_stats', + 'DATABASE_EXISTS_TMP', + 'MODE_EXISTS_TMP', + 'is_whitespace', + 'write_vocab', + 'if_exist', + 'remove_punctuation_from_sentence', + 'dataset_to_ids', + 'calc_class_weights', +] + +DATABASE_EXISTS_TMP = '{} dataset has already been processed and stored at {}' +MODE_EXISTS_TMP = '{} mode of {} dataset has already been processed and stored at {}' + + +def get_label_stats(labels, outfile='stats.tsv'): + ''' + + Args: + labels: list of all labels + outfile: path to the file where to save label stats + + Returns: + total (int): total number of labels + label_frequencies (list of tuples): each tuple represent (label, label frequency) + ''' + labels = Counter(labels) + total = sum(labels.values()) + out = open(outfile, 'w') + i = 0 + label_frequencies = labels.most_common() + for k, v in label_frequencies: + out.write(f'{k}\t{v / total}\n') + if i < 3: + logging.info(f'{i} item: {k}, {v} out of {total}, {v / total}.') + i += 1 + return total, label_frequencies + + +def partition_data(intent_queries, slot_tags, split=0.1): + n = len(intent_queries) + n_dev = int(n * split) + dev_idx = set(random.sample(range(n), n_dev)) + dev_intents, dev_slots, train_intents, train_slots = [], [], [], [] + + dev_intents.append('sentence\tlabel\n') + train_intents.append('sentence\tlabel\n') + + for i, item in enumerate(intent_queries): + if i in dev_idx: + dev_intents.append(item) + dev_slots.append(slot_tags[i]) + else: + train_intents.append(item) + train_slots.append(slot_tags[i]) + return train_intents, train_slots, dev_intents, dev_slots + + +def write_files(data, outfile): + with open(outfile, 'w') as f: + for item in data: + item = f'{item.strip()}\n' + f.write(item) + + +def write_data(data, slot_dict, intent_dict, outfold, mode, uncased): + intent_file = open(f'{outfold}/{mode}.tsv', 'w') + intent_file.write('sentence\tlabel\n') + slot_file = open(f'{outfold}/{mode}_slots.tsv', 'w') + for tokens, slots, intent in data: + text = ' '.join(tokens) + if uncased: + text = text.lower() + intent_file.write(f'{text}\t{intent_dict[intent]}\n') + slots = [str(slot_dict[slot]) for slot in slots] + slot_file.write(' '.join(slots) + '\n') + intent_file.close() + slot_file.close() + + +def create_dataset(train, dev, slots, intents, uncased, outfold): + os.makedirs(outfold, exist_ok=True) + if 'O' in slots: + slots.remove('O') + slots = sorted(list(slots)) + ['O'] + intents = sorted(list(intents)) + slots = write_vocab(slots, f'{outfold}/dict.slots.csv') + intents = write_vocab(intents, f'{outfold}/dict.intents.csv') + write_data(train, slots, intents, outfold, 'train', uncased) + write_data(dev, 
slots, intents, outfold, 'test', uncased) + + +def read_csv(file_path): + rows = [] + with open(file_path, 'r') as csvfile: + read_csv = csv.reader(csvfile, delimiter=',') + for row in read_csv: + rows.append(row) + return rows + + +def get_dataset(files, dev_split=0.1): + entity2value, value2entity = get_entities(files) + data, slots, intents = get_data(files, entity2value, value2entity) + if len(data) == 1: + train, dev = partition(data[0], split=dev_split) + else: + train, dev = data[0], data[1] + return train, dev, slots, intents + + +def partition(data, split=0.1): + n = len(data) + n_dev = int(n * split) + dev_idx = set(random.sample(range(n), n_dev)) + dev, train = [], [] + + for i, item in enumerate(data): + if i in dev_idx: + dev.append(item) + else: + train.append(item) + return train, dev + + +def map_entities(entity2value, entities): + for key in entities: + if 'data' in entities[key]: + if key not in entity2value: + entity2value[key] = set([]) + + values = [] + for value in entities[key]['data']: + values.append(value['value']) + values.extend(value['synonyms']) + entity2value[key] = entity2value[key] | set(values) + + return entity2value + + +def get_entities(files): + entity2value = {} + for file in files: + with open(file, 'r') as json_file: + data = json.load(json_file) + entity2value = map_entities(entity2value, data['entities']) + + value2entity = reverse_dict(entity2value) + return entity2value, value2entity + + +def get_data(files, entity2value, value2entity): + all_data, all_slots, all_intents = [], set(['O']), set() + for file in files: + file_data = [] + with open(file, 'r') as json_file: + data = json.load(json_file) + for intent in data['intents']: + all_intents.add(intent) + utterances = data['intents'][intent]['utterances'] + for utterance in utterances: + tokens, slots = [], [] + for frag in utterance['data']: + frag_tokens = frag['text'].strip().split() + tokens.extend(frag_tokens) + if 'slot_name' not in frag: + slot = 'O' + else: + slot = frag['slot_name'] + all_slots.add(slot) + slots.extend([slot] * len(frag_tokens)) + file_data.append((tokens, slots, intent)) + all_data.append(file_data) + return all_data, all_slots, all_intents + + +def reverse_dict(entity2value): + value2entity = {} + for entity in entity2value: + for value in entity2value[entity]: + value2entity[value] = entity + return value2entity + + +def get_intent_labels(intent_file): + labels = {} + label = 0 + with open(intent_file, 'r') as f: + for line in f: + intent = line.strip() + labels[intent] = label + label += 1 + return labels + + +def get_stats(lengths): + lengths = np.asarray(lengths) + logging.info( + f'Min: {np.min(lengths)} | \ + Max: {np.max(lengths)} | \ + Mean: {np.mean(lengths)} | \ + Median: {np.median(lengths)}' + ) + logging.info(f'75 percentile: {np.percentile(lengths, 75)}') + logging.info(f'99 percentile: {np.percentile(lengths, 99)}') + + +def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + +def write_vocab(items, outfile): + vocab = {} + idx = 0 + with open(outfile, 'w') as f: + for item in items: + f.write(item + '\n') + vocab[item] = idx + idx += 1 + return vocab + + +def if_exist(outfold, files): + if not os.path.exists(outfold): + return False + for file in files: + if not os.path.exists(f'{outfold}/{file}'): + return False + return True + + +def remove_punctuation_from_sentence(sentence): + sentence = re.sub('[' + string.punctuation + ']', '', sentence) + sentence = sentence.lower() + 
return sentence + + +def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True): + """ + Reads dataset from file line by line, tokenizes each line with tokenizer, + and returns list of lists which corresponds to ids of tokenized strings. + + Args: + dataset: path to dataset + tokenizer: tokenizer to convert text into ids + cache_ids: if True, ids are saved to disk as pickle file + with similar name (e.g., data.txt --> data.txt.pkl) + add_bos_eos: bool, whether to add and symbols (e.g., for NMT) + Returns: + ids: list of ids which correspond to tokenized strings of the dataset + """ + + cached_ids_dataset = dataset + str(".pkl") + if os.path.isfile(cached_ids_dataset): + logging.info("Loading cached tokenized dataset ...") + ids = pickle.load(open(cached_ids_dataset, "rb")) + else: + logging.info("Tokenizing dataset ...") + data = open(dataset, "rb").readlines() + ids = [] + for sentence in data: + sent_ids = tokenizer.text_to_ids(sentence.decode("utf-8")) + if add_bos_eos: + sent_ids = [tokenizer.bos_id] + sent_ids + [tokenizer.eos_id] + ids.append(sent_ids) + if cache_ids: + logging.info("Caching tokenized dataset ...") + pickle.dump(ids, open(cached_ids_dataset, "wb")) + return ids + + +def calc_class_weights(label_freq): + """ + Goal is to give more weight to the classes with less samples + so as to match the one with the higest frequency. We achieve this by + dividing the highest frequency by the freq of each label. + Example - + [12, 5, 3] -> [12/12, 12/5, 12/3] -> [1, 2.4, 4] + + Here label_freq is assumed to be sorted by the frequency. I.e. + label_freq[0] is the most frequent element. + + """ + + most_common_label_freq = label_freq[0] + weighted_slots = sorted([(index, most_common_label_freq[1] / freq) for (index, freq) in label_freq]) + return [weight for (_, weight) in weighted_slots] diff --git a/nemo/collections/nlp/data/datasets/datasets_utils/datasets_processing.py b/nemo/collections/nlp/data/datasets/datasets_utils/datasets_processing.py new file mode 100644 index 000000000000..2f90412ed200 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/datasets_utils/datasets_processing.py @@ -0,0 +1,431 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +import csv +import glob +import json +import os +import shutil + +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import ( + DATABASE_EXISTS_TMP, + MODE_EXISTS_TMP, + create_dataset, + get_dataset, + if_exist, +) +from nemo.collections.nlp.utils import get_vocab + +__all__ = [ + 'process_atis', + 'process_jarvis_datasets', + 'process_snips', + 'process_sst_2', + 'process_imdb', + 'process_nlu', + 'process_thucnews', +] + + +def ids2text(ids, vocab): + return ' '.join([vocab[int(id_)] for id_ in ids]) + + +def process_atis(infold, uncased, modes=['train', 'test'], dev_split=0): + """ MSFT's dataset, processed by Kaggle + https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk + """ + outfold = f'{infold}/nemo-processed' + vocab = get_vocab(f'{infold}/atis.dict.vocab.csv') + + if uncased: + outfold = f'{outfold}-uncased' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold)) + return outfold + logging.info(f'Processing ATIS dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') + + queries = open(f'{infold}/atis.{mode}.query.csv', 'r').readlines() + intents = open(f'{infold}/atis.{mode}.intent.csv', 'r').readlines() + slots = open(f'{infold}/atis.{mode}.slots.csv', 'r').readlines() + + for i, query in enumerate(queries): + sentence = ids2text(query.strip().split()[1:-1], vocab) + outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n') + slot = ' '.join(slots[i].strip().split()[1:-1]) + outfiles[mode + '_slots'].write(slot + '\n') + + shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv') + shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv') + for mode in modes: + outfiles[mode].close() + + return outfold + + +def process_jarvis_datasets(infold, uncased, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False): + """ process and convert Jarvis datasets into NeMo's BIO format + """ + outfold = f'{infold}/{dataset_name}-nemo-processed' + infold = f'{infold}/' + + if uncased: + outfold = f'{outfold}-uncased' + + if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']): + logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold)) + return outfold + + logging.info(f'Processing {dataset_name} dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + intents_list = {} + slots_list = {} + slots_list_all = {} + + outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w') + outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w') + + outfiles['dict_slots'].write('O\n') + slots_list["O"] = 0 + slots_list_all["O"] = 0 + + for mode in modes: + if if_exist(outfold, [f'{mode}.tsv']): + logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode)) + continue + + if not if_exist(infold, [f'{mode}.tsv']): + logging.info(f'{mode} mode of {dataset_name}' f' is skipped as it was not found.') + continue + + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') + + queries = open(f'{infold}/{mode}.tsv', 
'r').readlines() + + for i, query in enumerate(queries): + line_splits = query.strip().split("\t") + if len(line_splits) == 3: + intent_str, slot_tags_str, sentence = line_splits + else: + intent_str, sentence = line_splits + slot_tags_str = "" + + if intent_str not in intents_list: + intents_list[intent_str] = len(intents_list) + outfiles['dict_intents'].write(f'{intent_str}\n') + + if ignore_prev_intent: + start_token = 2 + else: + start_token = 1 + sentence_cld = " ".join(sentence.strip().split()[start_token:-1]) + outfiles[mode].write(f'{sentence_cld}\t' f'{str(intents_list[intent_str])}\n') + + slot_tags_list = [] + if slot_tags_str.strip(): + slot_tags = slot_tags_str.strip().split(",") + for st in slot_tags: + if not st.strip(): + continue + [start_i, end_i, slot_name] = st.strip().split(":") + slot_tags_list.append([int(start_i), int(end_i), slot_name]) + if slot_name not in slots_list: + slots_list[slot_name] = len(slots_list) + slots_list_all[f'B-{slot_name}'] = len(slots_list_all) + slots_list_all[f'I-{slot_name}'] = len(slots_list_all) + outfiles['dict_slots'].write(f'B-{slot_name}\n') + outfiles['dict_slots'].write(f'I-{slot_name}\n') + + slot_tags_list.sort(key=lambda x: x[0]) + slots = [] + processed_index = 0 + for tag_start, tag_end, tag_str in slot_tags_list: + if tag_start > processed_index: + words_list = sentence[processed_index:tag_start].strip().split() + slots.extend([str(slots_list_all['O'])] * len(words_list)) + words_list = sentence[tag_start:tag_end].strip().split() + slots.append(str(slots_list_all[f'B-{tag_str}'])) + slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1)) + processed_index = tag_end + + if processed_index < len(sentence): + words_list = sentence[processed_index:].strip().split() + slots.extend([str(slots_list_all['O'])] * len(words_list)) + + slots = slots[1:-1] + slot = ' '.join(slots) + outfiles[mode + '_slots'].write(slot + '\n') + + outfiles[mode + '_slots'].close() + outfiles[mode].close() + + outfiles['dict_slots'].close() + outfiles['dict_intents'].close() + + return outfold + + +def process_snips(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): + if not os.path.exists(data_dir): + link = 'www.github.com/snipsco/spoken-language' + '-understanding-research-datasets' + raise ValueError(f'Data not found at {data_dir}. 
' f'Resquest to download the SNIPS dataset from {link}.') + + outfold = f'{data_dir}/nemo-processed' + + if uncased: + outfold = f'{outfold}-uncased' + + exist = True + for dataset in ['light', 'speak', 'all']: + if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('SNIPS-' + dataset.upper(), outfold)) + else: + exist = False + if exist: + return outfold + + logging.info(f'Processing SNIPS dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + speak_dir = 'smart-speaker-en-close-field' + light_dir = 'smart-lights-en-close-field' + + light_files = [f'{data_dir}/{light_dir}/dataset.json'] + speak_files = [f'{data_dir}/{speak_dir}/training_dataset.json'] + speak_files.append(f'{data_dir}/{speak_dir}/test_dataset.json') + + light_train, light_dev, light_slots, light_intents = get_dataset(light_files, dev_split) + speak_train, speak_dev, speak_slots, speak_intents = get_dataset(speak_files) + + create_dataset(light_train, light_dev, light_slots, light_intents, uncased, f'{outfold}/light') + create_dataset(speak_train, speak_dev, speak_slots, speak_intents, uncased, f'{outfold}/speak') + create_dataset( + light_train + speak_train, + light_dev + speak_dev, + light_slots | speak_slots, + light_intents | speak_intents, + uncased, + f'{outfold}/all', + ) + + return outfold + + +def process_sst_2(data_dir): + if not os.path.exists(data_dir): + link = 'https://gluebenchmark.com/tasks' + raise ValueError(f'Data not found at {data_dir}. ' f'Please download SST-2 from {link}.') + logging.info('Keep in mind that SST-2 is only available in lower case.') + return data_dir + + +def process_imdb(data_dir, uncased, modes=['train', 'test']): + if not os.path.exists(data_dir): + link = 'www.kaggle.com/iarunava/imdb-movie-reviews-dataset' + raise ValueError(f'Data not found at {data_dir}. ' f'Please download IMDB from {link}.') + + outfold = f'{data_dir}/nemo-processed' + + if uncased: + outfold = f'{outfold}_uncased' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('IMDB', outfold)) + return outfold + logging.info(f'Processing IMDB dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + for sent in ['neg', 'pos']: + if sent == 'neg': + label = 0 + else: + label = 1 + files = glob.glob(f'{data_dir}/{mode}/{sent}/*.txt') + for file in files: + with open(file, 'r') as f: + review = f.read().strip() + if uncased: + review = review.lower() + review = review.replace("
", "") + outfiles[mode].write(f'{review}\t{label}\n') + for mode in modes: + outfiles[mode].close() + + return outfold + + +def process_thucnews(data_dir): + modes = ['train', 'test'] + train_size = 0.8 + if not os.path.exists(data_dir): + link = 'thuctc.thunlp.org/' + raise ValueError(f'Data not found at {data_dir}. ' f'Please download THUCNews from {link}.') + + outfold = f'{data_dir}/nemo-processed-thucnews' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('THUCNews', outfold)) + return outfold + logging.info(f'Processing THUCNews dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'a+', encoding='utf-8') + outfiles[mode].write('sentence\tlabel\n') + categories = ['体育', '娱乐', '家居', '彩票', '房产', '教育', '时尚', '时政', '星座', '游戏', '社会', '科技', '股票', '财经'] + for category in categories: + label = categories.index(category) + category_files = glob.glob(f'{data_dir}/{category}/*.txt') + test_num = int(len(category_files) * (1 - train_size)) + test_files = category_files[:test_num] + train_files = category_files[test_num:] + for mode in modes: + logging.info(f'Processing {mode} data of the category {category}') + if mode == 'test': + files = test_files + else: + files = train_files + for file in tqdm(files): + with open(file, 'r', encoding='utf-8') as f: + news = f.read().strip().replace('\r', '') + news = news.replace('\n', '').replace('\t', ' ') + outfiles[mode].write(f'{news}\t{label}\n') + for mode in modes: + outfiles[mode].close() + + return outfold + + +def process_nlu(filename, uncased, modes=['train', 'test'], dataset_name='nlu-ubuntu'): + """ Dataset has to be of: + - ubuntu + - chat + - web + """ + + if not os.path.exists(filename): + link = 'https://github.com/sebischair/NLU-Evaluation-Corpora' + raise ValueError(f'Data not found at {filename}. 
' f'Please download IMDB from {link}.') + + if dataset_name == 'nlu-ubuntu': + INTENT = {'makeupdate': 1, 'setupprinter': 2, 'shutdowncomputer': 3, 'softwarerecommendation': 4, 'none': 0} + elif dataset_name == 'nlu-chat': + INTENT = {'departuretime': 0, 'findconnection': 1} + elif dataset_name == 'nlu-web': + INTENT = { + 'changepassword': 1, + 'deleteaccount': 2, + 'downloadvideo': 3, + 'exportdata': 4, + 'filterspam': 5, + 'findalternative': 6, + 'syncaccounts': 7, + 'none': 0, + } + else: + raise ValueError(f'{dataset_name}: Invalid dataset name') + + infold = filename[: filename.rfind('/')] + outfold = f'{infold}/{dataset_name}-nemo-processed' + + if uncased: + outfold = f'{outfold}_uncased' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format(dataset_name.upper(), outfold)) + return outfold + logging.info(f'Processing data and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + + with open(filename, 'r') as f: + data = json.load(f) + + for obj in data['sentences']: + sentence = obj['text'].strip() + if uncased: + sentence = sentence.lower() + intent = obj['intent'].lower().replace(' ', '') + label = INTENT[intent] + txt = f'{sentence}\t{label}\n' + if obj['training']: + outfiles['train'].write(txt) + else: + outfiles['test'].write(txt) + for mode in modes: + outfiles[mode].close() + return outfold + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8-sig") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + # if sys.version_info[0] == 2: + # line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines diff --git a/nemo/collections/nlp/data/datasets/datasets_utils/dialogflow_utils.py b/nemo/collections/nlp/data/datasets/datasets_utils/dialogflow_utils.py new file mode 100644 index 000000000000..0ce116f67e38 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/datasets_utils/dialogflow_utils.py @@ -0,0 +1,113 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +import json +import os + +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import partition_data, write_files + +__all__ = [ + 'get_intent_query_files_dialogflow', + 'get_intents_slots_dialogflow', + 'get_slots_dialogflow', + 'process_dialogflow', +] + + +def get_intent_query_files_dialogflow(path): + fileslist = [] + for root, _, files in os.walk(path): + for file in files: + if '_usersays_en.json' in file: + fileslist.append(os.path.join(root, file)) + return fileslist + + +def get_intents_slots_dialogflow(files, slot_labels): + intent_names = [] + intent_queries = [] + slot_tags = [] + + for index, file in enumerate(files): + intent_names.append(os.path.basename(file).split('_usersays')[0]) + + with open(file) as json_file: + intent_data = json.load(json_file) + for query in intent_data: + query_text = "" + slots = "" + for segment in query['data']: + query_text = ''.join([query_text, segment['text']]) + if 'alias' in segment: + for _ in segment['text'].split(): + slots = ' '.join([slots, slot_labels.get(segment['alias'])]) + else: + for _ in segment['text'].split(): + slots = ' '.join([slots, slot_labels.get('O')]) + query_text = f'{query_text.strip()}\t{index}\n' + intent_queries.append(query_text) + slots = f'{slots.strip()}\n' + slot_tags.append(slots) + return intent_queries, intent_names, slot_tags + + +def get_slots_dialogflow(files): + slot_labels = {} + count = 0 + for file in files: + intent_head_file = ''.join([file.split('_usersays')[0], '.json']) + with open(intent_head_file) as json_file: + intent_meta_data = json.load(json_file) + for params in intent_meta_data['responses'][0]['parameters']: + if params['name'] not in slot_labels: + slot_labels[params['name']] = str(count) + count += 1 + slot_labels['O'] = str(count) + return slot_labels + + +def process_dialogflow(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): + if not os.path.exists(data_dir): + link = 'www.dialogflow.com' + raise ValueError( + f'Data not found at {data_dir}. ' f'Export your dialogflow data from' f'{link} and unzip at {data_dir}.' + ) + + outfold = f'{data_dir}/dialogflow/nemo-processed' + + '''TO DO - check for nemo-processed directory + already exists. If exists, skip the entire creation steps below. ''' + + os.makedirs(outfold, exist_ok=True) + + files = get_intent_query_files_dialogflow(data_dir) + + slot_labels = get_slots_dialogflow(files) + + intent_queries, intent_names, slot_tags = get_intents_slots_dialogflow(files, slot_labels) + + train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) + + write_files(train_queries, f'{outfold}/train.tsv') + write_files(train_slots, f'{outfold}/train_slots.tsv') + + write_files(test_queries, f'{outfold}/test.tsv') + write_files(test_slots, f'{outfold}/test_slots.tsv') + + write_files(slot_labels, f'{outfold}/dict.slots.csv') + write_files(intent_names, f'{outfold}/dict.intents.csv') + + return outfold diff --git a/nemo/collections/nlp/data/datasets/datasets_utils/mturk_utils.py b/nemo/collections/nlp/data/datasets/datasets_utils/mturk_utils.py new file mode 100644 index 000000000000..0269962ba1b5 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/datasets_utils/mturk_utils.py @@ -0,0 +1,201 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import json +import os + +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import ( + DATABASE_EXISTS_TMP, + if_exist, + partition_data, + read_csv, + write_files, +) + +__all__ = ['process_mturk', 'process_intent_slot_mturk', 'get_intents_mturk', 'get_slot_labels'] + + +def process_mturk(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): + if not os.path.exists(data_dir): + link = 'www.mturk.com' + raise ValueError( + f'Data not found at {data_dir}. ' f'Export your mturk data from' f'{link} and unzip at {data_dir}.' + ) + + outfold = f'{data_dir}/nemo-processed' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('mturk', outfold)) + return outfold + + logging.info(f'Processing dataset from mturk and storing at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + classification_data_file = f'{data_dir}/classification.csv' + annotation_data_file = f'{data_dir}/annotation.manifest' + + if not os.path.exists(classification_data_file): + raise FileNotFoundError(f'File not found ' f'at {classification_data_file}') + + if not os.path.exists(annotation_data_file): + raise FileNotFoundError(f'File not found at {annotation_data_file}') + + utterances = [] + utterances = read_csv(classification_data_file) + + # This function assumes that the intent classification data has been + # reviewed and cleaned and only one label per utterance is present. 
+ agreed_all, intent_names = get_intents_mturk(utterances, outfold) + + with open(annotation_data_file, 'r') as f: + slot_annotations = f.readlines() + + # This function assumes that the preprocess step would have made + # the task_name of all the annotations generic + task_name = 'retail-combined' + + # It is assumed that every utterances will have corresponding + # slot annotation information + if len(slot_annotations) < len(agreed_all): + raise ValueError(f'Every utterance must have corresponding' f'slot annotation information') + + slot_labels, intent_queries, slot_tags = process_intent_slot_mturk( + slot_annotations, agreed_all, intent_names, task_name + ) + + assert len(slot_tags) == len(intent_queries) + + dev_split = 0.1 + + train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) + + write_files(train_queries, f'{outfold}/train.tsv') + write_files(train_slots, f'{outfold}/train_slots.tsv') + + write_files(test_queries, f'{outfold}/test.tsv') + write_files(test_slots, f'{outfold}/test_slots.tsv') + + write_files(slot_labels, f'{outfold}/dict.slots.csv') + write_files(intent_names, f'{outfold}/dict.intents.csv') + + return outfold + + +def process_intent_slot_mturk(slot_annotations, agreed_all, intent_names, task_name): + slot_tags = [] + inorder_utterances = [] + all_labels = get_slot_labels(slot_annotations, task_name) + logging.info(f'agreed_all - {len(agreed_all)}') + logging.info(f'Slot annotations - {len(slot_annotations)}') + + for annotation in slot_annotations[0:]: + an = json.loads(annotation) + utterance = an['source'] + if len(utterance) > 2 and utterance.startswith('"') and utterance.endswith('"'): + utterance = utterance[1:-1] + + if utterance in agreed_all: + entities = {} + annotated_entities = an[task_name]['annotations']['entities'] + for i, each_anno in enumerate(annotated_entities): + entities[int(each_anno['startOffset'])] = i + + lastptr = 0 + slotlist = [] + # sorting annotations by the start offset + for i in sorted(entities.keys()): + annotated_entities = an[task_name]['annotations']['entities'] + tags = annotated_entities[entities.get(i)] + untagged_words = utterance[lastptr : tags['startOffset']] + for _ in untagged_words.split(): + slotlist.append(all_labels.get('O')) + anno_words = utterance[tags['startOffset'] : tags['endOffset']] + # tagging with the IOB format. 
+ for j, _ in enumerate(anno_words.split()): + if j == 0: + b_slot = 'B-' + tags['label'] + slotlist.append(all_labels.get(b_slot)) + else: + i_slot = 'I-' + tags['label'] + slotlist.append(all_labels.get(i_slot)) + lastptr = tags['endOffset'] + + untagged_words = utterance[lastptr : len(utterance)] + for _ in untagged_words.split(): + slotlist.append(all_labels.get('O')) + + slotstr = ' '.join(slotlist) + slotstr = f'{slotstr.strip()}\n' + + slot_tags.append(slotstr) + intent_num = intent_names.get(agreed_all.get(utterance)) + query_text = f'{utterance.strip()}\t{intent_num}\n' + inorder_utterances.append(query_text) + # else: + # logging.warning(utterance) + + logging.info(f'inorder utterances - {len(inorder_utterances)}') + + return all_labels, inorder_utterances, slot_tags + + +def get_intents_mturk(utterances, outfold): + intent_names = {} + intent_count = 0 + + agreed_all = {} + + logging.info('Printing all intent_labels') + intent_dict = f'{outfold}/dict.intents.csv' + if os.path.exists(intent_dict): + with open(intent_dict, 'r') as f: + for intent_name in f.readlines(): + intent_names[intent_name.strip()] = intent_count + intent_count += 1 + logging.info(intent_names) + + for i, utterance in enumerate(utterances[1:]): + + if utterance[1] not in agreed_all: + agreed_all[utterance[0]] = utterance[1] + + if utterance[1] not in intent_names: + intent_names[utterance[1]] = intent_count + intent_count += 1 + + logging.info(f'Total number of utterance samples: {len(agreed_all)}') + + return agreed_all, intent_names + + +def get_slot_labels(slot_annotations, task_name): + slot_labels = json.loads(slot_annotations[0]) + + all_labels = {} + count = 0 + # Generating labels with the IOB format. + for label in slot_labels[task_name]['annotations']['labels']: + b_slot = 'B-' + label['label'] + i_slot = 'I-' + label['label'] + all_labels[b_slot] = str(count) + count += 1 + all_labels[i_slot] = str(count) + count += 1 + all_labels['O'] = str(count) + + return all_labels diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py deleted file mode 100644 index 26423c3aa549..000000000000 --- a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py +++ /dev/null @@ -1,593 +0,0 @@ -""" -Copyright 2018 The Google AI Language Team Authors and -The HuggingFace Inc. team. -Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
- -Utility functions for GLUE tasks -Some transformer of this code were adapted from the HuggingFace library at -https://github.com/huggingface/transformers -""" -import csv -import os - -import numpy as np -from torch.utils.data import Dataset - -from nemo import logging - -__all__ = ['GLUEDataset'] - - -class GLUEDataset(Dataset): - def __init__(self, data_dir, tokenizer, max_seq_length, processor, output_mode, evaluate, token_params): - self.tokenizer = tokenizer - self.label_list = processor.get_labels() - self.examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) - self.features = convert_examples_to_features( - self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params - ) - - def __len__(self): - return len(self.features) - - def __getitem__(self, idx): - feature = self.features[idx] - return ( - np.array(feature.input_ids), - np.array(feature.segment_ids), - np.array(feature.input_mask, dtype=np.long), - np.array(feature.label_id), - ) - - -def convert_examples_to_features( - examples, - label_list, - max_seq_length, - tokenizer, - output_mode, - bos_token=None, - eos_token='[SEP]', - pad_token='[PAD]', - cls_token='[CLS]', - sep_token_extra=None, - cls_token_at_end=False, - cls_token_segment_id=0, - pad_token_segment_id=0, - pad_on_left=False, - mask_padding_with_zero=True, - sequence_a_segment_id=0, - sequence_b_segment_id=1, -): - """ Loads a data file into a list of `InputBatch`s - `cls_token_at_end` define the location of the CLS token: - - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - `cls_token_segment_id` define the segment id associated to the CLS - token (0 for BERT, 2 for XLNet) - The convention in BERT is: - (a) For sequence pairs: - tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] - type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 - (b) For single sequences: - tokens: [CLS] the dog is hairy . [SEP] - type_ids: 0 0 0 0 0 0 0 - Where "type_ids" are used to indicate whether this is the first - sequence or the second sequence. The embedding vectors for `type=0` - and `type=1` were learned during pre-training and are added to the - wordpiece embedding vector (and position vector). This is - not *strictly* necessarysince the [SEP] token unambiguously separates - the sequences, but it makes it easier for the model to learn - the concept of sequences. - For classification tasks, the first vector (corresponding to [CLS]) - is used as as the "sentence vector". Note that this only makes sense - because the entire model is fine-tuned. - For NMT: - (a) For sequence pairs: - tokens: is this jack ##ville ? no it is not . - type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 - (b) For single sequences: - tokens: the dog is hairy . 
- type_ids: 0 0 0 0 0 0 0 - """ - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for ex_index, example in enumerate(examples): - if ex_index % 10000 == 0: - logging.info("Writing example %d of %d" % (ex_index, len(examples))) - - tokens_a = tokenizer.text_to_tokens(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.text_to_tokens(example.text_b) - - special_tokens_count = 2 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 2 if bos_token else 0 - special_tokens_count += 1 if cls_token else 0 - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - special_tokens_count = 1 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 1 if bos_token else 0 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[: max_seq_length - special_tokens_count] - # Add special tokens to sequence_a - tokens = tokens_a - if bos_token: - tokens = [bos_token] + tokens - if eos_token: - tokens += [eos_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - # Add sequence separator between sequences - if tokens_b and sep_token_extra: - tokens += [sep_token_extra] - segment_ids += [sequence_a_segment_id] - - # Add special tokens to sequence_b - if tokens_b: - if bos_token: - tokens += [bos_token] - segment_ids += [sequence_b_segment_id] - tokens += tokens_b - segment_ids += [sequence_b_segment_id] * (len(tokens_b)) - if eos_token: - tokens += [eos_token] - segment_ids += [sequence_b_segment_id] - - # Add classification token - for BERT models - if cls_token: - if cls_token_at_end: - tokens += [cls_token] - segment_ids += [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - input_ids = tokenizer.tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. 
- padding_length = max_seq_length - len(input_ids) - pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] - if pad_on_left: - input_ids = ([pad_token_id] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - else: - input_ids = input_ids + ([pad_token_id] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) - if len(input_ids) != max_seq_length: - raise ValueError("input_ids must be of length max_seq_length") - if len(input_mask) != max_seq_length: - raise ValueError("input_mask must be of length max_seq_length") - if len(segment_ids) != max_seq_length: - raise ValueError("segment_ids must be of length max_seq_length") - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = np.float32(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - logging.info("*** Example ***") - logging.info("guid: %s" % (example.guid)) - logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) - logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) - logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) - logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) - logging.info("label: %s (id = %d)" % (example.label, label_id)) - - features.append( - InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id) - ) - return features - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length. - - This will always truncate the longer sequence one token at a time. - This makes more sense than truncating an equal percent - of tokens from each, since if one sequence is very short then each token - that's truncated likely contains more information than a longer sequence. - """ - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -""" -Utility functions for GLUE tasks -This code was adapted from the HuggingFace library at -https://github.com/huggingface/transformers -""" - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, input_ids, input_mask, segment_ids, label_id): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id - - -class InputExample(object): - """A single training/test example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. - For single sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second - sequence. Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. 
- """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding="utf-8-sig") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - # if sys.version_info[0] == 2: - # line = list(unicode(cell, 'utf-8') for cell in line) - lines.append(line) - return lines - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}') - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[3] - text_b = line[4] - label = line[0] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[8] - text_b = line[9] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliMismatchedProcessor(MnliProcessor): - """Processor for the MultiNLI Mismatched data set (GLUE version).""" - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched") - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - 
"""See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) - text_a = line[3] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class Sst2Processor(DataProcessor): - """Processor for the SST-2 data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[0] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class StsbProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return [None] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[7] - text_b = line[8] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QqpProcessor(DataProcessor): - """Processor for the QQP data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - try: - text_a = line[3] - text_b = line[4] - label = line[5] - except IndexError: - continue - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QnliProcessor(DataProcessor): - """Processor for the QNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates 
examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class RteProcessor(DataProcessor): - """Processor for the RTE data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class WnliProcessor(DataProcessor): - """Processor for the WNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mnli-mm": MnliMismatchedProcessor, - "mrpc": MrpcProcessor, - "sst-2": Sst2Processor, - "sts-b": StsbProcessor, - "qqp": QqpProcessor, - "qnli": QnliProcessor, - "rte": RteProcessor, - "wnli": WnliProcessor, -} -output_modes = { - "cola": "classification", - "mnli": "classification", - "mnli-mm": "classification", - "mrpc": "classification", - "sst-2": "classification", - "sts-b": "regression", - "qqp": "classification", - "qnli": "classification", - "rte": "classification", - "wnli": "classification", -} -GLUE_TASKS_NUM_LABELS = { - "cola": 2, - "mnli": 3, - "mrpc": 2, - "sst-2": 2, - "sts-b": 1, - "qqp": 2, - "qnli": 2, - "rte": 2, - "wnli": 2, -} diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/__init__.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/__init__.py new file mode 100644 index 000000000000..d396af9c88fb --- /dev/null +++ b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/__init__.py @@ -0,0 +1,18 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.data_processors import * +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.glue_benchmark_dataset import * diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/data_processors.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/data_processors.py new file mode 100644 index 000000000000..48e9297dbe98 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/data_processors.py @@ -0,0 +1,341 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +import os + +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils.datasets_processing import DataProcessor + +__all__ = [ + 'ColaProcessor', + 'MnliProcessor', + 'MnliMismatchedProcessor', + 'MrpcProcessor', + 'Sst2Processor', + 'StsbProcessor', + 'QqpProcessor', + 'QnliProcessor', + 'RteProcessor', + 'WnliProcessor', +] + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}') + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[3] + text_b = line[4] + label = line[0] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[8] + text_b = line[9] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliMismatchedProcessor(MnliProcessor): + """Processor for the MultiNLI Mismatched data set (GLUE version).""" + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched") + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line[3] + label = line[1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class Sst2Processor(DataProcessor): + """Processor for the SST-2 data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + 
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[0] + label = line[1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class StsbProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return [None] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[7] + text_b = line[8] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QqpProcessor(DataProcessor): + """Processor for the QQP data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + try: + text_a = line[3] + text_b = line[4] + label = line[5] + except IndexError: + continue + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the QNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class RteProcessor(DataProcessor): + """Processor for the RTE data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return 
self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class WnliProcessor(DataProcessor): + """Processor for the WNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. + For single sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second + sequence. Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/glue_benchmark_dataset.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/glue_benchmark_dataset.py new file mode 100644 index 000000000000..60e67639dc62 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset/glue_benchmark_dataset.py @@ -0,0 +1,289 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +""" +Utility functions for GLUE tasks +Some transformer of this code were adapted from the HuggingFace library at +https://github.com/huggingface/transformers +""" + +import numpy as np +from torch.utils.data import Dataset + +from nemo import logging +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset.data_processors import * + +__all__ = ['GLUEDataset', 'output_modes', 'processors'] + +processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mnli-mm": MnliMismatchedProcessor, + "mrpc": MrpcProcessor, + "sst-2": Sst2Processor, + "sts-b": StsbProcessor, + "qqp": QqpProcessor, + "qnli": QnliProcessor, + "rte": RteProcessor, + "wnli": WnliProcessor, +} +output_modes = { + "cola": "classification", + "mnli": "classification", + "mnli-mm": "classification", + "mrpc": "classification", + "sst-2": "classification", + "sts-b": "regression", + "qqp": "classification", + "qnli": "classification", + "rte": "classification", + "wnli": "classification", +} +GLUE_TASKS_NUM_LABELS = { + "cola": 2, + "mnli": 3, + "mrpc": 2, + "sst-2": 2, + "sts-b": 1, + "qqp": 2, + "qnli": 2, + "rte": 2, + "wnli": 2, +} + + +class GLUEDataset(Dataset): + def __init__(self, data_dir, tokenizer, max_seq_length, processor, output_mode, evaluate, token_params): + self.tokenizer = tokenizer + self.label_list = processor.get_labels() + self.examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) + self.features = self.convert_examples_to_features( + self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params + ) + + def __len__(self): + return len(self.features) + + def __getitem__(self, idx): + feature = self.features[idx] + return ( + np.array(feature.input_ids), + np.array(feature.segment_ids), + np.array(feature.input_mask, dtype=np.long), + np.array(feature.label_id), + ) + + def convert_examples_to_features( + self, + examples, + label_list, + max_seq_length, + tokenizer, + output_mode, + bos_token=None, + eos_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + sep_token_extra=None, + cls_token_at_end=False, + cls_token_segment_id=0, + pad_token_segment_id=0, + pad_on_left=False, + mask_padding_with_zero=True, + sequence_a_segment_id=0, + sequence_b_segment_id=1, + ): + """ Loads a data file into a list of `InputBatch`s + `cls_token_at_end` define the location of the CLS token: + - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] + - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] + `cls_token_segment_id` define the segment id associated to the CLS + token (0 
for BERT, 2 for XLNet) + The convention in BERT is: + (a) For sequence pairs: + tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] + type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 + (b) For single sequences: + tokens: [CLS] the dog is hairy . [SEP] + type_ids: 0 0 0 0 0 0 0 + Where "type_ids" are used to indicate whether this is the first + sequence or the second sequence. The embedding vectors for `type=0` + and `type=1` were learned during pre-training and are added to the + wordpiece embedding vector (and position vector). This is + not *strictly* necessarysince the [SEP] token unambiguously separates + the sequences, but it makes it easier for the model to learn + the concept of sequences. + For classification tasks, the first vector (corresponding to [CLS]) + is used as as the "sentence vector". Note that this only makes sense + because the entire model is fine-tuned. + For NMT: + (a) For sequence pairs: + tokens: is this jack ##ville ? no it is not . + type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 + (b) For single sequences: + tokens: the dog is hairy . + type_ids: 0 0 0 0 0 0 0 + """ + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for ex_index, example in enumerate(examples): + if ex_index % 10000 == 0: + logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + tokens_a = tokenizer.text_to_tokens(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.text_to_tokens(example.text_b) + + special_tokens_count = 2 if eos_token else 0 + special_tokens_count += 1 if sep_token_extra else 0 + special_tokens_count += 2 if bos_token else 0 + special_tokens_count += 1 if cls_token else 0 + self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) + else: + special_tokens_count = 1 if eos_token else 0 + special_tokens_count += 1 if sep_token_extra else 0 + special_tokens_count += 1 if bos_token else 0 + if len(tokens_a) > max_seq_length - special_tokens_count: + tokens_a = tokens_a[: max_seq_length - special_tokens_count] + # Add special tokens to sequence_a + tokens = tokens_a + if bos_token: + tokens = [bos_token] + tokens + if eos_token: + tokens += [eos_token] + segment_ids = [sequence_a_segment_id] * len(tokens) + + # Add sequence separator between sequences + if tokens_b and sep_token_extra: + tokens += [sep_token_extra] + segment_ids += [sequence_a_segment_id] + + # Add special tokens to sequence_b + if tokens_b: + if bos_token: + tokens += [bos_token] + segment_ids += [sequence_b_segment_id] + tokens += tokens_b + segment_ids += [sequence_b_segment_id] * (len(tokens_b)) + if eos_token: + tokens += [eos_token] + segment_ids += [sequence_b_segment_id] + + # Add classification token - for BERT models + if cls_token: + if cls_token_at_end: + tokens += [cls_token] + segment_ids += [cls_token_segment_id] + else: + tokens = [cls_token] + tokens + segment_ids = [cls_token_segment_id] + segment_ids + input_ids = tokenizer.tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. 
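# Editor's note: an illustrative, standalone sketch (not part of the original patch).
# The code that follows right-pads input_ids, input_mask and segment_ids to exactly
# max_seq_length (or left-pads them when pad_on_left is set). A minimal version with
# hypothetical ids, assuming mask_padding_with_zero=True so padded positions get mask 0:
def pad_to_length(input_ids, input_mask, segment_ids, max_len, pad_id=0, pad_seg_id=0, pad_on_left=False):
    pad_len = max_len - len(input_ids)
    if pad_on_left:
        return ([pad_id] * pad_len + input_ids,
                [0] * pad_len + input_mask,
                [pad_seg_id] * pad_len + segment_ids)
    return (input_ids + [pad_id] * pad_len,
            input_mask + [0] * pad_len,
            segment_ids + [pad_seg_id] * pad_len)

# A 3-token example padded to max_len=5:
assert pad_to_length([101, 2023, 102], [1, 1, 1], [0, 0, 0], 5) == (
    [101, 2023, 102, 0, 0], [1, 1, 1, 0, 0], [0, 0, 0, 0, 0])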
+ padding_length = max_seq_length - len(input_ids) + pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] + if pad_on_left: + input_ids = ([pad_token_id] * padding_length) + input_ids + input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask + segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids + else: + input_ids = input_ids + ([pad_token_id] * padding_length) + input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) + if len(input_ids) != max_seq_length: + raise ValueError("input_ids must be of length max_seq_length") + if len(input_mask) != max_seq_length: + raise ValueError("input_mask must be of length max_seq_length") + if len(segment_ids) != max_seq_length: + raise ValueError("segment_ids must be of length max_seq_length") + if output_mode == "classification": + label_id = label_map[example.label] + elif output_mode == "regression": + label_id = np.float32(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 5: + logging.info("*** Example ***") + logging.info("guid: %s" % (example.guid)) + logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) + logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) + logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) + logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) + logging.info("label: %s (id = %d)" % (example.label, label_id)) + + features.append( + InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id) + ) + return features + + def _truncate_seq_pair(self, tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length. + + This will always truncate the longer sequence one token at a time. + This makes more sense than truncating an equal percent + of tokens from each, since if one sequence is very short then each token + that's truncated likely contains more information than a longer sequence. + """ + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + """ + Utility functions for GLUE tasks + This code was adapted from the HuggingFace library at + https://github.com/huggingface/transformers + """ + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/__init__.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/__init__.py new file mode 100644 index 000000000000..3717507be2e2 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/__init__.py @@ -0,0 +1,18 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.inference_utils import * +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset.joint_intent_slot_dataset import * diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/data_descriptor.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/data_descriptor.py new file mode 100644 index 000000000000..e298f4196bdc --- /dev/null +++ b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/data_descriptor.py @@ -0,0 +1,254 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import itertools +import os + +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils import ( + DATABASE_EXISTS_TMP, + calc_class_weights, + get_label_stats, + if_exist, + process_atis, + process_dialogflow, + process_jarvis_datasets, + process_mturk, + process_snips, +) +from nemo.collections.nlp.utils import get_vocab, list2str + +__all__ = ['JointIntentSlotDataDesc'] + + +class JointIntentSlotDataDesc: + """ Convert the raw data to the standard format supported by + JointIntentSlotDataset. + + By default, the None label for slots is 'O'. + + JointIntentSlotDataset requires two files: + + input_file: file to sequence + label. + the first line is header (sentence [tab] label) + each line should be [sentence][tab][label] + + slot_file: file to slot labels, each line corresponding to + slot labels for a sentence in input_file. No header. + + To keep the mapping from label index to label consistent during + training and inferencing, we require the following files: + dicts.intents.csv: each line is an intent. The first line + corresponding to the 0 intent label, the second line + corresponding to the 1 intent label, and so on. + + dicts.slots.csv: each line is a slot. The first line + corresponding to the 0 slot label, the second line + corresponding to the 1 slot label, and so on. + + Args: + data_dir (str): the directory of the dataset + do_lower_case (bool): whether to set your dataset to lowercase + dataset_name (str): the name of the dataset. If it's a dataset + that follows the standard JointIntentSlotDataset format, + you can set the name as 'default'. + none_slot_label (str): the label for slots that aren't indentified + defaulted to 'O' + pad_label (int): the int used for padding. 
If set to -1, + it'll be set to the whatever the None label is. + + """ + + def __init__(self, data_dir, do_lower_case=False, dataset_name='default', none_slot_label='O', pad_label=-1): + if dataset_name == 'atis': + self.data_dir = process_atis(data_dir, do_lower_case) + elif dataset_name == 'snips-atis': + self.data_dir, self.pad_label = self.merge( + data_dir, ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'], dataset_name + ) + elif dataset_name == 'dialogflow': + self.data_dir = process_dialogflow(data_dir, do_lower_case) + elif dataset_name == 'mturk-processed': + self.data_dir = process_mturk(data_dir, do_lower_case) + elif dataset_name in set(['snips-light', 'snips-speak', 'snips-all']): + self.data_dir = process_snips(data_dir, do_lower_case) + if dataset_name.endswith('light'): + self.data_dir = f'{self.data_dir}/light' + elif dataset_name.endswith('speak'): + self.data_dir = f'{self.data_dir}/speak' + elif dataset_name.endswith('all'): + self.data_dir = f'{self.data_dir}/all' + elif dataset_name.startswith('jarvis'): + self.data_dir = process_jarvis_datasets( + data_dir, do_lower_case, dataset_name, modes=["train", "test", "eval"], ignore_prev_intent=False + ) + else: + if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']): + raise FileNotFoundError( + "Make sure that your data follows the standard format " + "supported by JointIntentSlotDataset. Your data must " + "contain dict.intents.csv and dict.slots.csv." + ) + self.data_dir = data_dir + + self.intent_dict_file = self.data_dir + '/dict.intents.csv' + self.slot_dict_file = self.data_dir + '/dict.slots.csv' + self.num_intents = len(get_vocab(self.intent_dict_file)) + slots = label2idx(self.slot_dict_file) + self.num_slots = len(slots) + + for mode in ['train', 'test', 'eval']: + + if not if_exist(self.data_dir, [f'{mode}.tsv']): + logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') + continue + + slot_file = f'{self.data_dir}/{mode}_slots.tsv' + with open(slot_file, 'r') as f: + slot_lines = f.readlines() + + input_file = f'{self.data_dir}/{mode}.tsv' + with open(input_file, 'r') as f: + input_lines = f.readlines()[1:] # Skipping headers at index 0 + + if len(slot_lines) != len(input_lines): + raise ValueError( + "Make sure that the number of slot lines match the " + "number of intent lines. There should be a 1-1 " + "correspondence between every slot and intent lines." 
+ ) + + dataset = list(zip(slot_lines, input_lines)) + + raw_slots, queries, raw_intents = [], [], [] + for slot_line, input_line in dataset: + slot_list = [int(slot) for slot in slot_line.strip().split()] + raw_slots.append(slot_list) + parts = input_line.strip().split() + raw_intents.append(int(parts[-1])) + queries.append(' '.join(parts[:-1])) + + infold = input_file[: input_file.rfind('/')] + + logging.info(f'Three most popular intents during {mode}ing') + total_intents, intent_label_freq = get_label_stats(raw_intents, infold + f'/{mode}_intent_stats.tsv') + merged_slots = itertools.chain.from_iterable(raw_slots) + + logging.info(f'Three most popular slots during {mode}ing') + slots_total, slots_label_freq = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv') + + if mode == 'train': + self.slot_weights = calc_class_weights(slots_label_freq) + logging.info(f'Slot weights are - {self.slot_weights}') + + self.intent_weights = calc_class_weights(intent_label_freq) + logging.info(f'Intent weights are - {self.intent_weights}') + + logging.info(f'Total intents - {total_intents}') + logging.info(f'Intent label frequency - {intent_label_freq}') + logging.info(f'Total Slots - {slots_total}') + logging.info(f'Slots label frequency - {slots_label_freq}') + + if pad_label != -1: + self.pad_label = pad_label + else: + if none_slot_label not in slots: + raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.') + self.pad_label = slots[none_slot_label] + + def merge(self, data_dir, subdirs, dataset_name, modes=['train', 'test']): + outfold = f'{data_dir}/{dataset_name}' + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold)) + slots = get_vocab(f'{outfold}/dict.slots.csv') + none_slot = 0 + for key in slots: + if slots[key] == 'O': + none_slot = key + break + return outfold, int(none_slot) + + os.makedirs(outfold, exist_ok=True) + + data_files, slot_files = {}, {} + for mode in modes: + data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w') + data_files[mode].write('sentence\tlabel\n') + slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w') + + intents, slots = {}, {} + intent_shift, slot_shift = 0, 0 + none_intent, none_slot = -1, -1 + + for subdir in subdirs: + curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv') + curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv') + + for key in curr_intents: + if intent_shift > 0 and curr_intents[key] == 'O': + continue + if curr_intents[key] == 'O' and intent_shift == 0: + none_intent = int(key) + intents[int(key) + intent_shift] = curr_intents[key] + + for key in curr_slots: + if slot_shift > 0 and curr_slots[key] == 'O': + continue + if slot_shift == 0 and curr_slots[key] == 'O': + none_slot = int(key) + slots[int(key) + slot_shift] = curr_slots[key] + + for mode in modes: + with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f: + for line in f.readlines()[1:]: + text, label = line.strip().split('\t') + label = int(label) + if curr_intents[label] == 'O': + label = none_intent + else: + label = label + intent_shift + data_files[mode].write(f'{text}\t{label}\n') + + with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f: + for line in f.readlines(): + labels = [int(label) for label in line.strip().split()] + shifted_labels = [] + for label in labels: + if curr_slots[label] == 'O': + shifted_labels.append(none_slot) + else: + shifted_labels.append(label + slot_shift) + 
slot_files[mode].write(list2str(shifted_labels) + '\n') + + intent_shift += len(curr_intents) + slot_shift += len(curr_slots) + + write_vocab_in_order(intents, f'{outfold}/dict.intents.csv') + write_vocab_in_order(slots, f'{outfold}/dict.slots.csv') + return outfold, none_slot + + +def label2idx(file): + lines = open(file, 'r').readlines() + lines = [line.strip() for line in lines if line.strip()] + labels = {lines[i]: i for i in range(len(lines))} + return labels + + +def write_vocab_in_order(vocab, outfile): + with open(outfile, 'w') as f: + for key in sorted(vocab.keys()): + f.write(f'{vocab[key]}\n') diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/inference_utils.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/inference_utils.py new file mode 100644 index 000000000000..a886c20739bf --- /dev/null +++ b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/inference_utils.py @@ -0,0 +1,50 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import numpy as np + +from nemo import logging +from nemo.collections.nlp.utils import get_vocab + +__all__ = ['read_intent_slot_outputs'] + + +def read_intent_slot_outputs( + queries, intent_file, slot_file, intent_logits, slot_logits, slot_masks, intents=None, slots=None +): + intent_dict = get_vocab(intent_file) + slot_dict = get_vocab(slot_file) + pred_intents = np.argmax(intent_logits, 1) + pred_slots = np.argmax(slot_logits, axis=2) + slot_masks = slot_masks > 0.5 + for i, query in enumerate(queries): + logging.info(f'Query: {query}') + pred = pred_intents[i] + logging.info(f'Predicted intent:\t{pred}\t{intent_dict[pred]}') + if intents is not None: + logging.info(f'True intent:\t{intents[i]}\t{intent_dict[intents[i]]}') + + pred_slot = pred_slots[i][slot_masks[i]] + tokens = query.strip().split() + + if len(pred_slot) != len(tokens): + raise ValueError('Pred_slot and tokens must be of the same length') + + for j, token in enumerate(tokens): + output = f'{token}\t{slot_dict[pred_slot[j]]}' + if slots is not None: + output = f'{output}\t{slot_dict[slots[i][j]]}' + logging.info(output) diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/joint_intent_slot_dataset.py similarity index 55% rename from nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py rename to nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/joint_intent_slot_dataset.py index 4abc70923226..b0cbebd41f0f 100644 --- a/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py +++ b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset/joint_intent_slot_dataset.py @@ -1,3 +1,20 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= # Copyright 2018 The Google AI Language Team Authors and # The HuggingFace Inc. team. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. @@ -13,31 +30,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# ============================================================================= + """ Utility functions for Token Classification NLP tasks Some parts of this code were adapted from the HuggingFace library at https://github.com/huggingface/pytorch-pretrained-BERT """ -import itertools -import random import numpy as np from torch.utils.data import Dataset from nemo import logging -from nemo.collections.nlp.data.datasets.datasets_utils import ( - get_label_stats, - get_stats, - merge, - process_atis, - process_dialogflow, - process_jarvis_datasets, - process_mturk, - process_snips, -) -from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights, get_vocab, if_exist, label2idx +from nemo.collections.nlp.data.datasets.datasets_utils import get_stats -__all__ = ['BertJointIntentSlotDataset', 'BertJointIntentSlotInferDataset', 'JointIntentSlotDataDesc'] +__all__ = ['BertJointIntentSlotDataset', 'BertJointIntentSlotInferDataset'] def get_features( @@ -151,7 +158,6 @@ class BertJointIntentSlotDataset(Dataset): tokenizer (Tokenizer): such as BertTokenizer num_samples (int): number of samples you want to use for the dataset. If -1, use all dataset. Useful for testing. - shuffle (bool): whether to shuffle your data. pad_label (int): pad value use for slot labels. by default, it's the neutral label. @@ -164,7 +170,6 @@ def __init__( max_seq_length, tokenizer, num_samples=-1, - shuffle=True, pad_label=128, ignore_extra_tokens=False, ignore_start_end=False, @@ -182,8 +187,6 @@ def __init__( dataset = list(zip(slot_lines, input_lines)) - if shuffle or num_samples > 0: - random.shuffle(dataset) if num_samples > 0: dataset = dataset[:num_samples] @@ -267,139 +270,3 @@ def __getitem__(self, idx): np.array(self.all_loss_mask[idx]), np.array(self.all_subtokens_mask[idx]), ) - - -class JointIntentSlotDataDesc: - """ Convert the raw data to the standard format supported by - JointIntentSlotDataset. - - By default, the None label for slots is 'O'. - - JointIntentSlotDataset requires two files: - - input_file: file to sequence + label. - the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] - - slot_file: file to slot labels, each line corresponding to - slot labels for a sentence in input_file. No header. - - To keep the mapping from label index to label consistent during - training and inferencing, we require the following files: - dicts.intents.csv: each line is an intent. 
The first line - corresponding to the 0 intent label, the second line - corresponding to the 1 intent label, and so on. - - dicts.slots.csv: each line is a slot. The first line - corresponding to the 0 slot label, the second line - corresponding to the 1 slot label, and so on. - - Args: - data_dir (str): the directory of the dataset - do_lower_case (bool): whether to set your dataset to lowercase - dataset_name (str): the name of the dataset. If it's a dataset - that follows the standard JointIntentSlotDataset format, - you can set the name as 'default'. - none_slot_label (str): the label for slots that aren't indentified - defaulted to 'O' - pad_label (int): the int used for padding. If set to -1, - it'll be set to the whatever the None label is. - - """ - - def __init__(self, data_dir, do_lower_case=False, dataset_name='default', none_slot_label='O', pad_label=-1): - if dataset_name == 'atis': - self.data_dir = process_atis(data_dir, do_lower_case) - elif dataset_name == 'snips-atis': - self.data_dir, self.pad_label = merge( - data_dir, ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'], dataset_name - ) - elif dataset_name == 'dialogflow': - self.data_dir = process_dialogflow(data_dir, do_lower_case) - elif dataset_name == 'mturk-processed': - self.data_dir = process_mturk(data_dir, do_lower_case) - elif dataset_name in set(['snips-light', 'snips-speak', 'snips-all']): - self.data_dir = process_snips(data_dir, do_lower_case) - if dataset_name.endswith('light'): - self.data_dir = f'{self.data_dir}/light' - elif dataset_name.endswith('speak'): - self.data_dir = f'{self.data_dir}/speak' - elif dataset_name.endswith('all'): - self.data_dir = f'{self.data_dir}/all' - elif dataset_name.startswith('jarvis'): - self.data_dir = process_jarvis_datasets( - data_dir, do_lower_case, dataset_name, modes=["train", "test", "eval"], ignore_prev_intent=False - ) - else: - if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']): - raise FileNotFoundError( - "Make sure that your data follows the standard format " - "supported by JointIntentSlotDataset. Your data must " - "contain dict.intents.csv and dict.slots.csv." - ) - self.data_dir = data_dir - - self.intent_dict_file = self.data_dir + '/dict.intents.csv' - self.slot_dict_file = self.data_dir + '/dict.slots.csv' - self.num_intents = len(get_vocab(self.intent_dict_file)) - slots = label2idx(self.slot_dict_file) - self.num_slots = len(slots) - - for mode in ['train', 'test', 'eval']: - - if not if_exist(self.data_dir, [f'{mode}.tsv']): - logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') - continue - - slot_file = f'{self.data_dir}/{mode}_slots.tsv' - with open(slot_file, 'r') as f: - slot_lines = f.readlines() - - input_file = f'{self.data_dir}/{mode}.tsv' - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] # Skipping headers at index 0 - - if len(slot_lines) != len(input_lines): - raise ValueError( - "Make sure that the number of slot lines match the " - "number of intent lines. There should be a 1-1 " - "correspondence between every slot and intent lines." 
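The JointIntentSlotDataDesc docstring above fully specifies the on-disk layout but gives no sample. A minimal sketch of a conforming toy dataset follows; the utterances, intent names, and slot names are invented for illustration, and the directory is a temporary one rather than a real NeMo data path.

```python
# Sketch: write a toy dataset in the format JointIntentSlotDataDesc expects.
# All sentences and label names below are hypothetical.
import os
import tempfile

data_dir = tempfile.mkdtemp()

# dict.intents.csv: line i is the name of intent id i.
with open(os.path.join(data_dir, 'dict.intents.csv'), 'w') as f:
    f.write('set_alarm\nplay_music\n')

# dict.slots.csv: line i is the name of slot id i ('O' is the none slot).
with open(os.path.join(data_dir, 'dict.slots.csv'), 'w') as f:
    f.write('O\ntime\nartist\n')

# train.tsv: header line, then one "<sentence><TAB><intent id>" per example.
with open(os.path.join(data_dir, 'train.tsv'), 'w') as f:
    f.write('sentence\tlabel\n')
    f.write('wake me at seven\t0\n')
    f.write('play some jazz\t1\n')

# train_slots.tsv: one slot id per whitespace token of the matching sentence, no header.
with open(os.path.join(data_dir, 'train_slots.tsv'), 'w') as f:
    f.write('0 0 0 1\n')  # wake me at seven -> O O O time
    f.write('0 0 2\n')    # play some jazz   -> O O artist
```

When the descriptor processes such a directory it also writes per-split statistics files (for example train_intent_stats.tsv and train_slot_stats.tsv) next to these inputs.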
- ) - - dataset = list(zip(slot_lines, input_lines)) - - raw_slots, queries, raw_intents = [], [], [] - for slot_line, input_line in dataset: - slot_list = [int(slot) for slot in slot_line.strip().split()] - raw_slots.append(slot_list) - parts = input_line.strip().split() - raw_intents.append(int(parts[-1])) - queries.append(' '.join(parts[:-1])) - - infold = input_file[: input_file.rfind('/')] - - logging.info(f'Three most popular intents during {mode}ing') - total_intents, intent_label_freq = get_label_stats(raw_intents, infold + f'/{mode}_intent_stats.tsv') - merged_slots = itertools.chain.from_iterable(raw_slots) - - logging.info(f'Three most popular slots during {mode}ing') - slots_total, slots_label_freq = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv') - - if mode == 'train': - self.slot_weights = calc_class_weights(slots_label_freq) - logging.info(f'Slot weights are - {self.slot_weights}') - - self.intent_weights = calc_class_weights(intent_label_freq) - logging.info(f'Intent weights are - {self.intent_weights}') - - logging.info(f'Total intents - {total_intents}') - logging.info(f'Intent label frequency - {intent_label_freq}') - logging.info(f'Total Slots - {slots_total}') - logging.info(f'Slots label frequency - {slots_label_freq}') - - if pad_label != -1: - self.pad_label = pad_label - else: - if none_slot_label not in slots: - raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.') - self.pad_label = slots[none_slot_label] diff --git a/nemo/collections/nlp/data/datasets/lm_bert_dataset.py b/nemo/collections/nlp/data/datasets/lm_bert_dataset.py index 61b74f933c60..32ddb0f82384 100644 --- a/nemo/collections/nlp/data/datasets/lm_bert_dataset.py +++ b/nemo/collections/nlp/data/datasets/lm_bert_dataset.py @@ -24,12 +24,12 @@ import h5py import numpy as np +from sentencepiece import SentencePieceTrainer as SPT from torch.utils.data import Dataset from tqdm import tqdm from nemo import logging -from nemo.collections.nlp.data.datasets.datasets_utils import download_wkt2 -from nemo.collections.nlp.data.datasets.lm_transformer_dataset import create_vocab_mlm +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import DATABASE_EXISTS_TMP, if_exist __all__ = ['BertPretrainingDataset', 'BertPretrainingPreprocessedDataset'] @@ -380,17 +380,75 @@ class BERTPretrainingDataDesc: def __init__(self, dataset_name, data_dir, vocab_size, sample_size, special_tokens, train_file=''): if dataset_name == 'wikitext-2': if not os.path.exists(data_dir): - data_dir = download_wkt2(data_dir) - self.data_dir, self.tokenizer_model = create_vocab_mlm( + raise FileNotFoundError("Dataset not found. Run './get_wkt2.sh DATA_DIR' from examples/nlp/scripts") + self.data_dir, self.tokenizer_model = self.create_vocab_mlm( data_dir, vocab_size, sample_size, special_tokens, train_file ) else: - logging.warning( - "Looks like you passed a dataset name that isn't " - "already supported by NeMo. Please make sure that " + raise ValueError( + "Looks like you passed a dataset name that isn't already supported by NeMo. Please make sure that " "you build the preprocessing method for it." 
) self.train_file = f'{data_dir}/train.txt' self.eval_file = f'{data_dir}/valid.txt' self.test_file = f'{data_dir}/test.txt' + + def create_vocab_mlm( + self, + data_dir, + vocab_size, + sample_size, + special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], + train_file='', + ): + vocab = special_tokens[:] + bert_dir = f'{data_dir}/bert' + if if_exist(bert_dir, ['tokenizer.model']): + logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir)) + return data_dir, f'{bert_dir}/tokenizer.model' + logging.info(f'Processing WikiText dataset and store at {bert_dir}') + os.makedirs(bert_dir, exist_ok=True) + + if not train_file: + files = glob.glob(f'{data_dir}/*.txt') + train_file = f'{bert_dir}/merged.txt' + logging.info(f"Merging {len(files)} txt files into {train_file}") + + with open(train_file, "w") as merged: + for file in tqdm(files): + with open(file, 'r') as inf: + content = inf.read().strip() + merged.write(content + '\n\n\n') + else: + train_file = f'{data_dir}/{train_file}' + + cmd = ( + f"--input={train_file} --model_prefix={bert_dir}/tokenizer " + f"--vocab_size={vocab_size - len(vocab)} " + f"--input_sentence_size={sample_size} " + f"--shuffle_input_sentence=true --hard_vocab_limit=false " + f"--bos_id=-1 --eos_id=-1" + ) + + SPT.Train(cmd) + + # Add BERT control symbols + tokens = [] + + with open(f"{bert_dir}/tokenizer.vocab", "r") as f: + f.readline() # skip first token + + # Read tokens from each line and parse for vocab + for line in f: + piece = line.split("\t")[0] + token = piece[1:] if piece.startswith("▁") else f"##{piece}" + tokens.append(token) + + vocab.extend(tokens) + + # Save vocabulary to output file + with open(f'{bert_dir}/vocab.txt', "w") as f: + for token in vocab: + f.write(f"{token}\n".format()) + return data_dir, f'{bert_dir}/tokenizer.model' diff --git a/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py b/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py index 5d8f20723c6e..303a07904692 100644 --- a/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py +++ b/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py @@ -15,21 +15,16 @@ # ============================================================================= """Pytorch Dataset for training Neural Machine Translation.""" -import glob import os -import pickle import re import numpy as np -from sentencepiece import SentencePieceTrainer as SPT from torch.utils.data import Dataset -from tqdm import tqdm from nemo import logging -from nemo.collections.nlp.data.datasets.datasets_utils import DATABASE_EXISTS_TMP, download_wkt2 -from nemo.collections.nlp.utils.common_nlp_utils import if_exist +from nemo.collections.nlp.data.datasets.datasets_utils import dataset_to_ids, if_exist -__all__ = ['LanguageModelingDataset'] +__all__ = ['LanguageModelingDataset', 'LanguageModelDataDesc'] class LanguageModelingDataset(Dataset): @@ -56,8 +51,8 @@ class LanguageModelDataDesc: def __init__(self, dataset_name, data_dir, do_lower_case): if dataset_name == 'wikitext-2': if not os.path.exists(data_dir): - data_dir = download_wkt2(data_dir) - self.vocab_size = create_vocab_lm(data_dir, do_lower_case) + raise FileNotFoundError("Dataset not found. Run './get_wkt2.sh DATA_DIR' from examples/nlp/scripts") + self.vocab_size = self.create_vocab_lm(data_dir, do_lower_case) self.data_dir = data_dir else: logging.warning( @@ -66,122 +61,33 @@ def __init__(self, dataset_name, data_dir, do_lower_case): "you build the preprocessing method for it." 
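The create_vocab_mlm method relocated above builds vocab.txt for BERT pretraining by prepending the reserved special tokens and then rewriting each SentencePiece piece: word-initial pieces lose the leading ▁ marker, everything else gains a ## prefix. A minimal sketch of that mapping, using invented pieces instead of a real tokenizer.vocab produced by SentencePieceTrainer:

```python
# Sketch of the piece -> WordPiece rewriting done in create_vocab_mlm.
# The pieces below are made up; a real run reads them from <bert_dir>/tokenizer.vocab.
special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
pieces = ['▁the', '▁play', 'ing', '▁token', 'izer']

vocab = special_tokens[:]
for piece in pieces:
    # word-initial pieces drop '▁'; continuation pieces get the BERT '##' prefix
    token = piece[1:] if piece.startswith('▁') else f'##{piece}'
    vocab.append(token)

print(vocab)
# ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'the', 'play', '##ing', 'token', '##izer']
```

Note that the SentencePiece model itself is trained with vocab_size reduced by the number of special tokens, so the final vocab.txt comes out at approximately the requested size.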
) - -def create_vocab_mlm( - data_dir, vocab_size, sample_size, special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], train_file='' -): - vocab = special_tokens[:] - bert_dir = f'{data_dir}/bert' - if if_exist(bert_dir, ['tokenizer.model']): - logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir)) - return data_dir, f'{bert_dir}/tokenizer.model' - logging.info(f'Processing WikiText dataset and store at {bert_dir}') - os.makedirs(bert_dir, exist_ok=True) - - if not train_file: - files = glob.glob(f'{data_dir}/*.txt') - train_file = f'{bert_dir}/merged.txt' - logging.info(f"Merging {len(files)} txt files into {train_file}") - - with open(train_file, "w") as merged: - for file in tqdm(files): - with open(file, 'r') as inf: - content = inf.read().strip() - merged.write(content + '\n\n\n') - else: - train_file = f'{data_dir}/{train_file}' - - cmd = ( - f"--input={train_file} --model_prefix={bert_dir}/tokenizer " - f"--vocab_size={vocab_size - len(vocab)} " - f"--input_sentence_size={sample_size} " - f"--shuffle_input_sentence=true --hard_vocab_limit=false " - f"--bos_id=-1 --eos_id=-1" - ) - SPT.Train(cmd) - - # Add BERT control symbols - tokens = [] - - with open(f"{bert_dir}/tokenizer.vocab", "r") as f: - f.readline() # skip first token - - # Read tokens from each line and parse for vocab - for line in f: - piece = line.split("\t")[0] - token = piece[1:] if piece.startswith("▁") else f"##{piece}" - tokens.append(token) - - vocab.extend(tokens) - - # Save vocabulary to output file - with open(f'{bert_dir}/vocab.txt', "w") as f: - for token in vocab: - f.write(f"{token}\n".format()) - return data_dir, f'{bert_dir}/tokenizer.model' - - -def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True): - """ - Reads dataset from file line by line, tokenizes each line with tokenizer, - and returns list of lists which corresponds to ids of tokenized strings. 
- - Args: - dataset: path to dataset - tokenizer: tokenizer to convert text into ids - cache_ids: if True, ids are saved to disk as pickle file - with similar name (e.g., data.txt --> data.txt.pkl) - add_bos_eos: bool, whether to add and symbols (e.g., for NMT) - Returns: - ids: list of ids which correspond to tokenized strings of the dataset - """ - - cached_ids_dataset = dataset + str(".pkl") - if os.path.isfile(cached_ids_dataset): - logging.info("Loading cached tokenized dataset ...") - ids = pickle.load(open(cached_ids_dataset, "rb")) - else: - logging.info("Tokenizing dataset ...") - data = open(dataset, "rb").readlines() - ids = [] - for sentence in data: - sent_ids = tokenizer.text_to_ids(sentence.decode("utf-8")) - if add_bos_eos: - sent_ids = [tokenizer.bos_id] + sent_ids + [tokenizer.eos_id] - ids.append(sent_ids) - if cache_ids: - logging.info("Caching tokenized dataset ...") - pickle.dump(ids, open(cached_ids_dataset, "wb")) - return ids - - -def create_vocab_lm(data_dir, do_lower_case): - if if_exist(data_dir, ['train.txt', 'vocab.txt']): - logging.info("Vocabulary has been created.") - with open(os.path.join(data_dir, 'vocab.txt'), 'r') as f: - vocab_size = len(f.readlines()) - return vocab_size - - logging.info(f'Creating vocabulary from training data at {data_dir}') - - with open(f'{data_dir}/train.txt', 'r') as f: - txt = f.read() - if do_lower_case: - txt = txt.lower() - lines = re.split(r'[\n]', txt) - sentences = [line.strip().split() for line in lines if line.strip()] - - vocab = {"[PAD]": 0, "[SEP]": 1, "[CLS]": 2, "[MASK]": 3} - idx = 4 - for sentence in sentences: - for word in sentence: - if word not in vocab: - vocab[word] = idx - idx += 1 - - with open(f'{data_dir}/vocab.txt', 'w') as f: - for word in sorted(vocab.keys()): - f.write(word + '\n') - logging.info(f"Created vocabulary of size {len(vocab)}") - - return len(vocab) + def create_vocab_lm(self, data_dir, do_lower_case): + if if_exist(data_dir, ['train.txt', 'vocab.txt']): + logging.info("Vocabulary has been created.") + with open(os.path.join(data_dir, 'vocab.txt'), 'r') as f: + vocab_size = len(f.readlines()) + return vocab_size + + logging.info(f'Creating vocabulary from training data at {data_dir}') + + with open(f'{data_dir}/train.txt', 'r') as f: + txt = f.read() + if do_lower_case: + txt = txt.lower() + lines = re.split(r'[\n]', txt) + sentences = [line.strip().split() for line in lines if line.strip()] + + vocab = {"[PAD]": 0, "[SEP]": 1, "[CLS]": 2, "[MASK]": 3} + idx = 4 + for sentence in sentences: + for word in sentence: + if word not in vocab: + vocab[word] = idx + idx += 1 + + with open(f'{data_dir}/vocab.txt', 'w') as f: + for word in sorted(vocab.keys()): + f.write(word + '\n') + logging.info(f"Created vocabulary of size {len(vocab)}") + + return len(vocab) diff --git a/nemo/collections/nlp/data/datasets/machine_translation_dataset.py b/nemo/collections/nlp/data/datasets/machine_translation_dataset.py index db8e6b7ace2d..3fe43c1f6820 100644 --- a/nemo/collections/nlp/data/datasets/machine_translation_dataset.py +++ b/nemo/collections/nlp/data/datasets/machine_translation_dataset.py @@ -21,7 +21,7 @@ import numpy as np from torch.utils.data import Dataset -from nemo.collections.nlp.data.datasets.lm_transformer_dataset import dataset_to_ids +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import dataset_to_ids __all__ = ['TranslationDataset'] @@ -36,7 +36,7 @@ def __init__(self, tokenizer_src, tokenizer_tgt, dataset_src, dataset_tgt, token src_ids = 
dataset_to_ids(dataset_src, tokenizer_src) tgt_ids = dataset_to_ids(dataset_tgt, tokenizer_tgt) if clean: - src_ids, tgt_ids = clean_src_and_target(src_ids, tgt_ids) + src_ids, tgt_ids = self.clean_src_and_target(src_ids, tgt_ids) self.batch_indices = self.pack_data_into_batches(src_ids, tgt_ids) self.batches = self.pad_batches(src_ids, tgt_ids, self.batch_indices) @@ -156,35 +156,36 @@ def pack_data_into_batches(self, src_ids, tgt_ids): return batches + def clean_src_and_target( + self, src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5 + ): + """ + Cleans source and target sentences to get rid of noisy data. + Specifically, a pair of sentences is removed if + -- either source or target is longer than *max_tokens* + -- either source or target is shorter than *min_tokens* + -- absolute difference between source and target is larger than + *max_tokens_diff* + -- one sentence is *max_tokens_ratio* times longer than the other + """ -def clean_src_and_target(src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5): - """ - Cleans source and target sentences to get rid of noisy data. - Specifically, a pair of sentences is removed if - -- either source or target is longer than *max_tokens* - -- either source or target is shorter than *min_tokens* - -- absolute difference between source and target is larger than - *max_tokens_diff* - -- one sentence is *max_tokens_ratio* times longer than the other - """ - - if len(src_ids) != len(tgt_ids): - raise ValueError("Source and target corpora have different lengths!") - src_ids_, tgt_ids_ = [], [] - for i in range(len(src_ids)): - src_len, tgt_len = len(src_ids[i]), len(tgt_ids[i]) - if ( - src_len > max_tokens - or tgt_len > max_tokens - or src_len < min_tokens - or tgt_len < min_tokens - or (src_ids[i] == tgt_ids[i]) - or np.abs(src_len - tgt_len) > max_tokens_diff - ): - continue - ratio = max(src_len - 2, 1) / max(tgt_len - 2, 1) - if ratio > max_tokens_ratio or ratio < (1 / max_tokens_ratio): - continue - src_ids_.append(src_ids[i]) - tgt_ids_.append(tgt_ids[i]) - return src_ids_, tgt_ids_ + if len(src_ids) != len(tgt_ids): + raise ValueError("Source and target corpora have different lengths!") + src_ids_, tgt_ids_ = [], [] + for i in range(len(src_ids)): + src_len, tgt_len = len(src_ids[i]), len(tgt_ids[i]) + if ( + src_len > max_tokens + or tgt_len > max_tokens + or src_len < min_tokens + or tgt_len < min_tokens + or (src_ids[i] == tgt_ids[i]) + or np.abs(src_len - tgt_len) > max_tokens_diff + ): + continue + ratio = max(src_len - 2, 1) / max(tgt_len - 2, 1) + if ratio > max_tokens_ratio or ratio < (1 / max_tokens_ratio): + continue + src_ids_.append(src_ids[i]) + tgt_ids_.append(tgt_ids[i]) + return src_ids_, tgt_ids_ diff --git a/nemo/collections/nlp/data/datasets/state_tracking_trade_dataset.py b/nemo/collections/nlp/data/datasets/multiwoz_dataset.py similarity index 99% rename from nemo/collections/nlp/data/datasets/state_tracking_trade_dataset.py rename to nemo/collections/nlp/data/datasets/multiwoz_dataset.py index 9358c79d16d6..17690034fc93 100644 --- a/nemo/collections/nlp/data/datasets/state_tracking_trade_dataset.py +++ b/nemo/collections/nlp/data/datasets/multiwoz_dataset.py @@ -170,7 +170,7 @@ def __getitem__(self, idx): class Vocab: """ - Vocab class for TRADE model + Vocab class for MultiWOZ dataset UNK_token = 0 PAD_token = 1 SOS_token = 3 diff --git a/nemo/collections/nlp/data/datasets/punctuation_capitalization_dataset.py 
b/nemo/collections/nlp/data/datasets/punctuation_capitalization_dataset.py index b2df10907304..3d99470a9ac7 100644 --- a/nemo/collections/nlp/data/datasets/punctuation_capitalization_dataset.py +++ b/nemo/collections/nlp/data/datasets/punctuation_capitalization_dataset.py @@ -1,6 +1,5 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,24 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" -Utility functions for Token Classification NLP tasks -Some parts of this code were adapted from the HuggingFace library at -https://github.com/huggingface/pytorch-pretrained-BERT -""" +# ============================================================================= __all__ = ['BertPunctuationCapitalizationDataset', 'BertPunctuationCapitalizationInferDataset'] import itertools import os import pickle -import random import numpy as np from torch.utils.data import Dataset from nemo import logging -from nemo.collections.nlp.data.datasets import datasets_utils as utils +from nemo.collections.nlp.data.datasets.datasets_utils import get_label_stats, get_stats def get_features( @@ -49,7 +43,7 @@ def get_features( Args: queries (list of str): text sequences max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] - tokenizer (Tokenizer): such as NemoBertTokenizer + tokenizer (TokenizerSpec): such as NemoBertTokenizer pad_label (str): pad value use for labels. by default, it's the neutral label. punct_label_ids (dict): dict to map punctuation labels to label ids. @@ -126,7 +120,7 @@ def get_features( max_seq_length = min(max_seq_length, max(sent_lengths)) logging.info(f'Max length: {max_seq_length}') - utils.get_stats(sent_lengths) + get_stats(sent_lengths) too_long_count = 0 for i, subtokens in enumerate(all_subtokens): @@ -202,7 +196,6 @@ class BertPunctuationCapitalizationDataset(Dataset): tokenizer (Tokenizer): such as NemoBertTokenizer num_samples (int): number of samples you want to use for the dataset. If -1, use all dataset. Useful for testing. - shuffle (bool): whether to shuffle your data. pad_label (str): pad value use for labels. by default, it's the neutral label. 
punct_label_ids and capit_label_ids (dict): @@ -224,7 +217,6 @@ def __init__( max_seq_length, tokenizer, num_samples=-1, - shuffle=False, pad_label='O', punct_label_ids=None, capit_label_ids=None, @@ -275,17 +267,15 @@ def __init__( if len(punct_labels_lines) != len(text_lines): raise ValueError("Labels file should contain labels for every word") - if shuffle or num_samples > 0: - dataset = list(zip(text_lines, punct_labels_lines, capit_labels_lines)) - random.shuffle(dataset) + dataset = list(zip(text_lines, punct_labels_lines, capit_labels_lines)) - if num_samples > 0: - dataset = dataset[:num_samples] + if num_samples > 0: + dataset = dataset[:num_samples] - dataset = list(zip(*dataset)) - text_lines = dataset[0] - punct_labels_lines = dataset[1] - capit_labels_lines = dataset[2] + dataset = list(zip(*dataset)) + text_lines = dataset[0] + punct_labels_lines = dataset[1] + capit_labels_lines = dataset[2] # for dev/test sets use label mapping from training set if punct_label_ids: @@ -351,7 +341,7 @@ def get_stats_and_save(all_labels, label_ids, name): infold = text_file[: text_file.rfind('/')] merged_labels = itertools.chain.from_iterable(all_labels) logging.info('Three most popular labels') - _, label_frequencies = utils.get_label_stats(merged_labels, infold + '/label_count_' + name + '.tsv') + _, label_frequencies = get_label_stats(merged_labels, infold + '/label_count_' + name + '.tsv') out = open(os.path.join(infold, name + '_label_ids.csv'), 'w') labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) diff --git a/nemo/collections/nlp/data/datasets/qa_squad_dataset.py b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_dataset.py similarity index 62% rename from nemo/collections/nlp/data/datasets/qa_squad_dataset.py rename to nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_dataset.py index b02d0036dc18..318cf6bcdb90 100644 --- a/nemo/collections/nlp/data/datasets/qa_squad_dataset.py +++ b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_dataset.py @@ -1,20 +1,37 @@ -""" -Copyright 2018 The Google AI Language Team Authors and -The HuggingFace Inc. team. -Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" import collections import json import os @@ -26,7 +43,9 @@ from tqdm import tqdm from nemo import logging -from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import DataProcessor +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import is_whitespace +from nemo.collections.nlp.data.datasets.datasets_utils.datasets_processing import DataProcessor +from nemo.collections.nlp.data.datasets.qa_squad_dataset.qa_squad_processing import convert_examples_to_features from nemo.collections.nlp.metrics.squad_metrics import ( _get_best_indexes, apply_no_ans_threshold, @@ -36,10 +55,9 @@ get_final_text, make_eval_dict, merge_eval, - normalize_answer, ) -from nemo.collections.nlp.utils.common_nlp_utils import _is_whitespace -from nemo.collections.nlp.utils.loss_utils import _compute_softmax +from nemo.collections.nlp.utils.data_utils import normalize_answer +from nemo.collections.nlp.utils.functional_utils import _compute_softmax __all__ = ['SquadDataset'] @@ -410,209 +428,6 @@ def evaluate( return exact_match, f1, all_predictions -def convert_examples_to_features( - examples, tokenizer, max_seq_length, doc_stride, max_query_length, has_groundtruth, -): - """Loads a data file into a list of `InputBatch`s.""" - - unique_id = 1000000000 - - features = [] - for (example_index, example) in enumerate(examples): - query_tokens = tokenizer.text_to_tokens(example.question_text) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - # context: index of token -> index of word - tok_to_orig_index = [] - # context: index of word -> index of first token in token list - orig_to_tok_index = [] - # context without white spaces after tokenization - all_doc_tokens = [] - # doc tokens is word separated context - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.text_to_tokens(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - # idx of query token start and end in context - tok_start_position = None - tok_end_position = None - if has_groundtruth and example.is_impossible: - tok_start_position = -1 - tok_end_position = -1 - if has_groundtruth and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - - (tok_start_position, tok_end_position) = _improve_answer_span( - 
all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text - ) - - # The -3 accounts for tokenizer.cls_token, tokenizer.sep_token and tokenizer.eos_token - # doc_spans contains all possible contexts options of given length - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - # maps context tokens idx in final input -> word idx in context - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - tokens.append(tokenizer.bos_token) - segment_ids.append(0) - for token in query_tokens: - tokens.append(token) - segment_ids.append(0) - tokens.append(tokenizer.sep_token) - segment_ids.append(0) - - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - segment_ids.append(1) - tokens.append(tokenizer.eos_token) - segment_ids.append(1) - - input_ids = tokenizer.tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. - # Only real tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_length: - input_ids.append(tokenizer.pad_id) - input_mask.append(0) - segment_ids.append(0) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - # calculate start and end position in final array - # of tokens in answer if no answer, - # 0 for both pointing to tokenizer.cls_token - start_position = None - end_position = None - if has_groundtruth and not example.is_impossible: - doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - out_of_span = False - if not (tok_start_position >= doc_start and tok_end_position <= doc_end): - out_of_span = True - if out_of_span: - start_position = 0 - end_position = 0 - else: - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - if has_groundtruth and example.is_impossible: - # if our document chunk does not contain - # an annotation we throw it out, since there is nothing - # to predict. 
- start_position = 0 - end_position = 0 - - if example_index < 1: - logging.info("*** Example ***") - logging.info("unique_id: %s" % (unique_id)) - logging.info("example_index: %s" % (example_index)) - logging.info("doc_span_index: %s" % (doc_span_index)) - logging.info("tokens: %s" % " ".join(tokens)) - logging.info( - "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]) - ) - logging.info( - "token_is_max_context: %s" - % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]) - ) - logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - if has_groundtruth and example.is_impossible: - logging.info("impossible example") - if has_groundtruth and not example.is_impossible: - answer_text = " ".join(tokens[start_position : (end_position + 1)]) - logging.info("start_position: %d" % (start_position)) - logging.info("end_position: %d" % (end_position)) - logging.info("answer: %s" % (answer_text)) - - features.append( - InputFeatures( - unique_id=unique_id, - example_index=example_index, - doc_span_index=doc_span_index, - tokens=tokens, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - start_position=start_position, - end_position=end_position, - is_impossible=example.is_impossible, - ) - ) - unique_id += 1 - - return features - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__( - self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - start_position=None, - end_position=None, - is_impossible=None, - ): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - self.token_is_max_context = token_is_max_context - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - class SquadProcessor(DataProcessor): """ Processor for the SQuAD data set. 
@@ -723,7 +538,7 @@ def __init__( # char_to_word_offset = [0, 0, 0, 1, 1] # doc_tokens = ["hi", "yo"] for c in self.context_text: - if _is_whitespace(c): + if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: @@ -743,79 +558,3 @@ def __init__( self.end_position = char_to_word_offset[ min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1) ] - - -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): - """Returns tokenized answer spans that - better match the annotated answer.""" - tok_answer_text = " ".join(tokenizer.text_to_tokens(orig_answer_text)) - - for new_start in range(input_start, input_end + 1): - for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) - - return (input_start, input_end) - - -def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - -def check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token. - - Because of the sliding window approach taken to scoring documents, - a single token can appear in multiple documents. - - Example: - Doc: the man went to the store and bought a gallon of milk - Span A: the man went to the - Span B: to the store and bought - Span C: and bought a gallon of - ... - - Now the word 'bought' will have two scores from spans B and C. We only - want to consider the score with "maximum context", which we define as - the *minimum* of its left and right context (the *sum* of left and - right context will always be the same, of course). - - In the example the maximum context for 'bought' would be span C since - it has 1 left context and 3 right context, while span B has 4 left context - and 0 right context. - - Code adapted from the code by the Google AI and HuggingFace. - """ - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index diff --git a/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_processing.py b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_processing.py new file mode 100644 index 000000000000..57b3db90c6c9 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/qa_squad_dataset/qa_squad_processing.py @@ -0,0 +1,296 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import collections + +from nemo import logging + + +def convert_examples_to_features( + examples, tokenizer, max_seq_length, doc_stride, max_query_length, has_groundtruth, +): + """Loads a data file into a list of `InputBatch`s.""" + + unique_id = 1000000000 + + features = [] + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.text_to_tokens(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + # context: index of token -> index of word + tok_to_orig_index = [] + # context: index of word -> index of first token in token list + orig_to_tok_index = [] + # context without white spaces after tokenization + all_doc_tokens = [] + # doc tokens is word separated context + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.text_to_tokens(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + # idx of query token start and end in context + tok_start_position = None + tok_end_position = None + if has_groundtruth and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if has_groundtruth and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text + ) + + # The -3 accounts for tokenizer.cls_token, tokenizer.sep_token and tokenizer.eos_token + # doc_spans contains all possible contexts options of given length + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length 
= len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + # maps context tokens idx in final input -> word idx in context + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append(tokenizer.bos_token) + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append(tokenizer.sep_token) + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append(tokenizer.eos_token) + segment_ids.append(1) + + input_ids = tokenizer.tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. + # Only real tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(tokenizer.pad_id) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + # calculate start and end position in final array + # of tokens in answer if no answer, + # 0 for both pointing to tokenizer.cls_token + start_position = None + end_position = None + if has_groundtruth and not example.is_impossible: + doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + start_position = 0 + end_position = 0 + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + if has_groundtruth and example.is_impossible: + # if our document chunk does not contain + # an annotation we throw it out, since there is nothing + # to predict. 
+ start_position = 0 + end_position = 0 + + if example_index < 1: + logging.info("*** Example ***") + logging.info("unique_id: %s" % (unique_id)) + logging.info("example_index: %s" % (example_index)) + logging.info("doc_span_index: %s" % (doc_span_index)) + logging.info("tokens: %s" % " ".join(tokens)) + logging.info( + "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]) + ) + logging.info( + "token_is_max_context: %s" + % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]) + ) + logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + if has_groundtruth and example.is_impossible: + logging.info("impossible example") + if has_groundtruth and not example.is_impossible: + answer_text = " ".join(tokens[start_position : (end_position + 1)]) + logging.info("start_position: %d" % (start_position)) + logging.info("end_position: %d" % (end_position)) + logging.info("answer: %s" % (answer_text)) + + features.append( + InputFeatures( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + start_position=start_position, + end_position=end_position, + is_impossible=example.is_impossible, + ) + ) + unique_id += 1 + + return features + + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): + """Returns tokenized answer spans that + better match the annotated answer.""" + tok_answer_text = " ".join(tokenizer.text_to_tokens(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token. + + Because of the sliding window approach taken to scoring documents, + a single token can appear in multiple documents. + + Example: + Doc: the man went to the store and bought a gallon of milk + Span A: the man went to the + Span B: to the store and bought + Span C: and bought a gallon of + ... + + Now the word 'bought' will have two scores from spans B and C. We only + want to consider the score with "maximum context", which we define as + the *minimum* of its left and right context (the *sum* of left and + right context will always be the same, of course). + + In the example the maximum context for 'bought' would be span C since + it has 1 left context and 3 right context, while span B has 4 left context + and 0 right context. + + Code adapted from the code by the Google AI and HuggingFace. 
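Since the docstring above works through the 'bought' example verbally, a standalone sketch of the same scoring rule may help; token positions are assumed from whitespace-splitting the example document, and the three spans correspond to spans A, B, and C above.

```python
# Walk through the "maximum context" score (min(left, right) + 0.01 * span length)
# for the token 'bought' in the docstring example.
import collections

DocSpan = collections.namedtuple('DocSpan', ['start', 'length'])

# Doc: the man went to the store and bought a gallon of milk
doc_spans = [DocSpan(0, 5), DocSpan(3, 5), DocSpan(6, 5)]  # spans A, B, C
position = 7  # index of 'bought' in the whitespace-split document

scores = []
for span in doc_spans:
    end = span.start + span.length - 1
    if not (span.start <= position <= end):
        scores.append(None)  # token does not appear in this span
        continue
    left = position - span.start
    right = end - position
    scores.append(min(left, right) + 0.01 * span.length)

print(scores)  # [None, 0.05, 1.05] -> span C gives 'bought' its maximum context
```

This mirrors the per-token decision made by _check_is_max_context: only the span with the highest score keeps that token's prediction.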
+ """ + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__( + self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + start_position=None, + end_position=None, + is_impossible=None, + ): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible diff --git a/nemo/collections/nlp/data/datasets/text_classification_dataset.py b/nemo/collections/nlp/data/datasets/text_classification_dataset.py index 11340ffa4da5..ae589641ffc3 100644 --- a/nemo/collections/nlp/data/datasets/text_classification_dataset.py +++ b/nemo/collections/nlp/data/datasets/text_classification_dataset.py @@ -1,6 +1,5 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,31 +12,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# ============================================================================= -""" -Utility functions for Token Classification NLP tasks -Some parts of this code were adapted from the HuggingFace library at -https://github.com/huggingface/pytorch-pretrained-BERT -""" - -import random import numpy as np from torch.utils.data import Dataset from nemo import logging from nemo.collections.nlp.data.datasets.datasets_utils import ( - get_intent_labels, - get_label_stats, - get_stats, process_imdb, process_jarvis_datasets, process_nlu, process_sst_2, process_thucnews, ) +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import ( + calc_class_weights, + get_intent_labels, + get_label_stats, + get_stats, + if_exist, +) from nemo.collections.nlp.utils.callback_utils import list2str -from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights, if_exist __all__ = ['BertTextClassificationDataset'] @@ -54,10 +50,9 @@ class BertTextClassificationDataset(Dataset): tokenizer (Tokenizer): such as BertTokenizer num_samples (int): number of samples you want to use for the dataset. If -1, use all dataset. Useful for testing. - shuffle (bool): whether to shuffle your data. 
""" - def __init__(self, input_file, max_seq_length, tokenizer, num_samples=-1, shuffle=True): + def __init__(self, input_file, max_seq_length, tokenizer, num_samples=-1): with open(input_file, "r") as f: sent_labels, all_sent_subtokens = [], [] sent_lengths = [] @@ -66,11 +61,8 @@ def __init__(self, input_file, max_seq_length, tokenizer, num_samples=-1, shuffl lines = f.readlines()[1:] logging.info(f'{input_file}: {len(lines)}') - if shuffle or num_samples > -1: - random.seed(0) - random.shuffle(lines) - if num_samples > 0: - lines = lines[:num_samples] + if num_samples > 0: + lines = lines[:num_samples] for index, line in enumerate(lines): if index % 20000 == 0: @@ -177,7 +169,7 @@ def __init__(self, sent_id, sent_label, input_ids, input_mask, segment_ids): self.segment_ids = segment_ids -class SentenceClassificationDataDesc: +class TextClassificationDataDesc: def __init__(self, dataset_name, data_dir, do_lower_case): if dataset_name == 'sst-2': self.data_dir = process_sst_2(data_dir) diff --git a/nemo/collections/nlp/data/datasets/token_classification_dataset.py b/nemo/collections/nlp/data/datasets/token_classification_dataset.py index cac15d50d2c5..f57073236377 100644 --- a/nemo/collections/nlp/data/datasets/token_classification_dataset.py +++ b/nemo/collections/nlp/data/datasets/token_classification_dataset.py @@ -1,3 +1,20 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= # Copyright 2018 The Google AI Language Team Authors and # The HuggingFace Inc. team. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. @@ -13,6 +30,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# ============================================================================= """ Utility functions for Token Classification NLP tasks @@ -23,13 +41,12 @@ import itertools import os import pickle -import random import numpy as np from torch.utils.data import Dataset from nemo import logging -from nemo.collections.nlp.data.datasets import datasets_utils +from nemo.collections.nlp.data.datasets.datasets_utils.data_preprocessing import get_label_stats, get_stats __all__ = ['BertTokenClassificationDataset', 'BertTokenClassificationInferDataset'] @@ -113,7 +130,7 @@ def get_features( max_seq_length = min(max_seq_length, max(sent_lengths)) logging.info(f'Max length: {max_seq_length}') - datasets_utils.get_stats(sent_lengths) + get_stats(sent_lengths) too_long_count = 0 for i, subtokens in enumerate(all_subtokens): @@ -175,7 +192,6 @@ class BertTokenClassificationDataset(Dataset): tokenizer (Tokenizer): such as NemoBertTokenizer num_samples (int): number of samples you want to use for the dataset. 
If -1, use all dataset. Useful for testing. - shuffle (bool): whether to shuffle your data. pad_label (str): pad value use for labels. by default, it's the neutral label. label_ids (dict): label_ids (dict): dict to map labels to label ids. @@ -196,7 +212,6 @@ def __init__( max_seq_length, tokenizer, num_samples=-1, - shuffle=False, pad_label='O', label_ids=None, ignore_extra_tokens=False, @@ -241,12 +256,9 @@ def __init__( if len(labels_lines) != len(text_lines): raise ValueError("Labels file should contain labels for every word") - if shuffle or num_samples > 0: + if num_samples > 0: dataset = list(zip(text_lines, labels_lines)) - random.shuffle(dataset) - - if num_samples > 0: - dataset = dataset[:num_samples] + dataset = dataset[:num_samples] dataset = list(zip(*dataset)) text_lines = dataset[0] @@ -308,7 +320,7 @@ def __init__( infold = text_file[: text_file.rfind('/')] merged_labels = itertools.chain.from_iterable(self.all_labels) logging.info('Three most popular labels') - _, self.label_frequencies = datasets_utils.get_label_stats(merged_labels, infold + '/label_stats.tsv') + _, self.label_frequencies = get_label_stats(merged_labels, infold + '/label_stats.tsv') # save label_ids out = open(infold + '/label_ids.csv', 'w') diff --git a/nemo/collections/nlp/metrics/bleu.py b/nemo/collections/nlp/metrics/bleu.py index bab9c5f4c0f6..a49eb0a8c10b 100644 --- a/nemo/collections/nlp/metrics/bleu.py +++ b/nemo/collections/nlp/metrics/bleu.py @@ -1,3 +1,20 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= # Copyright 2017 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +29,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== + """Python implementation of BLEU and smooth-BLEU. This module provides a Python implementation of BLEU and smooth-BLEU. Smooth BLEU is computed following the method outlined in the paper: diff --git a/nemo/collections/nlp/metrics/sacrebleu.py b/nemo/collections/nlp/metrics/sacrebleu.py index 586b19bf2d30..5130dd9633ca 100755 --- a/nemo/collections/nlp/metrics/sacrebleu.py +++ b/nemo/collections/nlp/metrics/sacrebleu.py @@ -1,6 +1,23 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= # Copyright 2017--2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You may not @@ -13,6 +30,8 @@ # an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either # express or implied. See the License for the specific language governing # permissions and limitations under the License. +# ============================================================================= + """ SacreBLEU provides hassle-free computation of shareable, comparable, and reproducible BLEU scores. Inspired by Rico Sennrich's `multi-bleu-detok.perl`, it produces the official WMT scores but works with plain text. diff --git a/nemo/collections/nlp/metrics/squad_metrics.py b/nemo/collections/nlp/metrics/squad_metrics.py index e5f0af1e2517..f8a89c9ead35 100644 --- a/nemo/collections/nlp/metrics/squad_metrics.py +++ b/nemo/collections/nlp/metrics/squad_metrics.py @@ -1,27 +1,43 @@ -""" -Copyright 2018 The Google AI Language Team Authors and -The HuggingFace Inc. team. -Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= import collections from transformers.tokenization_bert import BasicTokenizer from nemo import logging -from nemo.collections.nlp.data.datasets.datasets_utils import get_tokens, normalize_answer +from nemo.collections.nlp.utils.data_utils import get_tokens, normalize_answer __all__ = [ 'f1_score', @@ -31,7 +47,6 @@ 'merge_eval', 'find_all_best_thresh', 'find_best_thresh', - 'normalize_answer', '_get_best_indexes', 'get_final_text', ] diff --git a/nemo/collections/nlp/nm/data_layers/glue_benchmark_datalayer.py b/nemo/collections/nlp/nm/data_layers/glue_benchmark_datalayer.py index dca9324b7817..b9f1fc2c1638 100644 --- a/nemo/collections/nlp/nm/data_layers/glue_benchmark_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/glue_benchmark_datalayer.py @@ -30,6 +30,14 @@ class GlueClassificationDataLayer(TextDataLayer): All the data processing is done in GLUEDataset. Args: + data_dir (str): data directory path + tokenizer (TokenizerSpec): text tokenizer. + max_seq_length (int): maximum allowed length of the text segments . + processor (DataProcessor): data processor. + evaluate (bool): true if data layer is used for evaluation. Default: False. + token_params (dict): dictionary that specifies special tokens. + batch_size (int): batch size in segments + shuffle (bool): whether to shuffle data or not. Default: False. dataset_type (GLUEDataset): the dataset that needs to be converted to DataLayerNM """ @@ -38,12 +46,17 @@ class GlueClassificationDataLayer(TextDataLayer): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + + input_ids: + indices of tokens which constitute batches of text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + labels: + integer indices for sentence classication prediction """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(CategoricalTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), @@ -82,6 +95,14 @@ class GlueRegressionDataLayer(TextDataLayer): All the data processing is done in GLUEDataset. Args: + data_dir (str): data directory path + tokenizer (TokenizerSpec): text tokenizer. + max_seq_length (int): maximum allowed length of the text segments . + processor (DataProcessor): data processor. + evaluate (bool): true if data layer is used for evaluation. Default: False. + token_params (dict): dictionary that specifies special tokens. + batch_size (int): batch size in segments + shuffle (bool): whether to shuffle data or not. Default: False. 
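# Illustrative sketch of the port-declaration pattern used throughout this patch:
# the commented-out AxisType dictionaries are replaced by NeuralType(axes, element_type)
# from the new Neural Type System. Assumes this repository's `nemo` package is installed.
from nemo.core import ChannelType, LabelsType, NeuralType

example_classification_ports = {
    "input_ids": NeuralType(('B', 'T'), ChannelType()),       # batch x time token indices
    "input_type_ids": NeuralType(('B', 'T'), ChannelType()),  # segment ids (0/1)
    "input_mask": NeuralType(('B', 'T'), ChannelType()),      # padding mask
    "labels": NeuralType(tuple('B'), LabelsType()),           # one label per example
}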
dataset_type (GLUEDataset): the dataset that needs to be converted to DataLayerNM """ @@ -90,12 +111,17 @@ class GlueRegressionDataLayer(TextDataLayer): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + + input_ids: + indices of tokens which constitute batches of text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + labels: + float for sentence regression prediction """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(RegressionTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), diff --git a/nemo/collections/nlp/nm/data_layers/joint_intent_slot_datalayer.py b/nemo/collections/nlp/nm/data_layers/joint_intent_slot_datalayer.py index df3731cfa454..e9bec213d3e2 100644 --- a/nemo/collections/nlp/nm/data_layers/joint_intent_slot_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/joint_intent_slot_datalayer.py @@ -16,7 +16,7 @@ from nemo.collections.nlp.data import BertJointIntentSlotDataset, BertJointIntentSlotInferDataset from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer -from nemo.core import ChannelType, NeuralType +from nemo.core import ChannelType, LabelsType, MaskType, NeuralType from nemo.utils.decorators import add_port_docs __all__ = ['BertJointIntentSlotDataLayer', 'BertJointIntentSlotInferDataLayer'] @@ -29,38 +29,53 @@ class BertJointIntentSlotDataLayer(TextDataLayer): All the data processing is done in BertJointIntentSlotDataset. - input_mask: used to ignore some of the input tokens like paddings - - loss_mask: used to mask and ignore tokens in the loss function - - subtokens_mask: used to ignore the outputs of unwanted tokens in - the inference and evaluation like the start and end tokens - Args: - dataset (BertJointIntentSlotDataset): + input_file (str): + data file + slot_file (str): + file to slot labels, each line corresponding to + slot labels for a sentence in input_file. No header. + pad_label (int): pad value use for slot labels + tokenizer (TokenizerSpec): text tokenizer. + max_seq_length (int): + max sequence length minus 2 for [CLS] and [SEP] + dataset_type (BertJointIntentSlotDataset): the dataset that needs to be converted to DataLayerNM + shuffle (bool): whether to shuffle data or not. Default: False. + batch_size: text segments batch size + ignore_extra_tokens (bool): whether or not to ignore extra tokens + ignore_start_end (bool)": whether or not to ignore start and end """ @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
+ + input_ids: + indices of tokens which constitute batches of text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + loss_mask: + used to mask and ignore tokens in the loss function + subtokens_mask: + used to ignore the outputs of unwanted tokens in + the inference and evaluation like the start and end tokens + intents: + TODO + slots: + TODO """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "intents": NeuralType({0: AxisType(BatchTag)}), - # "slots": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), - "loss_mask": NeuralType(('B', 'T'), ChannelType()), + "loss_mask": NeuralType(('B', 'T'), MaskType()), "subtokens_mask": NeuralType(('B', 'T'), ChannelType()), - "intents": NeuralType(tuple('B'), ChannelType()), - "slots": NeuralType(('B', 'T'), ChannelType()), + "intents": NeuralType(tuple('B'), LabelsType()), + "slots": NeuralType(('B', 'T'), LabelsType()), } def __init__( @@ -84,11 +99,10 @@ def __init__( 'tokenizer': tokenizer, 'max_seq_length': max_seq_length, 'num_samples': num_samples, - 'shuffle': shuffle, 'ignore_extra_tokens': ignore_extra_tokens, 'ignore_start_end': ignore_start_end, } - super().__init__(dataset_type, dataset_params, batch_size, shuffle) + super().__init__(dataset_type, dataset_params, batch_size, shuffle=shuffle) class BertJointIntentSlotInferDataLayer(TextDataLayer): @@ -98,29 +112,36 @@ class BertJointIntentSlotInferDataLayer(TextDataLayer): All the data processing is done in BertJointIntentSlotInferDataset. - input_mask: used to ignore some of the input tokens like paddings - - loss_mask: used to mask and ignore tokens in the loss function - - subtokens_mask: used to ignore the outputs of unwanted tokens in - the inference and evaluation like the start and end tokens - Args: - dataset (BertJointIntentSlotInferDataset): + queries: + TODO + tokenizer (TokenizerSpec): text tokenizer. + max_seq_length (int): + max sequence length minus 2 for [CLS] and [SEP] + dataset_type (BertJointIntentSlotDataset): the dataset that needs to be converted to DataLayerNM + shuffle (bool): whether to shuffle data or not. Default: False. + batch_size: text segments batch size """ @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
+ + input_ids: + indices of tokens which constitute batches of text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + loss_mask: + used to mask and ignore tokens in the loss function + subtokens_mask: + used to ignore the outputs of unwanted tokens in + the inference and evaluation like the start and end tokens """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), @@ -128,6 +149,14 @@ def output_ports(self): "subtokens_mask": NeuralType(('B', 'T'), ChannelType()), } - def __init__(self, queries, tokenizer, max_seq_length, batch_size=1, dataset_type=BertJointIntentSlotInferDataset): + def __init__( + self, + queries, + tokenizer, + max_seq_length, + batch_size=1, + shuffle=False, + dataset_type=BertJointIntentSlotInferDataset, + ): dataset_params = {'queries': queries, 'tokenizer': tokenizer, 'max_seq_length': max_seq_length} - super().__init__(dataset_type, dataset_params, batch_size, shuffle=False) + super().__init__(dataset_type, dataset_params, batch_size, shuffle=shuffle) diff --git a/nemo/collections/nlp/nm/data_layers/lm_bert_datalayer.py b/nemo/collections/nlp/nm/data_layers/lm_bert_datalayer.py index 176a7cc67a59..c59134b9f271 100644 --- a/nemo/collections/nlp/nm/data_layers/lm_bert_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/lm_bert_datalayer.py @@ -25,7 +25,7 @@ from nemo.backends.pytorch import DataLayerNM from nemo.collections.nlp.data import BertPretrainingDataset, BertPretrainingPreprocessedDataset from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer -from nemo.core import ChannelType, LabelsType, NeuralType +from nemo.core import ChannelType, LabelsType, MaskType, NeuralType from nemo.utils.decorators import add_port_docs __all__ = ['BertPretrainingDataLayer', 'BertPretrainingPreprocessedDataLayer'] @@ -33,7 +33,7 @@ class BertPretrainingDataLayer(TextDataLayer): """ - Data layer for masked language modeling task. + Data layer for masked language modeling task for text data. Args: tokenizer (TokenizerSpec): tokenizer @@ -43,30 +43,36 @@ class BertPretrainingDataLayer(TextDataLayer): batch_size (int): batch size in segments short_seeq_prob (float): Probability of creating sequences which are shorter than the maximum length. - Defualts to 0.1. + Defaults to 0.1. + shuffle (bool): whether to shuffle data or not. Default: False. """ @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
+ input_ids: + indices of tokens which constitute batches of masked text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + output_ids: indices of tokens which constitute batches of unmasked text segments + output_mask: bool tensor with 0s in place of tokens to be masked + labels: 0 or 1 for next sentence prediction classification """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(BatchTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), - "output_ids": NeuralType(('B', 'T'), ChannelType()), - "output_mask": NeuralType(('B', 'T'), ChannelType()), + "output_ids": NeuralType(('B', 'T'), LabelsType()), + "output_mask": NeuralType(('B', 'T'), MaskType()), "labels": NeuralType(tuple('B'), LabelsType()), } - def __init__(self, tokenizer, dataset, max_seq_length, mask_probability, short_seq_prob=0.1, batch_size=64): + def __init__( + self, tokenizer, dataset, max_seq_length, mask_probability, short_seq_prob=0.1, batch_size=64, shuffle=False + ): dataset_params = { 'tokenizer': tokenizer, 'dataset': dataset, @@ -74,41 +80,40 @@ def __init__(self, tokenizer, dataset, max_seq_length, mask_probability, short_s 'mask_probability': mask_probability, 'short_seq_prob': short_seq_prob, } - super().__init__(BertPretrainingDataset, dataset_params, batch_size, shuffle=False) + super().__init__(BertPretrainingDataset, dataset_params, batch_size, shuffle=shuffle) class BertPretrainingPreprocessedDataLayer(DataLayerNM): """ - Data layer for masked language modeling task. + Data layer for masked language modeling task for preprocessed data. Args: - tokenizer (TokenizerSpec): tokenizer dataset (str): directory or a single file with dataset documents max_seq_length (int): maximum allowed length of the text segments - mask_probability (float): probability of masking input sequence tokens batch_size (int): batch size in segments - short_seeq_prob (float): Probability of creating sequences which are - shorter than the maximum length. - Defualts to 0.1. + training (bool): true if in training mode """ @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
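# Illustrative sketch of how the ports above are consumed downstream: `output_ids`
# holds the original (unmasked) token ids, and `output_mask` restricts the loss to
# the positions that were actually masked. Plain PyTorch with random data; this is
# not NeMo's loss implementation.
import torch
import torch.nn.functional as F

logits = torch.randn(2, 8, 100)                   # B x T x vocab
output_ids = torch.randint(0, 100, (2, 8))        # B x T original token ids
output_mask = (torch.rand(2, 8) < 0.15).float()   # 1.0 at masked positions

per_token = F.cross_entropy(logits.transpose(1, 2), output_ids, reduction='none')
mlm_loss = (per_token * output_mask).sum() / output_mask.sum().clamp(min=1.0)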
+ input_ids: + indices of tokens which constitute batches of masked text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + output_ids: indices of tokens which constitute batches of unmasked text segments + output_mask: bool tensor with 0s in place of tokens to be masked + labels: 0 or 1 for next sentence prediction classification """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(BatchTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), - "output_ids": NeuralType(('B', 'T'), ChannelType()), - "output_mask": NeuralType(('B', 'T'), ChannelType()), + "output_ids": NeuralType(('B', 'T'), LabelsType()), + "output_mask": NeuralType(('B', 'T'), MaskType()), "labels": NeuralType(tuple('B'), LabelsType()), } diff --git a/nemo/collections/nlp/nm/data_layers/lm_transformer_datalayer.py b/nemo/collections/nlp/nm/data_layers/lm_transformer_datalayer.py index a81cb1568c69..7c9df0695991 100644 --- a/nemo/collections/nlp/nm/data_layers/lm_transformer_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/lm_transformer_datalayer.py @@ -30,8 +30,12 @@ class LanguageModelingDataLayer(TextDataLayer): dataset (str): path to text document with data tokenizer (TokenizerSpec): tokenizer max_seq_length (int): maximum allowed length of the text segments + batch_size (int): batch size batch_step (int): how many tokens to skip between two successive segments of text when constructing batches + dataset_type (Dataset): + the underlying dataset. Default: LanguageModelingDataset + shuffle (bool): whether to shuffle data or not. Default: False. """ @property @@ -40,33 +44,26 @@ def output_ports(self): """Returns definitions of module output ports. 
input_ids: indices of tokens which constitute batches of text segments - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - input_mask: bool tensor with 0s in place of tokens to be masked - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - labels: indices of tokens which should be predicted from each of the corresponding tokens in input_ids; for left-to-right language modeling equals to input_ids shifted by 1 to the right - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), "labels": NeuralType(('B', 'T'), LabelsType()), } def __init__( - self, dataset, tokenizer, max_seq_length, batch_size, batch_step=128, dataset_type=LanguageModelingDataset + self, + dataset, + tokenizer, + max_seq_length, + batch_size, + batch_step=128, + dataset_type=LanguageModelingDataset, + shuffle=False, ): dataset_params = { 'dataset': dataset, @@ -74,4 +71,4 @@ def __init__( 'max_seq_length': max_seq_length, 'batch_step': batch_step, } - super().__init__(dataset_type, dataset_params, batch_size, shuffle=False) + super().__init__(dataset_type, dataset_params, batch_size, shuffle=shuffle) diff --git a/nemo/collections/nlp/nm/data_layers/machine_translation_datalayer.py b/nemo/collections/nlp/nm/data_layers/machine_translation_datalayer.py index 33fa833fa7a6..0ff83ae67b90 100644 --- a/nemo/collections/nlp/nm/data_layers/machine_translation_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/machine_translation_datalayer.py @@ -42,6 +42,8 @@ class TranslationDataLayer(TextDataLayer): pairs with big difference in sentences length, removing pairs with the same tokens in src and tgt, etc; useful for training data layer and should not be used in evaluation data layer + dataset_type (Dataset): + the underlying dataset. Default: TranslationDataset """ @property @@ -50,28 +52,17 @@ def output_ports(self): """Returns definitions of module output ports. 
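# Illustrative sketch of the "labels equal input_ids shifted by one" convention
# described above for left-to-right language modeling. Plain PyTorch, made-up ids.
import torch

tokens = torch.tensor([101, 7, 42, 13, 99, 102])
input_ids = tokens[:-1]   # the model reads t_0 .. t_{n-1}
labels = tokens[1:]       # and is trained to predict t_1 .. t_n at each position
assert torch.equal(labels[:-1], input_ids[1:])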
src_ids: indices of tokens which correspond to source sentences - src_mask: bool tensor with 0s in place of source tokens to be masked - tgt_ids: indices of tokens which correspond to target sentences - tgt_mask: bool tensor with 0s in place of target tokens to be masked - labels: indices of tokens which should be predicted from each of the corresponding target tokens in tgt_ids; for standard neural machine translation equals to tgt_ids shifted by 1 to the right - sent_ids: indices of the sentences in a batch; important for evaluation with external metrics, such as SacreBLEU """ return { - # "src_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "src_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "tgt_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "tgt_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "sent_ids": NeuralType({0: AxisType(BatchTag)}), "src_ids": NeuralType(('B', 'T'), ChannelType()), "src_mask": NeuralType(('B', 'T'), ChannelType()), "tgt_ids": NeuralType(('B', 'T'), ChannelType()), diff --git a/nemo/collections/nlp/nm/data_layers/punctuation_capitalization_datalayer.py b/nemo/collections/nlp/nm/data_layers/punctuation_capitalization_datalayer.py index 16de9a8956e7..10e943682e5a 100644 --- a/nemo/collections/nlp/nm/data_layers/punctuation_capitalization_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/punctuation_capitalization_datalayer.py @@ -16,30 +16,72 @@ from nemo.collections.nlp.data import BertPunctuationCapitalizationDataset from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer -from nemo.core import ChannelType, LabelsType, NeuralType +from nemo.core import ChannelType, LabelsType, MaskType, NeuralType from nemo.utils.decorators import add_port_docs __all__ = ['PunctuationCapitalizationDataLayer'] class PunctuationCapitalizationDataLayer(TextDataLayer): + """ + Data layer for punctuation and capitalization. + + Args: + text_file (str): file to sequences, each line should a sentence, + No header. + label_file (str): file to labels, each line corresponds to + word labels for a sentence in the text_file. No header. + tokenizer (TokenizerSpec): text tokenizer. + max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] + pad_label (str): ad value use for labels. + by default, it's the neutral label. + punct_label_ids (dict): + dict to map labels to label ids. + Starts with pad_label->0 and then increases in alphabetical order + For dev set use label_ids generated during training to support + cases when not all labels are present in the dev set. + For training set label_ids should be None. + capit_label_ids (dict): + dict to map labels to label ids. + Starts with pad_label->0 and then increases in alphabetical order + For dev set use label_ids generated during training to support + cases when not all labels are present in the dev set. + For training set label_ids should be None. + num_samples (int): + number of samples you want to use for the dataset. + If -1, use all dataset. Useful for testing. + shuffle (bool): whether to shuffle your data. + batch_size (int): batch size + ignore_extra_tokens (bool): whether to ignore extra tokens in + the loss_mask + ignore_start_end (bool): + whether to ignore bos and eos tokens in the loss_mask + use_cache (bool): whether to use data cache + dataset_type (Dataset): Default BertPunctuationCapitalizationDataset. 
+ """ + @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + input_ids: + indices of tokens which constitute batches of masked text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + loss_mask: + used to mask and ignore tokens in the loss function: indices of tokens which constitute batches of unmasked text segments + subtokens_mask: + used to mask all but the first subtoken of the work, could be useful during inference + punct_labels: punctuation label ids + capit_labels: capit_labels label ids """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "punct_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "capit_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), - "loss_mask": NeuralType(('B', 'T'), ChannelType()), + "loss_mask": NeuralType(('B', 'T'), MaskType()), "subtokens_mask": NeuralType(('B', 'T'), ChannelType()), "punct_labels": NeuralType(('B', 'T'), LabelsType()), "capit_labels": NeuralType(('B', 'T'), LabelsType()), @@ -68,7 +110,6 @@ def __init__( 'max_seq_length': max_seq_length, 'tokenizer': tokenizer, 'num_samples': num_samples, - 'shuffle': shuffle, 'pad_label': pad_label, 'punct_label_ids': punct_label_ids, 'capit_label_ids': capit_label_ids, @@ -76,4 +117,4 @@ def __init__( 'ignore_start_end': ignore_start_end, 'use_cache': use_cache, } - super().__init__(dataset_type, dataset_params, batch_size, shuffle) + super().__init__(dataset_type, dataset_params, batch_size, shuffle=shuffle) diff --git a/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py b/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py index f66af5d20962..c192972ba60b 100644 --- a/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py @@ -33,7 +33,7 @@ class BertQuestionAnsweringDataLayer(TextDataLayer): unanswerable questions. doc_stride (int): When splitting up a long document into chunks, how much stride to take between chunks. - max_query_length (iny): All training files which have a duration less + max_query_length (int): All training files which have a duration less than min_duration are dropped. Can't be used if the `utt2dur` file does not exist. Defaults to None. max_seq_length (int): All training files which have a duration more @@ -42,7 +42,7 @@ class BertQuestionAnsweringDataLayer(TextDataLayer): mode (str): Use "train" or "dev" to define between training and evaluation. batch_size (int): Batch size. Defaults to 64. - dataset_type (class): Question Answering class. + dataset_type (Dataset): Question Answering class. Defaults to SquadDataset. """ @@ -50,14 +50,17 @@ class BertQuestionAnsweringDataLayer(TextDataLayer): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
+ input_ids: + indices of tokens which constitute batches of masked text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + start_positions: indices of tokens which constitute start position of answer + end_positions: indices of tokens which constitute end position of answer + unique_ids: id of the Question answer example this instance belongs to """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "start_positions": NeuralType({0: AxisType(BatchTag)}), - # "end_positions": NeuralType({0: AxisType(BatchTag)}), - # "unique_ids": NeuralType({0: AxisType(BatchTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), diff --git a/nemo/collections/nlp/nm/data_layers/state_tracking_trade_datalayer.py b/nemo/collections/nlp/nm/data_layers/state_tracking_trade_datalayer.py index 02088916ac83..42ba379dc461 100644 --- a/nemo/collections/nlp/nm/data_layers/state_tracking_trade_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/state_tracking_trade_datalayer.py @@ -50,32 +50,52 @@ class MultiWOZDataLayer(TextDataLayer): + """ + Creates the data layer to use for State Tracking dataset MultiWOZ. + + Args: + data_dir (str): TODO + domains: TODO + all_domains: + TODO + vocab: + TODO + slots: + TODO + gating_dict: + TODO + num_samples: + TODO + batch_size: + TODO + mode: + TODO + shuffle: + TODO + num_workers: + TODO + input_dropout: + TODO + is_training: + TODO + dataset_type (Dataset): + TODO + """ + @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. - src_ids: ids of input sequences - src_lens: lengths of input sequences - tgt_ids: labels for the generator output - tgt_lens: lengths of the generator targets - gating_labels: labels for the gating head - turn_domain: list of the domains NeuralType(None) """ return { - # "src_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "src_lens": NeuralType({0: AxisType(BatchTag)}), - # "tgt_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(TimeTag)}), - # "tgt_lens": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), - # "gating_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), - # "turn_domain": NeuralType(None), "src_ids": NeuralType(('B', 'T'), ChannelType()), "src_lens": NeuralType(tuple('B'), LengthsType()), "tgt_ids": NeuralType(('B', 'D', 'T'), LabelsType()), diff --git a/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py b/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py index 2d6e60e0af58..d7f85408f894 100644 --- a/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py @@ -27,9 +27,16 @@ class BertSentenceClassificationDataLayer(TextDataLayer): Creates the data layer to use for the task of sentence classification with pretrained model. - All the data processing is done BertSentenceClassificationDataset. + All the data processing is done BertTextClassificationDataset. Args: + input_file (str): data file + tokenizer (TokenizerSpec): text tokenizer. 
+ max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] + num_samples: + TODO + shuffle (bool): whether to shuffle data or not. Default: False. + batch_size: text segments batch size dataset (BertTextClassificationDataset): the dataset that needs to be converted to DataLayerNM """ @@ -38,12 +45,15 @@ class BertSentenceClassificationDataLayer(TextDataLayer): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + input_ids: + indices of tokens which constitute batches of masked text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + labels: sequence classification id """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(BatchTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), @@ -65,6 +75,5 @@ def __init__( 'tokenizer': tokenizer, 'max_seq_length': max_seq_length, 'num_samples': num_samples, - 'shuffle': shuffle, } - super().__init__(dataset_type, dataset_params, batch_size, shuffle) + super().__init__(dataset_type, dataset_params, batch_size, shuffle=shuffle) diff --git a/nemo/collections/nlp/nm/data_layers/text_datalayer.py b/nemo/collections/nlp/nm/data_layers/text_datalayer.py index 1b02cb4c1f16..e18da9f0d721 100644 --- a/nemo/collections/nlp/nm/data_layers/text_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/text_datalayer.py @@ -25,8 +25,10 @@ class TextDataLayer(DataLayerNM): Generic Text Data Layer NM which wraps PyTorch's dataset Args: - dataset_type: type of dataset used for this datalayer + dataset_type (Dataset): type of dataset used for this datalayer dataset_params (dict): all the params for the dataset + batch_size (int): sequence batch size + shuffle (bool): whether to shuffle data """ def __init__(self, dataset_type, dataset_params, batch_size, shuffle=False): diff --git a/nemo/collections/nlp/nm/data_layers/token_classification_datalayer.py b/nemo/collections/nlp/nm/data_layers/token_classification_datalayer.py index 8110fcf16e1b..3cd1256ef54a 100644 --- a/nemo/collections/nlp/nm/data_layers/token_classification_datalayer.py +++ b/nemo/collections/nlp/nm/data_layers/token_classification_datalayer.py @@ -16,29 +16,70 @@ from nemo.collections.nlp.data import BertTokenClassificationDataset, BertTokenClassificationInferDataset from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer -from nemo.core import ChannelType, LabelsType, NeuralType +from nemo.core import ChannelType, LabelsType, MaskType, NeuralType from nemo.utils.decorators import add_port_docs __all__ = ['BertTokenClassificationDataLayer', 'BertTokenClassificationInferDataLayer'] class BertTokenClassificationDataLayer(TextDataLayer): + """ + Creates the data layer to use for the task of token classification + with pretrained model. + + All the data processing is done BertTokenClassificationDataset. + text_file (str): + file to sequences, each line should a sentence, + No header. + label_file (str): + file to labels, each line corresponds to word labels for a sentence in the text_file. No header. + pad_label (int): + d value use for labels. + by default, it's the neutral label. 
+ tokenizer (TokenizerSpec): text tokenizer. + max_seq_length (int): + max sequence length minus 2 for [CLS] and [SEP] + label_ids: + dict to map labels to label ids. + Starts with pad_label->0 and then increases in alphabetical order + For dev set use label_ids generated during training to support + cases when not all labels are present in the dev set. + For training set label_ids should be None. + num_samples (int): + number of samples you want to use for the dataset. + If -1, use all dataset. Useful for testing. + shuffle (bool): whether to shuffle data or not. Default: False. + batch_size (int): text segments batch size + ignore_extra_tokens (bool): whether or not to ignore extra tokens + ignore_start_end (bool): whether or not to ignore start and end + use_cache: + whether to use data cache + dataset_type (BertTokenClassificationDataset): + the dataset that needs to be converted to DataLayerNM + """ + @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + input_ids: + indices of tokens which constitute batches of text segments + input_type_ids: + tensor with 0's and 1's to denote the text segment type + input_mask: + bool tensor with 0s in place of tokens to be masked + loss_mask: + used to mask and ignore tokens in the loss function + subtokens_mask: + used to mask all but the first subtoken of the work, could be useful during inference + labels: + token target ids """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), - "loss_mask": NeuralType(('B', 'T'), ChannelType()), + "loss_mask": NeuralType(('B', 'T'), MaskType()), "subtokens_mask": NeuralType(('B', 'T'), ChannelType()), "labels": NeuralType(('B', 'T'), LabelsType()), } @@ -65,7 +106,6 @@ def __init__( 'max_seq_length': max_seq_length, 'tokenizer': tokenizer, 'num_samples': num_samples, - 'shuffle': shuffle, 'pad_label': pad_label, 'label_ids': label_ids, 'ignore_extra_tokens': ignore_extra_tokens, @@ -76,17 +116,25 @@ def __init__( class BertTokenClassificationInferDataLayer(TextDataLayer): + """ + All the data processing is done BertTokenClassificationInferDataset. + queries: + (list of str): quiries to run inference on + tokenizer (TokenizerSpec): text tokenizer. + max_seq_length (int): + max sequence length minus 2 for [CLS] and [SEP] + shuffle (bool): whether to shuffle data or not. Default: False. + batch_size: text segments batch size + dataset_type (BertTokenClassificationInferDataset): + the dataset that needs to be converted to DataLayerNM + """ + @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
""" return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_type_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask": NeuralType(('B', 'T'), ChannelType()), @@ -95,7 +143,13 @@ def output_ports(self): } def __init__( - self, queries, tokenizer, max_seq_length, batch_size=1, dataset_type=BertTokenClassificationInferDataset, + self, + queries, + tokenizer, + max_seq_length, + batch_size=1, + shuffle=False, + dataset_type=BertTokenClassificationInferDataset, ): dataset_params = {'queries': queries, 'tokenizer': tokenizer, 'max_seq_length': max_seq_length} - super().__init__(dataset_type, dataset_params, batch_size, shuffle=False) + super().__init__(dataset_type, dataset_params, batch_size, shuffle=shuffle) diff --git a/nemo/collections/nlp/nm/losses/__init__.py b/nemo/collections/nlp/nm/losses/__init__.py index 11c24cdefa6b..ee7b74199e13 100644 --- a/nemo/collections/nlp/nm/losses/__init__.py +++ b/nemo/collections/nlp/nm/losses/__init__.py @@ -14,11 +14,6 @@ # limitations under the License. # ============================================================================= -from nemo.collections.nlp.nm.losses.aggregator_loss import * -from nemo.collections.nlp.nm.losses.joint_intent_slot_loss import * -from nemo.collections.nlp.nm.losses.masked_language_modeling_loss import * -from nemo.collections.nlp.nm.losses.padded_smoothed_cross_entropy_loss import * -from nemo.collections.nlp.nm.losses.qa_squad_loss import * +from nemo.collections.nlp.nm.losses.masked_xentropy_loss import * from nemo.collections.nlp.nm.losses.smoothed_cross_entropy_loss import * -from nemo.collections.nlp.nm.losses.state_tracking_trade_loss import * -from nemo.collections.nlp.nm.losses.token_classification_loss import * +from nemo.collections.nlp.nm.losses.spanning_loss import * diff --git a/nemo/collections/nlp/nm/losses/aggregator_loss.py b/nemo/collections/nlp/nm/losses/aggregator_loss.py deleted file mode 100644 index 3165e19af29b..000000000000 --- a/nemo/collections/nlp/nm/losses/aggregator_loss.py +++ /dev/null @@ -1,64 +0,0 @@ -# ============================================================================= -# Copyright 2020 NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= - -from nemo.backends.pytorch import LossNM -from nemo.core import LossType, NeuralType -from nemo.utils.decorators import add_port_docs - -__all__ = ['LossAggregatorNM'] - - -class LossAggregatorNM(LossNM): - """ - Neural module which combines sums several losses into one. 
- - Args: - num_inputs (int): number of input losses - """ - - @property - @add_port_docs() - def input_ports(self): - """Returns definitions of module input ports. - - """ - input_ports = {} - for i in range(self.num_losses): - input_ports["loss_" + str(i + 1)] = NeuralType() - - return input_ports - - @property - @add_port_docs() - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, num_inputs=2): - # Store number of inputs/losses. - self.num_losses = num_inputs - LossNM.__init__(self) - - def _loss_function(self, **kwargs): - values = [kwargs[x] for x in sorted(kwargs.keys())] - loss = values[0] - for loss_i in values[1:]: - loss = loss.add(loss_i) - return loss diff --git a/nemo/collections/nlp/nm/losses/joint_intent_slot_loss.py b/nemo/collections/nlp/nm/losses/joint_intent_slot_loss.py deleted file mode 100644 index be5b87936c75..000000000000 --- a/nemo/collections/nlp/nm/losses/joint_intent_slot_loss.py +++ /dev/null @@ -1,112 +0,0 @@ -# ============================================================================= -# Copyright 2020 NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= - -import torch -from torch import nn - -from nemo.backends.pytorch import LossNM -from nemo.core import ChannelType, LogitsType, LossType, NeuralType -from nemo.utils.decorators import add_port_docs - -__all__ = ['JointIntentSlotLoss'] - - -class JointIntentSlotLoss(LossNM): - """ - Loss function for the joint intent classification and slot - filling task. - - The loss is a joint loss of both tasks, aim to maximize: - p(y^i | x)P(y^s1, y^s2, ..., y^sn | x) - - with y^i being the predicted intent and y^s1, y^s2, ..., y^sn - are the predicted slots corresponding to x1, x2, ..., xn. - - Args: - hidden_states: output of the hidden layers - intents: ground truth intents, - slots: ground truth slots. - input_mask: to differentiate from original tokens and paddings - intent_loss_weight: the loss is the sum of: - intent_loss_weight * intent_loss + - (1 - intent_loss_weight) * slot_loss - - """ - - @property - @add_port_docs() - def input_ports(self): - """Returns definitions of module input ports. 
- - """ - return { - # "intent_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), - # "slot_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "intents": NeuralType({0: AxisType(BatchTag)}), - # "slots": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "intent_logits": NeuralType(('B', 'D'), LogitsType()), - "slot_logits": NeuralType(('B', 'T', 'D'), LogitsType()), - "loss_mask": NeuralType(('B', 'T'), ChannelType()), - "intents": NeuralType(tuple('B'), ChannelType()), - "slots": NeuralType(('B', 'T'), ChannelType()), - } - - @property - @add_port_docs() - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - # return {"loss": NeuralType(None)} - return {"loss": NeuralType(elements_type=LossType())} - - def __init__( - self, num_slots, slot_classes_loss_weights=None, intent_classes_loss_weights=None, intent_loss_weight=0.6, - ): - LossNM.__init__(self) - self.num_slots = num_slots - self.intent_loss_weight = intent_loss_weight - self.slot_classes_loss_weights = slot_classes_loss_weights - self.intent_classes_loss_weights = intent_classes_loss_weights - - # For weighted loss to tackle class imbalance - if slot_classes_loss_weights: - self.slot_classes_loss_weights = torch.FloatTensor(slot_classes_loss_weights).to(self._device) - - if intent_classes_loss_weights: - self.intent_classes_loss_weights = torch.FloatTensor(intent_classes_loss_weights).to(self._device) - - self._criterion_intent = nn.CrossEntropyLoss(weight=self.intent_classes_loss_weights) - self._criterion_slot = nn.CrossEntropyLoss(weight=self.slot_classes_loss_weights) - - def _loss_function(self, intent_logits, slot_logits, loss_mask, intents, slots): - intent_loss = self._criterion_intent(intent_logits, intents) - - active_loss = loss_mask.view(-1) > 0.5 - active_logits = slot_logits.view(-1, self.num_slots)[active_loss] - active_labels = slots.view(-1)[active_loss] - - # To support empty active_labels - if len(active_labels) == 0: - slot_loss = 0.0 - else: - slot_loss = self._criterion_slot(active_logits, active_labels) - loss = intent_loss * self.intent_loss_weight + slot_loss * (1 - self.intent_loss_weight) - - return loss diff --git a/nemo/collections/nlp/nm/losses/masked_language_modeling_loss.py b/nemo/collections/nlp/nm/losses/masked_language_modeling_loss.py deleted file mode 100644 index b29667b1aee0..000000000000 --- a/nemo/collections/nlp/nm/losses/masked_language_modeling_loss.py +++ /dev/null @@ -1,63 +0,0 @@ -# ============================================================================= -# Copyright 2020 NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
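# Illustrative sketch of the weighted intent/slot combination computed by the
# deleted JointIntentSlotLoss above: padded slot positions are dropped via the
# loss mask, and the two cross entropies are mixed with intent_loss_weight.
# Plain PyTorch with made-up shapes, not NeMo code.
import torch
from torch import nn

batch, seq_len, num_intents, num_slots, intent_loss_weight = 4, 6, 3, 5, 0.6
intent_logits = torch.randn(batch, num_intents)
slot_logits = torch.randn(batch, seq_len, num_slots)
intents = torch.randint(0, num_intents, (batch,))
slots = torch.randint(0, num_slots, (batch, seq_len))
loss_mask = torch.ones(batch, seq_len)            # 1 for real tokens, 0 for padding

intent_loss = nn.CrossEntropyLoss()(intent_logits, intents)

active = loss_mask.view(-1) > 0.5                 # keep only unpadded slot positions
active_logits = slot_logits.view(-1, num_slots)[active]
active_labels = slots.view(-1)[active]
slot_loss = nn.CrossEntropyLoss()(active_logits, active_labels) if len(active_labels) else 0.0

loss = intent_loss_weight * intent_loss + (1 - intent_loss_weight) * slot_loss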
-# ============================================================================= - -from nemo.backends.pytorch import LossNM -from nemo.collections.nlp.nm.losses.smoothed_cross_entropy_loss import SmoothedCrossEntropyLoss -from nemo.core import ChannelType, LogitsType, LossType, NeuralType -from nemo.utils.decorators import add_port_docs - -__all__ = ['MaskedLanguageModelingLossNM'] - - -class MaskedLanguageModelingLossNM(LossNM): - """ - Neural module which implements Masked Language Modeling (MLM) loss. - - Args: - label_smoothing (float): label smoothing regularization coefficient - """ - - @property - @add_port_docs() - def input_ports(self): - """Returns definitions of module input ports. - """ - return { - # "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "logits": NeuralType(('B', 'T', 'D'), LogitsType()), - "output_ids": NeuralType(('B', 'T'), ChannelType()), - "output_mask": NeuralType(('B', 'T'), ChannelType()), - } - - @property - @add_port_docs() - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, label_smoothing=0.0): - LossNM.__init__(self) - self._criterion = SmoothedCrossEntropyLoss(label_smoothing) - - def _loss_function(self, logits, output_ids, output_mask): - loss = self._criterion(logits, output_ids, output_mask) - return loss diff --git a/nemo/collections/nlp/nm/losses/state_tracking_trade_loss.py b/nemo/collections/nlp/nm/losses/masked_xentropy_loss.py similarity index 56% rename from nemo/collections/nlp/nm/losses/state_tracking_trade_loss.py rename to nemo/collections/nlp/nm/losses/masked_xentropy_loss.py index 7623c8cddc32..ef2240aad303 100644 --- a/nemo/collections/nlp/nm/losses/state_tracking_trade_loss.py +++ b/nemo/collections/nlp/nm/losses/masked_xentropy_loss.py @@ -42,16 +42,16 @@ from nemo.core.neural_types import LabelsType, LengthsType, LogitsType, LossType, NeuralType from nemo.utils.decorators import add_port_docs -__all__ = ['TRADEMaskedCrossEntropy', 'CrossEntropyLoss3D'] +__all__ = ['MaskedXEntropyLoss'] -class TRADEMaskedCrossEntropy(LossNM): +class MaskedXEntropyLoss(LossNM): """ - Neural module which implements a cross entropy for trade model with masking feature. + Neural module which implements a cross entropy model with masking feature. 
It keeps just the target logit for cross entropy calculation Args: logits (float): output of the classifier - targets (long): ground truth targets + labels (long): ground truth targets loss_mask (long): specifies the ones to get ignored in loss calculation @@ -64,20 +64,15 @@ def input_ports(self): logits: 4d tensor of logits - targets: 3d tensor of labels + labels: 3d tensor of labels loss_mask: specifies the words to be considered in the loss calculation """ return { - # "logits": NeuralType( - # {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag), 3: AxisType(ChannelTag)} - # ), - # "targets": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(TimeTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), "logits": NeuralType(('B', 'T', 'D', 'D'), LogitsType()), - "targets": NeuralType(('B', 'D', 'T'), LabelsType()), - "loss_mask": NeuralType(('B', 'D'), LengthsType()), + "labels": NeuralType(('B', 'D', 'T'), LabelsType()), + "length_mask": NeuralType(('B', 'D'), LengthsType()), } @property @@ -91,65 +86,21 @@ def output_ports(self): def __init__(self): LossNM.__init__(self) - def _loss_function(self, logits, targets, loss_mask): + def _loss_function(self, logits, labels, length_mask, eps=1e-10): logits_flat = logits.view(-1, logits.size(-1)) - eps = 1e-10 log_probs_flat = torch.log(torch.clamp(logits_flat, min=eps)) - target_flat = targets.view(-1, 1) - losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat) - losses = losses_flat.view(*targets.size()) - loss = self.masking(losses, loss_mask) + labels_flat = labels.view(-1, 1) + losses_flat = -torch.gather(log_probs_flat, dim=1, index=labels_flat) + losses = losses_flat.view(*labels.size()) + loss = self.masking(losses, length_mask) return loss @staticmethod - def masking(losses, mask): + def masking(losses, length_mask): max_len = losses.size(2) - mask_ = torch.arange(max_len, device=mask.device)[None, None, :] < mask[:, :, None] + mask_ = torch.arange(max_len, device=length_mask.device)[None, None, :] < length_mask[:, :, None] mask_ = mask_.float() losses = losses * mask_ loss = losses.sum() / mask_.sum() return loss - - -class CrossEntropyLoss3D(LossNM): - """ - Neural module which implements a cross entropy loss for 3d logits. - Args: - num_classes (int): number of classes in a classifier, e.g. size - of the vocabulary in language modeling objective - logits (float): output of the classifier - labels (long): ground truth labels - """ - - @property - @add_port_docs() - def input_ports(self): - """Returns definitions of module input ports. - """ - return { - # "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(ChannelTag)}), - # "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), - "logits": NeuralType(('B', 'D', 'D'), LogitsType()), - "labels": NeuralType(('B', 'D'), LabelsType()), - } - - @property - @add_port_docs() - def output_ports(self): - """Returns definitions of module output ports. 
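# Illustrative sketch of the length-based masking used by MaskedXEntropyLoss above:
# per-token losses beyond each target length are zeroed out before averaging.
# Plain PyTorch with made-up shapes.
import torch

losses = torch.rand(2, 3, 5)                    # B x slots x T per-token losses
lengths = torch.tensor([[5, 2, 0], [3, 3, 1]])  # number of valid tokens per (batch, slot)

mask = (torch.arange(5)[None, None, :] < lengths[:, :, None]).float()
masked_loss = (losses * mask).sum() / mask.sum()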
- """ - # return {"loss": NeuralType(None)} - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, num_classes, **kwargs): - LossNM.__init__(self, **kwargs) - self._criterion = torch.nn.CrossEntropyLoss() - self.num_classes = num_classes - - def _loss_function(self, logits, labels): - logits_flatten = logits.view(-1, self.num_classes) - labels_flatten = labels.view(-1) - - loss = self._criterion(logits_flatten, labels_flatten) - return loss diff --git a/nemo/collections/nlp/nm/losses/padded_smoothed_cross_entropy_loss.py b/nemo/collections/nlp/nm/losses/padded_smoothed_cross_entropy_loss.py deleted file mode 100644 index dfae9e852987..000000000000 --- a/nemo/collections/nlp/nm/losses/padded_smoothed_cross_entropy_loss.py +++ /dev/null @@ -1,68 +0,0 @@ -# ============================================================================= -# Copyright 2020 NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= - -from nemo.backends.pytorch import LossNM -from nemo.collections.nlp.nm.losses.smoothed_cross_entropy_loss import SmoothedCrossEntropyLoss -from nemo.collections.nlp.utils.common_nlp_utils import mask_padded_tokens -from nemo.core import LabelsType, LogitsType, LossType, NeuralType -from nemo.utils.decorators import add_port_docs - -__all__ = ['PaddedSmoothedCrossEntropyLossNM'] - - -class PaddedSmoothedCrossEntropyLossNM(LossNM): - """ - Neural module which calculates CrossEntropyLoss and - 1) excludes padding tokens from loss calculation - 2) allows to use label smoothing regularization - 3) allows to calculate loss for the desired number of last tokens - - Args: - label_smoothing (float): label smoothing regularization coefficient - predict_last_k (int): how many last tokens to use for the loss - calculation, important for fast evaluation of LM perplexity - """ - - @property - @add_port_docs() - def input_ports(self): - """Returns definitions of module input ports. - """ - return { - # "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # "target_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "logits": NeuralType(('B', 'T', 'D'), LogitsType()), - "target_ids": NeuralType(('B', 'T'), LabelsType()), - } - - @property - @add_port_docs() - def output_ports(self): - """Returns definitions of module output ports. 
- """ - # return {"loss": NeuralType(None)} - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, pad_id, label_smoothing=0, predict_last_k=0): - LossNM.__init__(self) - - self._loss_fn = SmoothedCrossEntropyLoss(label_smoothing, predict_last_k) - self._pad_id = pad_id - - def _loss_function(self, logits, target_ids): - target_mask = mask_padded_tokens(target_ids, self._pad_id).to(logits.dtype) - loss = self._loss_fn(logits, target_ids, target_mask) - return loss diff --git a/nemo/collections/nlp/nm/losses/smoothed_cross_entropy_loss.py b/nemo/collections/nlp/nm/losses/smoothed_cross_entropy_loss.py index cecedece75de..b33a4c4b7611 100644 --- a/nemo/collections/nlp/nm/losses/smoothed_cross_entropy_loss.py +++ b/nemo/collections/nlp/nm/losses/smoothed_cross_entropy_loss.py @@ -16,10 +16,65 @@ import torch +from nemo.backends.pytorch import LossNM +from nemo.collections.nlp.utils.data_utils import mask_padded_tokens +from nemo.core import LabelsType, LogitsType, LossType, MaskType, NeuralType + __all__ = ['SmoothedCrossEntropyLoss'] -class SmoothedCrossEntropyLoss(torch.nn.Module): +class SmoothedCrossEntropyLoss(LossNM): + """ + Neural module which calculates CrossEntropyLoss and + 1) excludes padding tokens from loss calculation + 2) allows to use label smoothing regularization + 3) allows to calculate loss for the desired number of last tokens + + Args: + label_smoothing (float): label smoothing regularization coefficient + predict_last_k (int): how many last tokens to use for the loss + calculation, important for fast evaluation of LM perplexity + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + """ + return { + "logits": NeuralType(('B', 'T', 'D'), LogitsType()), + "labels": NeuralType(('B', 'T'), LabelsType()), + "output_mask": NeuralType(('B', 'T'), MaskType(), optional=True), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. + """ + # return {"loss": NeuralType(None)} + return {"loss": NeuralType(elements_type=LossType())} + + def __init__(self, pad_id=None, label_smoothing=0, predict_last_k=0): + LossNM.__init__(self) + + self._loss_fn = SmoothedCrossEntropy(label_smoothing, predict_last_k) + self._pad_id = pad_id + + def _loss_function(self, logits, labels, output_mask=None): + if output_mask is not None: + labels_mask = output_mask + elif self._pad_id is not None: + labels_mask = mask_padded_tokens(labels, self._pad_id).to(logits.dtype) + else: + raise ValueError("Both output_mask and pad_id are None") + + if labels_mask.dtype is not logits.dtype: + labels_mask = labels_mask.to(logits.dtype) + + loss = self._loss_fn(logits, labels, labels_mask) + return loss + + +class SmoothedCrossEntropy(torch.nn.Module): """ Cross-entropy loss with label smoothing for a batch of sequences. 
@@ -41,16 +96,17 @@ def __init__(self, label_smoothing=0.0, predict_last_k=0): self._smoothing = label_smoothing self._predict_last_k = predict_last_k - def forward(self, logits, output_ids, output_mask, eps=1e-6): + def forward(self, logits, labels, output_mask, eps=1e-6): """ Args: - logits: float tensor of shape batch_size x seq_len x vocab_size - output_ids: int tensor of shape batch_size x seq_len + logits: float tensor of shape batch_size x seq_len x vocab_size, values should be log probabilities + labels: int tensor of shape batch_size x seq_len output_mask: binary tensor of shape batch_size x seq_len + eps: epsilon param to avoid divide by zero in loss calculation """ batch_size, seq_len, vocab_size = logits.size() smoothing = vocab_size * self._smoothing / (vocab_size - 1) - target_logits = logits.gather(2, output_ids.unsqueeze(2)).squeeze(2) + target_logits = logits.gather(2, labels.unsqueeze(2)).squeeze(2) smoothing_logits = logits.mean(dim=-1) neg_log_likelihood = (1.0 - smoothing) * target_logits + smoothing * smoothing_logits neg_log_likelihood = neg_log_likelihood[:, -self._predict_last_k :] diff --git a/nemo/collections/nlp/nm/losses/qa_squad_loss.py b/nemo/collections/nlp/nm/losses/spanning_loss.py similarity index 86% rename from nemo/collections/nlp/nm/losses/qa_squad_loss.py rename to nemo/collections/nlp/nm/losses/spanning_loss.py index 289f98ce989e..d0193725887f 100644 --- a/nemo/collections/nlp/nm/losses/qa_squad_loss.py +++ b/nemo/collections/nlp/nm/losses/spanning_loss.py @@ -20,10 +20,10 @@ from nemo.core import ChannelType, LogitsType, LossType, NeuralType from nemo.utils.decorators import add_port_docs -__all__ = ['QuestionAnsweringLoss'] +__all__ = ['SpanningLoss'] -class QuestionAnsweringLoss(LossNM): +class SpanningLoss(LossNM): """ Neural module which implements QuestionAnswering loss. Args: @@ -42,9 +42,6 @@ def input_ports(self): """Returns definitions of module input ports. """ return { - # "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # "start_positions": NeuralType({0: AxisType(BatchTag)}), - # "end_positions": NeuralType({0: AxisType(BatchTag)}), "logits": NeuralType(('B', 'T', 'D'), LogitsType()), "start_positions": NeuralType(tuple('B'), ChannelType()), "end_positions": NeuralType(tuple('B'), ChannelType()), @@ -69,9 +66,6 @@ def output_ports(self): 1: AxisType(TimeTag) """ return { - # "loss": NeuralType(None), - # "start_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "end_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "loss": NeuralType(elements_type=LossType()), "start_logits": NeuralType(('B', 'T'), ChannelType()), "end_logits": NeuralType(('B', 'T'), ChannelType()), diff --git a/nemo/collections/nlp/nm/losses/token_classification_loss.py b/nemo/collections/nlp/nm/losses/token_classification_loss.py deleted file mode 100644 index ec7dad68c499..000000000000 --- a/nemo/collections/nlp/nm/losses/token_classification_loss.py +++ /dev/null @@ -1,77 +0,0 @@ -# ============================================================================= -# Copyright 2020 NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
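# Sketch of the per-token smoothed likelihood computed in SmoothedCrossEntropy.forward()
# above: the log-probability of the target token is mixed with the mean log-probability
# over the vocabulary, with the coefficient rescaled by vocab_size / (vocab_size - 1).
# Shapes and the smoothing coefficient are illustrative.
import torch

log_probs = torch.log_softmax(torch.rand(2, 4, 10), dim=-1)   # (batch, seq_len, vocab) log probabilities
labels = torch.randint(0, 10, (2, 4))
label_smoothing = 0.1

vocab_size = log_probs.size(-1)
smoothing = vocab_size * label_smoothing / (vocab_size - 1)

target_log_probs = log_probs.gather(2, labels.unsqueeze(2)).squeeze(2)   # pick the target token
mean_log_probs = log_probs.mean(dim=-1)                                  # uniform "smoothing" term
smoothed_ll = (1.0 - smoothing) * target_log_probs + smoothing * mean_log_probs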
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= - -import torch -from torch import nn - -from nemo.backends.pytorch import LossNM -from nemo.core import ChannelType, LabelsType, LogitsType, LossType, NeuralType -from nemo.utils.decorators import add_port_docs - -__all__ = ['TokenClassificationLoss'] - - -class TokenClassificationLoss(LossNM): - """ - Neural module which implements Token Classification loss. - - Args: - num_classes (int): number of classes in a classifier, e.g. size - of the vocabulary in language modeling objective - logits (float): output of the classifier - labels (long): ground truth labels - loss_mask (long): to differentiate from original tokens and paddings - """ - - @property - @add_port_docs() - def input_ports(self): - """Returns definitions of module input ports. - """ - return { - # "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "logits": NeuralType(('B', 'T', 'D'), LogitsType()), - "labels": NeuralType(('B', 'T'), LabelsType()), - "loss_mask": NeuralType(('B', 'T'), ChannelType()), - } - - @property - @add_port_docs() - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, num_classes, class_weights=None): - LossNM.__init__(self) - if class_weights: - class_weights = torch.FloatTensor(class_weights).to(self._device) - - self._criterion = nn.CrossEntropyLoss(weight=class_weights) - self.num_classes = num_classes - - def _loss_function(self, logits, labels, loss_mask): - active_loss = loss_mask.view(-1) > 0.5 - active_logits = logits.view(-1, self.num_classes)[active_loss] - active_labels = labels.view(-1)[active_loss] - - loss = self._criterion(active_logits, active_labels) - return loss diff --git a/nemo/collections/nlp/nm/trainables/common/huggingface/albert_nm.py b/nemo/collections/nlp/nm/trainables/common/huggingface/albert_nm.py index 5279d60efb47..0be2d9f73583 100644 --- a/nemo/collections/nlp/nm/trainables/common/huggingface/albert_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/huggingface/albert_nm.py @@ -14,6 +14,24 @@ # limitations under the License. # ============================================================================= +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
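# Sketch of the padding-filter technique used by the TokenClassificationLoss module
# deleted above: tokens whose loss_mask is 0 are dropped from the flattened logits and
# labels before nn.CrossEntropyLoss is applied. Shapes are toy values.
import torch
from torch import nn

num_classes = 3
logits = torch.rand(2, 4, num_classes)                   # (batch, seq_len, num_classes)
labels = torch.randint(0, num_classes, (2, 4))
loss_mask = torch.tensor([[1, 1, 0, 0], [1, 1, 1, 0]])   # 1 = real token, 0 = padding

active = loss_mask.view(-1) > 0.5
loss = nn.CrossEntropyLoss()(logits.view(-1, num_classes)[active], labels.view(-1)[active])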
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + from typing import List, Optional from transformers import ( @@ -56,12 +74,10 @@ class Albert(TrainableNM): @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + input_ids: input token ids + token_type_ids: segment type ids + attention_mask: attention mask """ - # return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "token_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "attention_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # } return { "input_ids": NeuralType(('B', 'T'), ChannelType()), "token_type_ids": NeuralType(('B', 'T'), ChannelType()), @@ -71,9 +87,9 @@ def input_ports(self): @property @add_port_docs() def output_ports(self): - """Returns definitions of module output ports. + """Returns definitions of module input ports. + hidden_states: output embedding """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} def __init__( @@ -134,19 +150,12 @@ def __init__( self.add_module("albert", model) self.config = model.config - - # TK: storing config name in init_params instead. - # for key, value in self.config.to_dict().items(): - # self._local_parameters[key] = value - - # Store the only value that will be used externally - hidden_size. self._hidden_size = model.config.hidden_size @property def hidden_size(self): """ Property returning hidden size. - Returns: Hidden size. """ diff --git a/nemo/collections/nlp/nm/trainables/common/huggingface/bert_nm.py b/nemo/collections/nlp/nm/trainables/common/huggingface/bert_nm.py index a4ac1f9d1c66..dd6a554845ee 100644 --- a/nemo/collections/nlp/nm/trainables/common/huggingface/bert_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/huggingface/bert_nm.py @@ -14,6 +14,24 @@ # limitations under the License. # ============================================================================= +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + from typing import List, Optional from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertConfig, BertModel @@ -51,11 +69,11 @@ class BERT(TrainableNM): @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + input_ids: input token ids + token_type_ids: segment type ids + attention_mask: attention mask """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "token_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "attention_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "token_type_ids": NeuralType(('B', 'T'), ChannelType()), "attention_mask": NeuralType(('B', 'T'), ChannelType()), @@ -65,8 +83,8 @@ def input_ports(self): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + hidden_states: output embedding """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} def __init__( @@ -127,12 +145,6 @@ def __init__( self.add_module("bert", model) self.config = model.config - - # TK: storing config name in init_params instead. - # for key, value in self.config.to_dict().items(): - # self._local_parameters[key] = value - - # Store the only value that will be used externally - hidden_size. self._hidden_size = model.config.hidden_size @property diff --git a/nemo/collections/nlp/nm/trainables/common/huggingface/roberta_nm.py b/nemo/collections/nlp/nm/trainables/common/huggingface/roberta_nm.py index 650d637bb74e..fa960856a5ce 100644 --- a/nemo/collections/nlp/nm/trainables/common/huggingface/roberta_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/huggingface/roberta_nm.py @@ -14,6 +14,24 @@ # limitations under the License. # ============================================================================= +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + from typing import List, Optional from transformers import ( @@ -56,12 +74,10 @@ class Roberta(TrainableNM): @add_port_docs() def input_ports(self): """Returns definitions of module input ports. 
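# Minimal sketch of the axis-tuple port style used by the HuggingFace wrapper modules
# above: each port pairs an axis layout such as ('B', 'T') with an element type.
# This assumes the nemo package is importable; it is not a complete neural module.
from nemo.core import ChannelType, NeuralType

input_ports = {
    "input_ids": NeuralType(('B', 'T'), ChannelType()),          # batch x time token ids
    "token_type_ids": NeuralType(('B', 'T'), ChannelType()),
    "attention_mask": NeuralType(('B', 'T'), ChannelType()),
}
output_ports = {
    "hidden_states": NeuralType(('B', 'T', 'D'), ChannelType()),  # batch x time x hidden
}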
+ input_ids: input token ids + token_type_ids: segment type ids + attention_mask: attention mask """ - # return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "token_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "attention_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # } return { "input_ids": NeuralType(('B', 'T'), ChannelType()), "token_type_ids": NeuralType(('B', 'T'), ChannelType()), @@ -72,8 +88,8 @@ def input_ports(self): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + hidden_states: output embedding """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} def __init__( @@ -134,12 +150,6 @@ def __init__( self.add_module("roberta", model) self.config = model.config - - # TK: storing config name in init_params instead. - # for key, value in self.config.to_dict().items(): - # self._local_parameters[key] = value - - # Store the only value that will be used externally - hidden_size. self._hidden_size = model.config.hidden_size @property diff --git a/nemo/collections/nlp/nm/trainables/common/sequence_classification_nm.py b/nemo/collections/nlp/nm/trainables/common/sequence_classification_nm.py index 5f938b64d4c2..b233242536dc 100644 --- a/nemo/collections/nlp/nm/trainables/common/sequence_classification_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/sequence_classification_nm.py @@ -17,7 +17,7 @@ from torch import nn as nn from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init +from nemo.collections.nlp.utils.transformer_utils import transformer_weights_init from nemo.core import ChannelType, LogitsType, NeuralType from nemo.utils.decorators import add_port_docs @@ -37,12 +37,15 @@ class SequenceClassifier(TrainableNM): activation (str): activation function applied in classifier MLP layers log_softmax (bool): whether to apply log_softmax to MLP output dropout (float): dropout ratio applied to MLP + use_transformer_pretrained (bool): + TODO """ @property @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + hidden_states: embedding hidden states """ return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} @@ -50,8 +53,8 @@ def input_ports(self): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
+ logits: logits before loss """ - # return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})} return {"logits": NeuralType(('B', 'D'), LogitsType())} def __init__( diff --git a/nemo/collections/nlp/nm/trainables/common/sequence_regression_nm.py b/nemo/collections/nlp/nm/trainables/common/sequence_regression_nm.py index 8f0db64dd48a..ec681b86d3fa 100644 --- a/nemo/collections/nlp/nm/trainables/common/sequence_regression_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/sequence_regression_nm.py @@ -17,7 +17,7 @@ from torch import nn as nn from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init +from nemo.collections.nlp.utils.transformer_utils import transformer_weights_init from nemo.core import ChannelType, NeuralType, RegressionValuesType from nemo.utils.decorators import add_port_docs @@ -35,22 +35,24 @@ class SequenceRegression(TrainableNM): num_layers (int): number of layers in classifier MLP activation (str): activation function applied in classifier MLP layers dropout (float): dropout ratio applied to MLP + use_transformer_pretrained (bool): + TODO """ @property @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + hidden_states: embedding hidden states """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + preds: predictions before loss """ - # return {"preds": NeuralType({0: AxisType(RegressionTag)})} return {"preds": NeuralType(tuple('B'), RegressionValuesType())} def __init__(self, hidden_size, num_layers=2, activation='relu', dropout=0.0, use_transformer_pretrained=True): diff --git a/nemo/collections/nlp/nm/trainables/common/token_classification_nm.py b/nemo/collections/nlp/nm/trainables/common/token_classification_nm.py index 2eefe80ec3c6..93aee4e9bf8d 100644 --- a/nemo/collections/nlp/nm/trainables/common/token_classification_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/token_classification_nm.py @@ -17,7 +17,8 @@ from torch import nn as nn from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import gelu, transformer_weights_init +from nemo.collections.nlp.utils.functional_utils import gelu +from nemo.collections.nlp.utils.transformer_utils import transformer_weights_init from nemo.core import ChannelType, LogitsType, NeuralType from nemo.utils.decorators import add_port_docs @@ -38,22 +39,24 @@ class BertTokenClassifier(TrainableNM): activation (str): activation function applied in classifier MLP layers log_softmax (bool): whether to apply log_softmax to MLP output dropout (float): dropout ratio applied to MLP + use_transformer_pretrained (bool): + TODO """ @property @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + hidden_states: embedding hidden states """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
+ logits: logits before loss """ - # return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"logits": NeuralType(('B', 'T', 'C'), LogitsType())} def __init__( @@ -108,7 +111,6 @@ class TokenClassifier(TrainableNM): def input_ports(self): """Returns definitions of module input ports. """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'C'), ChannelType())} @property @@ -116,7 +118,6 @@ def input_ports(self): def output_ports(self): """Returns definitions of module output ports. """ - # return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"logits": NeuralType(('B', 'T', 'D'), LogitsType())} def __init__( diff --git a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_decoders.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_decoders.py index 1f3cbf0e4f44..fef524343f1c 100644 --- a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_decoders.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_decoders.py @@ -1,3 +1,19 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import copy import torch @@ -7,7 +23,7 @@ MultiHeadAttention, PositionWiseFF, ) -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import form_attention_mask +from nemo.collections.nlp.utils.transformer_utils import form_attention_mask __all__ = [] diff --git a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_encoders.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_encoders.py index 24c6afce55ad..049a94755b8f 100644 --- a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_encoders.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_encoders.py @@ -1,3 +1,19 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + import copy import torch @@ -8,7 +24,7 @@ PositionWiseFF, TwoStreamSelfAttention, ) -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import form_attention_mask +from nemo.collections.nlp.utils.transformer_utils import form_attention_mask __all__ = [] diff --git a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_generators.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_generators.py index d878ccd17655..d6e25d480832 100644 --- a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_generators.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_generators.py @@ -1,13 +1,48 @@ -__all__ = [] +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= import torch import torch.nn as nn -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import NEG_INF -from nemo.collections.nlp.utils.common_nlp_utils import mask_padded_tokens +from nemo.collections.nlp.utils.data_utils import mask_padded_tokens +from nemo.collections.nlp.utils.transformer_utils import NEG_INF + +__all__ = [] class GreedySequenceGenerator(nn.Module): + """ + Greedy sequence generator based on the decoder followed by log_softmax. + + Args: + embedding: nn.Module, transforms input_ids into vector embeddings + decoder: nn.Module, takes embeddings and produces hidden_states + log_softmax: nn.Module, takes hidden_states and produces log_probs + which correspond to probability distribution of tokens (ids) + pad: index of padding token in the vocabulary + bos: index of beginning of sequence token in the vocabulary + eos: index of end of sequence token in the vocabulary + max_sequence_length: maximum allowed length for generated sequences + max_delta_length: in case of encoder-decoder generation (e.g. NMT), + forbids generated sequences to be longer than the length of + source sequences plus max_delta_length + batch_size: size of the batch of generated sequences if neither + source nor target starting sequences are provided + """ + def __init__( self, embedding, @@ -20,25 +55,6 @@ def __init__( max_delta_length=20, batch_size=1, ): - """ - Greedy sequence generator based on the decoder followed by log_softmax. 
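# Generic greedy-decoding sketch to illustrate what GreedySequenceGenerator above does
# conceptually: at each step the most probable next token is appended until <eos> or the
# length limit. decoder_step is a stand-in callable, not the module's actual API.
import torch

def greedy_decode(decoder_step, bos, eos, max_sequence_length, batch_size=1):
    # decoder_step(ids) -> (batch, vocab) log-probs for the next token
    ids = torch.full((batch_size, 1), bos, dtype=torch.long)
    for _ in range(max_sequence_length - 1):
        next_ids = decoder_step(ids).argmax(dim=-1, keepdim=True)   # greedy pick
        ids = torch.cat([ids, next_ids], dim=-1)
        if (next_ids == eos).all():                                 # every sequence finished
            break
    return ids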
- - Args: - embedding: nn.Module, transforms input_ids into vector embeddings - decoder: nn.Module, takes embeddings and produces hidden_states - log_softmax: nn.Module, takes hidden_states and produces log_probs - which correspond to probability distribution of tokens (ids) - pad: index of padding token in the vocabulary - bos: index of beginning of sequence token in the vocabulary - eos: index of end of sequence token in the vocabulary - max_sequence_length: maximum allowed length for generated sequences - max_delta_length: in case of encoder-decoder generation (e.g. NMT), - forbids generated sequences to be longer than the length of - source sequences plus max_delta_length - batch_size: size of the batch of generated sequences if neither - source nor target starting sequences are provided - """ - super().__init__() self.embedding = embedding self.decoder = decoder @@ -148,20 +164,20 @@ def forward(self, decoder_input_ids=None, encoder_hidden_states=None, encoder_in class TopKSequenceGenerator(GreedySequenceGenerator): - def __init__(self, embedding, decoder, log_softmax, beam_size=1, temperature=1.0, **kwargs): - """ - Top-k sequence generator based on the decoder followed by log_softmax. - - Args: - *all args of GreedySequenceGenerator class - beam_size: size of the beam (parameter k in top-k) - temperature: temperature of top-k sampling, all logits are divided - by temperature before rescaling. High temperature leads to - uniform distribution, low leads to delta-like distribution. - Kwargs: - all remaining parameters of GreedySequenceGenerator class - """ + """ + Top-k sequence generator based on the decoder followed by log_softmax. + + Args: + *all args of GreedySequenceGenerator class + beam_size: size of the beam (parameter k in top-k) + temperature: temperature of top-k sampling, all logits are divided + by temperature before rescaling. High temperature leads to + uniform distribution, low leads to delta-like distribution. + Kwargs: + all remaining parameters of GreedySequenceGenerator class + """ + def __init__(self, embedding, decoder, log_softmax, beam_size=1, temperature=1.0, **kwargs): super().__init__(embedding, decoder, log_softmax, **kwargs) self.beam_size = beam_size self.temp = temperature diff --git a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_modules.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_modules.py index a09a5fa99be8..dd19b221c5df 100644 --- a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_modules.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_modules.py @@ -1,27 +1,18 @@ -# coding=utf-8 -""" -Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. -Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -Various parts of Transformer architecture implemented as Pytorch nn.Modules. 
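# Sketch of temperature-scaled top-k sampling, the idea behind TopKSequenceGenerator
# above: keep the k most likely tokens, divide their scores by the temperature, and
# sample from the renormalized distribution. This is a standalone illustration, not the
# module's forward pass.
import torch

def sample_top_k(logits, beam_size=1, temperature=1.0):
    # logits: (batch, vocab) raw scores for the next token
    topk_vals, topk_ids = torch.topk(logits, beam_size, dim=-1)
    probs = torch.softmax(topk_vals / temperature, dim=-1)    # high temp -> flatter distribution
    choice = torch.multinomial(probs, num_samples=1)          # index into the top-k set
    return topk_ids.gather(-1, choice)                        # map back to vocabulary ids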
-Some parts of this code were adapted from the HuggingFace library at -https://github.com/huggingface/pytorch-pretrained-BERT -Some parts of this code were adapted from the Annotated Transformer at -http://nlp.seas.harvard.edu/2018/04/03/attention.html -Copyright by the HuggingFace and Annotated Transformer authors. -""" +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= import math @@ -29,7 +20,7 @@ from torch import nn from nemo import logging -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import gelu +from nemo.collections.nlp.utils.functional_utils import gelu __all__ = [] diff --git a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_nm.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_nm.py index a57d20941f96..73e52e260892 100644 --- a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_nm.py @@ -1,7 +1,19 @@ -# Copyright (c) 2019 NVIDIA Corporation -""" -This package contains Transformer for translation Neural Module -""" +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import math @@ -13,7 +25,7 @@ GreedySequenceGenerator, ) from nemo.collections.nlp.nm.trainables.common.transformer.transformer_modules import TransformerEmbedding -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init +from nemo.collections.nlp.utils.transformer_utils import transformer_weights_init from nemo.core.neural_types import ChannelType, NeuralType from nemo.utils.decorators import add_port_docs @@ -49,10 +61,10 @@ class TransformerEncoderNM(TrainableNM): @add_port_docs() def input_ports(self): """Returns definitions of module input ports. 
+ input_ids: ids of input tokens + input_mask_src: input mask """ return { - # "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids": NeuralType(('B', 'T'), ChannelType()), "input_mask_src": NeuralType(('B', 'T'), ChannelType()), } @@ -61,9 +73,8 @@ def input_ports(self): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. - + hidden_states: outputs hidden states """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} def __init__( @@ -140,12 +151,12 @@ class TransformerDecoderNM(TrainableNM): @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + input_ids_tgt: ids of target sequence + hidden_states_src: input hidden states + input_mask_src: input token mask + input_mask_tgt: target token mask """ return { - # "input_ids_tgt": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "hidden_states_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # "input_mask_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # "input_mask_tgt": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_ids_tgt": NeuralType(('B', 'T'), ChannelType()), "hidden_states_src": NeuralType(('B', 'T', 'D'), ChannelType()), "input_mask_src": NeuralType(('B', 'T'), ChannelType()), @@ -156,8 +167,8 @@ def input_ports(self): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + hidden_states: output hidden states """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} def __init__( @@ -224,16 +235,16 @@ class GreedyLanguageGeneratorNM(TrainableNM): @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + input_ids: input ids """ - # return {"input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)})} return {"input_ids": NeuralType(('B', 'T'), ChannelType())} @property @add_port_docs() def output_ports(self): """Returns definitions of module output ports. + output ids: output ids """ - # return {"output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)})} return {"output_ids": NeuralType(('B', 'T'), ChannelType())} def __init__(self, decoder, log_softmax, max_seq_length, pad_token, bos_token, eos_token, batch_size=1): @@ -282,10 +293,10 @@ class BeamSearchTranslatorNM(TrainableNM): @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + hidden_states_src: input hidden states + input_mask_src: input mask """ return { - # "hidden_states_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # "input_mask_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "hidden_states_src": NeuralType(('B', 'T', 'C'), ChannelType()), "input_mask_src": NeuralType(('B', 'T'), ChannelType()), } @@ -294,8 +305,8 @@ def input_ports(self): @add_port_docs() def output_ports(self): """Returns definitions of module output ports. 
+ output_ids: output ids """ - # return {"output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)})} return {"output_ids": NeuralType(('B', 'T'), ChannelType())} @property diff --git a/nemo/collections/nlp/nm/trainables/dialogue_state_tracking/state_tracking_trade_nm.py b/nemo/collections/nlp/nm/trainables/dialogue_state_tracking/state_tracking_trade_nm.py index a576e4be34be..e6d634ecc26a 100644 --- a/nemo/collections/nlp/nm/trainables/dialogue_state_tracking/state_tracking_trade_nm.py +++ b/nemo/collections/nlp/nm/trainables/dialogue_state_tracking/state_tracking_trade_nm.py @@ -52,33 +52,28 @@ class TRADEGenerator(TrainableNM): + """ + The generator module for state tracking model TRADE + Args: + vocab (Vocab): an instance of Vocab containing the vocabularey + embeddings (Tensor): word embedding matrix + hid_size (int): hidden size of the GRU decoder + dropout (float): dropout of the GRU + slots (list): list of slots + nb_gate (int): number of gates + teacher_forcing (float): 0.5 + """ + @property @add_port_docs() def input_ports(self): """Returns definitions of module input ports. - - encoder_hidden: hidden states of the encoder - - encoder_outputs: outputs of the encoder - - input_lens: lengths of the input sequences to encoder - - src_ids: input sequences to encoder - - targets: targets for the output of the generator - """ return { - # 'encoder_hidden': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # 'encoder_outputs': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), - # 'input_lens': NeuralType({0: AxisType(BatchTag)}), - # 'src_ids': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - # 'targets': NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(TimeTag)}), 'encoder_hidden': NeuralType(('B', 'T', 'C'), ChannelType()), 'encoder_outputs': NeuralType(('B', 'T', 'C'), ChannelType()), 'input_lens': NeuralType(tuple('B'), LengthsType()), 'src_ids': NeuralType(('B', 'T'), ChannelType()), - # 'targets': NeuralType(ChannelType(), ('B', 'D', 'T')), 'targets': NeuralType(('B', 'D', 'T'), LabelsType()), } @@ -88,16 +83,8 @@ def output_ports(self): """Returns definitions of module output ports. 
point_outputs: outputs of the generator - gate_outputs: outputs of gating heads - """ - # return { - # 'point_outputs': NeuralType( - # {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag), 3: AxisType(ChannelTag)} - # ), - # 'gate_outputs': NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(ChannelTag)}), - # } return { 'point_outputs': NeuralType(('B', 'T', 'D', 'D'), LogitsType()), 'gate_outputs': NeuralType(('B', 'D', 'D'), LogitsType()), diff --git a/nemo/collections/nlp/nm/trainables/joint_intent_slot/joint_intent_slot_nm.py b/nemo/collections/nlp/nm/trainables/joint_intent_slot/joint_intent_slot_nm.py index 4020e6e290b9..461a25c902e6 100644 --- a/nemo/collections/nlp/nm/trainables/joint_intent_slot/joint_intent_slot_nm.py +++ b/nemo/collections/nlp/nm/trainables/joint_intent_slot/joint_intent_slot_nm.py @@ -17,7 +17,7 @@ from torch import nn as nn from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM -from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init +from nemo.collections.nlp.utils.transformer_utils import transformer_weights_init from nemo.core import ChannelType, LogitsType, NeuralType from nemo.utils.decorators import add_port_docs @@ -35,14 +35,18 @@ class JointIntentSlotClassifier(TrainableNM): num_intents (int): number of intents num_slots (int): number of slots dropout (float): dropout to be applied to the layer + use_transformer_pretrained (bool): + TODO """ @property @add_port_docs() def input_ports(self): """Returns definitions of module input ports. + + hidden_states: + TODO """ - # return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} return {"hidden_states": NeuralType(('B', 'T', 'C'), ChannelType())} @property @@ -51,20 +55,11 @@ def output_ports(self): """Returns definitions of module output ports. intent_logits: - 0: AxisType(BatchTag) - - 1: AxisType(ChannelTag) - + TODO slot_logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) + TODO """ return { - # "intent_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), - # "slot_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), "intent_logits": NeuralType(('B', 'D'), LogitsType()), "slot_logits": NeuralType(('B', 'T', 'D'), LogitsType()), } @@ -85,7 +80,6 @@ def __init__(self, hidden_size, num_intents, num_slots, dropout=0.0, use_transfo ) if use_transformer_pretrained: self.apply(lambda module: transformer_weights_init(module, xavier=False)) - # self.to(self._device) def forward(self, hidden_states): hidden_states = self.dropout(hidden_states) diff --git a/nemo/collections/nlp/utils/__init__.py b/nemo/collections/nlp/utils/__init__.py index 9a0f97ecdc63..feaa9815482a 100644 --- a/nemo/collections/nlp/utils/__init__.py +++ b/nemo/collections/nlp/utils/__init__.py @@ -1,4 +1,21 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
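# The use_transformer_pretrained flag (still marked TODO in the docstrings above) gates
# a call to transformer_weights_init via nn.Module.apply, as shown in
# JointIntentSlotClassifier.__init__. A generic sketch of that apply pattern with a
# stand-in initializer; my_init is illustrative, not the real function from
# nemo.collections.nlp.utils.transformer_utils.
from torch import nn

def my_init(module, std=0.02):
    if isinstance(module, nn.Linear):
        module.weight.data.normal_(mean=0.0, std=std)   # re-initialize linear weights
        if module.bias is not None:
            module.bias.data.zero_()

head = nn.Sequential(nn.Linear(768, 768), nn.ReLU(), nn.Linear(768, 10))
head.apply(lambda module: my_init(module, std=0.02))    # applied recursively to submodules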
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + from nemo.collections.nlp.utils.callback_utils import * -from nemo.collections.nlp.utils.common_nlp_utils import * +from nemo.collections.nlp.utils.data_utils import * +from nemo.collections.nlp.utils.functional_utils import * from nemo.collections.nlp.utils.huggingface_utils import * -from nemo.collections.nlp.utils.loss_utils import * +from nemo.collections.nlp.utils.transformer_utils import * diff --git a/nemo/collections/nlp/utils/common_nlp_utils.py b/nemo/collections/nlp/utils/common_nlp_utils.py deleted file mode 100644 index cb6737bac97e..000000000000 --- a/nemo/collections/nlp/utils/common_nlp_utils.py +++ /dev/null @@ -1,144 +0,0 @@ -# ============================================================================= -# Copyright 2020 NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= - -import os -import re -import string - -import numpy as np - -from nemo import logging - -__all__ = [ - '_is_whitespace', - 'mask_padded_tokens', - 'read_intent_slot_outputs', - 'get_vocab', - 'write_vocab', - 'label2idx', - 'write_vocab_in_order', - 'if_exist', - 'remove_punctuation_from_sentence', - 'ids2text', - 'calc_class_weights', -] - - -def _is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - -def mask_padded_tokens(tokens, pad_id): - mask = tokens != pad_id - return mask - - -def read_intent_slot_outputs( - queries, intent_file, slot_file, intent_logits, slot_logits, slot_masks, intents=None, slots=None -): - intent_dict = get_vocab(intent_file) - slot_dict = get_vocab(slot_file) - pred_intents = np.argmax(intent_logits, 1) - pred_slots = np.argmax(slot_logits, axis=2) - slot_masks = slot_masks > 0.5 - for i, query in enumerate(queries): - logging.info(f'Query: {query}') - pred = pred_intents[i] - logging.info(f'Predicted intent:\t{pred}\t{intent_dict[pred]}') - if intents is not None: - logging.info(f'True intent:\t{intents[i]}\t{intent_dict[intents[i]]}') - - pred_slot = pred_slots[i][slot_masks[i]] - tokens = query.strip().split() - - if len(pred_slot) != len(tokens): - raise ValueError('Pred_slot and tokens must be of the same length') - - for j, token in enumerate(tokens): - output = f'{token}\t{slot_dict[pred_slot[j]]}' - if slots is not None: - output = f'{output}\t{slot_dict[slots[i][j]]}' - logging.info(output) - - -def get_vocab(file): - lines = open(file, 'r').readlines() - lines = [line.strip() for line in lines if line.strip()] - labels = {i: lines[i] for i in range(len(lines))} - return labels - - -def write_vocab(items, outfile): - vocab = {} - idx = 0 - with open(outfile, 'w') as f: - for item in items: - f.write(item + '\n') - vocab[item] = idx - idx += 1 - return vocab - - -def label2idx(file): - lines = open(file, 
'r').readlines() - lines = [line.strip() for line in lines if line.strip()] - labels = {lines[i]: i for i in range(len(lines))} - return labels - - -def write_vocab_in_order(vocab, outfile): - with open(outfile, 'w') as f: - for key in sorted(vocab.keys()): - f.write(f'{vocab[key]}\n') - - -def if_exist(outfold, files): - if not os.path.exists(outfold): - return False - for file in files: - if not os.path.exists(f'{outfold}/{file}'): - return False - return True - - -def remove_punctuation_from_sentence(sentence): - sentence = re.sub('[' + string.punctuation + ']', '', sentence) - sentence = sentence.lower() - return sentence - - -def ids2text(ids, vocab): - return ' '.join([vocab[int(id_)] for id_ in ids]) - - -def calc_class_weights(label_freq): - """ - Goal is to give more weight to the classes with less samples - so as to match the one with the higest frequency. We achieve this by - dividing the highest frequency by the freq of each label. - Example - - [12, 5, 3] -> [12/12, 12/5, 12/3] -> [1, 2.4, 4] - - Here label_freq is assumed to be sorted by the frequency. I.e. - label_freq[0] is the most frequent element. - - """ - - most_common_label_freq = label_freq[0] - weighted_slots = sorted([(index, most_common_label_freq[1] / freq) for (index, freq) in label_freq]) - return [weight for (_, weight) in weighted_slots] diff --git a/nemo/collections/nlp/utils/data_utils.py b/nemo/collections/nlp/utils/data_utils.py new file mode 100644 index 000000000000..d57c782fedca --- /dev/null +++ b/nemo/collections/nlp/utils/data_utils.py @@ -0,0 +1,57 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
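# Worked example for the calc_class_weights helper deleted above: given (index, frequency)
# pairs sorted from most to least frequent, each class is weighted by most_frequent / freq,
# so rarer classes get proportionally larger weights (the docstring's [12, 5, 3] case).
def calc_class_weights(label_freq):
    most_common_label_freq = label_freq[0]
    weighted_slots = sorted([(index, most_common_label_freq[1] / freq) for (index, freq) in label_freq])
    return [weight for (_, weight) in weighted_slots]

print(calc_class_weights([(0, 12), (1, 5), (2, 3)]))   # [1.0, 2.4, 4.0]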
+# ============================================================================= + +import re +import string + +__all__ = ['get_vocab', 'get_tokens', 'normalize_answer', 'mask_padded_tokens'] + + +def get_vocab(file): + lines = open(file, 'r').readlines() + lines = [line.strip() for line in lines if line.strip()] + labels = {i: lines[i] for i in range(len(lines))} + return labels + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def get_tokens(s): + if not s: + return [] + return normalize_answer(s).split() + + +def mask_padded_tokens(tokens, pad_id): + mask = tokens != pad_id + return mask diff --git a/nemo/collections/nlp/utils/functional_utils.py b/nemo/collections/nlp/utils/functional_utils.py new file mode 100644 index 000000000000..b1f4353dc049 --- /dev/null +++ b/nemo/collections/nlp/utils/functional_utils.py @@ -0,0 +1,66 @@ +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
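# Quick usage sketch for the new nemo.collections.nlp.utils.data_utils helpers defined
# above; the commented outputs follow directly from those definitions.
import torch
from nemo.collections.nlp.utils.data_utils import get_tokens, mask_padded_tokens, normalize_answer

print(normalize_answer("The Quick, Brown Fox!"))   # "quick brown fox"
print(get_tokens("The Quick, Brown Fox!"))         # ['quick', 'brown', 'fox']

tokens = torch.tensor([[5, 7, 0, 0]])
print(mask_padded_tokens(tokens, pad_id=0))        # tensor([[ True,  True, False, False]])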
+# ============================================================================= + +import math + +import torch + +__all__ = ['_compute_softmax', 'gelu'] + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +def gelu(x): + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) diff --git a/nemo/collections/nlp/utils/huggingface_utils.py b/nemo/collections/nlp/utils/huggingface_utils.py index 98f3df9c36b7..8cb3965ad326 100644 --- a/nemo/collections/nlp/utils/huggingface_utils.py +++ b/nemo/collections/nlp/utils/huggingface_utils.py @@ -14,6 +14,24 @@ # limitations under the License. # ============================================================================= +# ============================================================================= +# Copyright 2018 The Google AI Language Team Authors and +# The HuggingFace Inc. team. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + __all__ = ['MODEL_SPECIAL_TOKENS', 'MODEL_NAMES'] MODEL_SPECIAL_TOKENS = { diff --git a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_utils.py b/nemo/collections/nlp/utils/transformer_utils.py similarity index 73% rename from nemo/collections/nlp/nm/trainables/common/transformer/transformer_utils.py rename to nemo/collections/nlp/utils/transformer_utils.py index 4f3f80ec670a..4c8742098182 100644 --- a/nemo/collections/nlp/nm/trainables/common/transformer/transformer_utils.py +++ b/nemo/collections/nlp/utils/transformer_utils.py @@ -1,13 +1,25 @@ -import math +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
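# Usage sketch for the relocated helpers in nemo.collections.nlp.utils.functional_utils
# defined above; the printed values are approximate.
import torch
from nemo.collections.nlp.utils.functional_utils import _compute_softmax, gelu

print(_compute_softmax([0.0, 1.0]))     # [~0.269, ~0.731], stable even for large score values
print(gelu(torch.tensor([0.0, 1.0])))   # tensor([0.0000, ~0.8413])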
+# ============================================================================= import torch import torch.nn as nn -NEG_INF = -10000.0 - +__all__ = ['form_attention_mask', 'transformer_weights_init'] -def gelu(x): - return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) +NEG_INF = -10000.0 def form_attention_mask(input_mask, diagonal=None): diff --git a/nemo/core/neural_types/elements.py b/nemo/core/neural_types/elements.py index d963831e2cbc..945506065a34 100644 --- a/nemo/core/neural_types/elements.py +++ b/nemo/core/neural_types/elements.py @@ -34,6 +34,7 @@ 'LengthsType', 'EmbeddedTextType', 'EncodedRepresentation', + 'MaskType', ] import abc from abc import ABC, abstractmethod @@ -188,3 +189,7 @@ class RegressionValuesType(PredictionsType): class CategoricalValuesType(PredictionsType): """Element type to represent labels for categorical classification task""" + + +class MaskType(PredictionsType): + """Element type to represent boolean mask""" diff --git a/tests/core/test_weight_share.py b/tests/core/test_weight_share.py index 6317052ae77d..80934aa8b08a 100644 --- a/tests/core/test_weight_share.py +++ b/tests/core/test_weight_share.py @@ -28,7 +28,7 @@ import nemo import nemo.collections.asr as nemo_asr from nemo.backends.pytorch.nm import DataLayerNM -from nemo.collections.nlp.nm.losses import PaddedSmoothedCrossEntropyLossNM +from nemo.collections.nlp.nm.losses import SmoothedCrossEntropyLoss from nemo.collections.nlp.nm.trainables.common import TokenClassifier from nemo.core import WeightShareTransform from nemo.core.neural_types import * @@ -181,7 +181,7 @@ def data_iterator(self): embd = nemo.backends.pytorch.common.other.SequenceEmbedding(voc_size=voc_size, hidden_size=dim) proj = TokenClassifier(hidden_size=dim, num_classes=voc_size) data = DummyDataLayer(voc_size) - loss = PaddedSmoothedCrossEntropyLossNM(0) + loss = SmoothedCrossEntropyLoss(pad_id=0) embd.tie_weights_with( proj, weight_names=["embedding.weight"], @@ -193,7 +193,7 @@ def data_iterator(self): _in, _out = data() pred = embd(input_seq=_in) pred = proj(hidden_states=pred) - loss_t = loss(target_ids=_out, logits=pred) + loss_t = loss(labels=_out, logits=pred) self.nf.train( [loss_t], optimizer="sgd", optimization_params={"max_steps": 5, "lr": 0.0003}, @@ -245,7 +245,7 @@ def data_iterator(self): embd = nemo.backends.pytorch.common.other.SequenceEmbedding(voc_size=voc_size, hidden_size=dim) proj = TokenClassifier(hidden_size=dim, num_classes=voc_size) data = DummyDataLayer(voc_size) - loss = PaddedSmoothedCrossEntropyLossNM(0) + loss = SmoothedCrossEntropyLoss(pad_id=0) # embd.tie_weights_with( # proj, # weight_names=["embedding.weight"], @@ -257,7 +257,7 @@ def data_iterator(self): _in, _out = data() pred = embd(input_seq=_in) pred = proj(hidden_states=pred) - loss_t = loss(target_ids=_out, logits=pred) + loss_t = loss(labels=_out, logits=pred) self.nf.train( [loss_t], optimizer="sgd", optimization_params={"max_steps": 5, "lr": 0.0003},