From af21b7765882d9a694f2c0c5e1cb7d87d98d733e Mon Sep 17 00:00:00 2001
From: Daniel Galvez
Date: Thu, 7 Mar 2024 12:45:48 -0800
Subject: [PATCH 001/140] Accelerate CTC greedy decoding by around 10% (#8521)

* Accelerate CTC greedy decoding by over 10x.

There were a few problems with CTC greedy decoding before:

- It would copy GPU memory to pageable memory originally, which is very slow compared to copying to pinned memory. Unfortunately, cudaMallocHost() is synchronous and "slow", but fortunately pytorch has a free list of recent pinned memory allocations in its pinned memory allocator, allowing us to reduce the number of these calls.
- A single scalar of logits_len/decoder_lengths was copied from GPU to CPU at a time, rather than all at once in a single call. This caused needless overhead. We could also use a pinned memory allocation for copying logits_len as well, but using pinned memory is less important for small allocations.
- detach() was called in a function marked with @nograd.
- For some reason, someone was using torch_tensor.numpy().tolist() instead of torch_tensor.tolist(). I don't believe numpy() makes a copy of the data, but it is unnecessary.

The important improvements are the first two bullet points; everything else is minor. There are more opportunities for improvement. In particular, logits gets copied to cpu twice if trcfg.return_hypotheses is True.

Performance improvements: I ran this code on an A100 GPU, on a machine with 16 CPU cores:

```
python examples/asr/speech_to_text_eval.py \
 pretrained_name=nvidia/parakeet-ctc-1.1b \
 dataset_manifest=/home/dgalvez/scratch/data/test_other.json \
 batch_size=16 output_filename=test_other_decoded.jsonl amp=true \
 amp_dtype=bfloat16 use_cer=false num_workers=1
```

Time for each of 5 evaluations of LibriSpeech test-other before my changes: 33, 29, 29, 30, 28 seconds. Average: 29.8 seconds; average excluding the first (warmup) run: 29 seconds.

Time for each of 5 evaluations of LibriSpeech test-other after my changes: 35, 28, 29, 26, 26 seconds. Average: 28.8 seconds; average excluding the first (warmup) run: 27.25 seconds.

This corresponds to an almost 10% end-to-end speedup, which meets expectations, since about 10% of the time was originally spent on CTC greedy decoding. You may wonder why the first iteration is slower: calling cudaMallocHost() and then doing a pinned-memory copy from GPU to CPU is slower than doing a pageable-memory copy from GPU to CPU. However, the cudaMallocHost() allocations are cached over time, allowing us to avoid their overhead in later evaluations of the dataset.

Signed-off-by: Daniel Galvez

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Make this work in CPU-only mode.
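In isolation, the pinned-memory idiom this patch introduces looks like the following minimal sketch; `logits` and `to_cpu_pinned` are illustrative names, not the exact NeMo code path:

```python
# Minimal, standalone sketch of the pinned-memory device-to-host copy used in
# this patch. `logits` is an illustrative name; any CUDA tensor works.
import torch

def to_cpu_pinned(logits: torch.Tensor) -> torch.Tensor:
    if logits.is_cuda:
        # pin_memory=True allocates page-locked host memory via cudaMallocHost();
        # PyTorch caches these allocations, so repeated calls are cheap after warmup.
        logits_cpu = torch.empty(
            logits.shape, dtype=logits.dtype, device=torch.device("cpu"), pin_memory=True
        )
        # Copying into pinned memory lets the transfer be issued asynchronously
        # instead of going through the slow pageable-memory path.
        logits_cpu.copy_(logits, non_blocking=True)
    else:
        logits_cpu = logits
    return logits_cpu
```

The same pattern appears in both ctc_models.py and ctc_greedy_decoding.py in the diff below.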
Signed-off-by: Daniel Galvez * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Daniel Galvez Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo/collections/asr/models/ctc_models.py | 16 +++++++--- .../parts/submodules/ctc_greedy_decoding.py | 32 +++++++++++++++---- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index 42406415651c..5f380619db68 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -662,14 +662,22 @@ def _transcribe_output_processing(self, outputs, trcfg: TranscribeConfig) -> Gen current_hypotheses, all_hyp = self.decoding.ctc_decoder_predictions_tensor( logits, decoder_lengths=logits_len, return_hypotheses=trcfg.return_hypotheses, ) - logits = logits.cpu() - if trcfg.return_hypotheses: + if logits.is_cuda: + # See comment in + # ctc_greedy_decoding.py::GreedyCTCInfer::forward() to + # understand this idiom. + logits_cpu = torch.empty(logits.shape, dtype=logits.dtype, device=torch.device("cpu"), pin_memory=True) + logits_cpu.copy_(logits, non_blocking=True) + else: + logits_cpu = logits + logits_len = logits_len.cpu() # dump log probs per file - for idx in range(logits.shape[0]): - current_hypotheses[idx].y_sequence = logits[idx][: logits_len[idx]] + for idx in range(logits_cpu.shape[0]): + current_hypotheses[idx].y_sequence = logits_cpu[idx][: logits_len[idx]] if current_hypotheses[idx].alignments is None: current_hypotheses[idx].alignments = current_hypotheses[idx].y_sequence + del logits_cpu # cleanup memory del logits, logits_len diff --git a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py index 686ef79cabad..ab4b4c40e860 100644 --- a/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py @@ -161,7 +161,25 @@ def forward( with torch.inference_mode(): hypotheses = [] # Process each sequence independently - prediction_cpu_tensor = decoder_output.cpu() + + if decoder_output.is_cuda: + # This two-liner is around twenty times faster than: + # `prediction_cpu_tensor = decoder_output.cpu()` + # cpu() does not use pinned memory, meaning that a slow pageable + # copy must be done instead. + prediction_cpu_tensor = torch.empty( + decoder_output.shape, dtype=decoder_output.dtype, device=torch.device("cpu"), pin_memory=True + ) + prediction_cpu_tensor.copy_(decoder_output, non_blocking=True) + else: + prediction_cpu_tensor = decoder_output + + if decoder_lengths is not None and isinstance(decoder_lengths, torch.Tensor): + # Before this change, self._greedy_decode_labels would copy + # each scalar from GPU to CPU one at a time, in the line: + # prediction = prediction[:out_len] + # Doing one GPU to CPU copy ahead of time amortizes that overhead. 
+ decoder_lengths = decoder_lengths.cpu() if prediction_cpu_tensor.ndim < 2 or prediction_cpu_tensor.ndim > 3: raise ValueError( @@ -192,7 +210,7 @@ def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: torch.Tensor): # Initialize blank state and empty label set in Hypothesis hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) - prediction = x.detach().cpu() + prediction = x.cpu() if out_len is not None: prediction = prediction[:out_len] @@ -200,7 +218,7 @@ def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: torch.Tensor): prediction_logprobs, prediction_labels = prediction.max(dim=-1) non_blank_ids = prediction_labels != self.blank_id - hypothesis.y_sequence = prediction_labels.numpy().tolist() + hypothesis.y_sequence = prediction_labels.tolist() hypothesis.score = (prediction_logprobs[non_blank_ids]).sum() if self.preserve_alignments: @@ -208,7 +226,7 @@ def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: torch.Tensor): hypothesis.alignments = (prediction.clone(), prediction_labels.clone()) if self.compute_timestamps: - hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].numpy().tolist() + hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].tolist() if self.preserve_frame_confidence: hypothesis.frame_confidence = self._get_confidence(prediction) @@ -222,20 +240,20 @@ def _greedy_decode_labels(self, x: torch.Tensor, out_len: torch.Tensor): # Initialize blank state and empty label set in Hypothesis hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) - prediction_labels = x.detach().cpu() + prediction_labels = x.cpu() if out_len is not None: prediction_labels = prediction_labels[:out_len] non_blank_ids = prediction_labels != self.blank_id - hypothesis.y_sequence = prediction_labels.numpy().tolist() + hypothesis.y_sequence = prediction_labels.tolist() hypothesis.score = -1.0 if self.preserve_alignments: raise ValueError("Requested for alignments, but predictions provided were labels, not log probabilities.") if self.compute_timestamps: - hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].numpy().tolist() + hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].tolist() if self.preserve_frame_confidence: raise ValueError( From aad6cf7789c8b54c4e6cc22bd1ff9141e566a61e Mon Sep 17 00:00:00 2001 From: Krishna Puvvada <93558329+krishnacpuvvada@users.noreply.github.com> Date: Thu, 7 Mar 2024 15:42:57 -0800 Subject: [PATCH 002/140] bug fix in transcribe_speech.py (#8611) Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada --- examples/asr/transcribe_speech.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index b8a10f01603f..6d6006e939e5 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -396,8 +396,6 @@ def autocast(dtype=None): logging.info(f"Finished transcribing from manifest file: {cfg.dataset_manifest}") if cfg.presort_manifest: transcriptions = restore_transcription_order(cfg.dataset_manifest, transcriptions) - if remove_path_after_done is not None: - os.unlink(remove_path_after_done) else: logging.info(f"Finished transcribing {len(filepaths)} files !") logging.info(f"Writing transcriptions into file: {cfg.output_filename}") @@ -420,6 +418,11 @@ def autocast(dtype=None): ) logging.info(f"Finished writing predictions to {output_filename}!") 
+ # clean-up + if cfg.presort_manifest is not None: + if remove_path_after_done is not None: + os.unlink(remove_path_after_done) + if cfg.calculate_wer: output_manifest_w_wer, total_res, _ = cal_write_wer( pred_manifest=output_filename, From 8f3855f241099a83b405d2057998d628789ec73b Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Thu, 7 Mar 2024 16:34:42 -0800 Subject: [PATCH 003/140] Remove irrelevant multimodal models in docs (#8574) * Remove irrelevant models in docs Signed-off-by: yaoyu-33 * Fix citation Signed-off-by: yaoyu-33 * clean up mm intro page Signed-off-by: yaoyu-33 * Update docs/source/multimodal/nerf/intro.rst Signed-off-by: Eric Harper --------- Signed-off-by: yaoyu-33 Signed-off-by: Eric Harper Co-authored-by: Eric Harper --- docs/source/multimodal/mllm/intro.rst | 100 +--------------------- docs/source/multimodal/nerf/intro.rst | 47 +--------- docs/source/multimodal/text2img/intro.rst | 86 +------------------ docs/source/multimodal/vlm/intro.rst | 72 +--------------- 4 files changed, 4 insertions(+), 301 deletions(-) diff --git a/docs/source/multimodal/mllm/intro.rst b/docs/source/multimodal/mllm/intro.rst index 4a87ac44e7c5..687ecd930a9e 100644 --- a/docs/source/multimodal/mllm/intro.rst +++ b/docs/source/multimodal/mllm/intro.rst @@ -1,97 +1,7 @@ Multimodal Language Models ========================== -The endeavor to extend Language Models (LLMs) into multimodal domains by integrating additional structures like visual encoders has become a focal point of recent research, especially given its potential to significantly lower the cost compared to training multimodal universal models from scratch. - -The advent of GPT-4 has spurred a plethora of developments including notable models like LLaVA, Mini-GPT4, and Flamingo. These models, despite minor differences, share similar structural and training strategies. - -Supported Models ------------------ -NeMo Multimodal currently supports the following models: - -+-----------------------------------+----------+-------------+------+-------------------------+------------------+ -| Model | Training | Fine-Tuning | PEFT | Evaluation | Inference | -+===================================+==========+=============+======+=========================+==================+ -| `NeVA (LLaVA) <./neva.html>`_ | Yes | Yes | - | - | Yes | -+-----------------------------------+----------+-------------+------+-------------------------+------------------+ -| Kosmos-2 | WIP | WIP | - | - | WIP | -+-----------------------------------+----------+-------------+------+-------------------------+------------------+ - -Spotlight Models ------------------ - -LLaVA: Visual Instruction Tuning -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -LLaVA :cite:`mm-models-llava` focuses on creating a dataset for visual instruction tuning to enhance LLMs' ability to comprehend diverse instructions and provide detailed responses. NeMo's implementation of LLaVA is called NeVA. - -- Model Structure: - - Visual Encoder: Utilizes CLIP’s ViT-L/14. - - Text Decoder: Employs LLaMA. - - Connection: A simple linear mapping layer connects the visual encoder's output to the text decoder's word embedding space (v1.0 version). - -- Training: - 1. Cross-modal Pre-training: Utilizes 595k image-text data from CC3M, training only the linear mapping layer while keeping the visual encoder and text decoder frozen. - 2. 
Instruction Fine-tuning: Custom-built 158k multimodal instruction dataset employed for fine-tuning targeting multimodal chatbot scenarios, with a variant targeting the Science QA dataset. - -Flamingo: A Visual Language Model for Few-Shot Learning -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Flamingo :cite:`mm-models-flamingo` addresses inconsistent visual feature map sizes by generating fixed-length feature sequences, enhancing visual relevance generation. - -- Model Structure: - - Resampler: Utilizes a Perceiver Resampler for generating fixed-length feature sequences. - - Attention: Adds cross-attention layers before each LLM layer to enhance visual relevance generation. - -- Training: - - Dataset: Utilizes data from various datasets like M3W, ALIGN, LTIP, and VTP emphasizing multimodal in-context learning. - -Kosmos-1: Language Is Not All You Need: Aligning Perception with Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Kosmos-1 :cite:`mm-models-kosmos1` by Microsoft is a Multimodal Large Language Model (MLLM) aimed at melding language, perception, action, and world modeling. - -- Model Structure: - - Core Backbone: Transformer-Based Causal Language Model. - - Architecture: Utilizes MAGNETO, a nuanced Transformer variant. - - Position Encoding: Employs XPOS relative position encoding for long-context modeling. - - Resampler: Employs Flamingo's Perceiver Resampler - -- Training: - - Dataset: Encompasses web-scale multimodal corpora including monomodal, cross-modal paired, and interleaved multimodal data. - - Objective: Focused on next-token prediction to maximize log-likelihood of tokens within examples. - -BLIP-2: Bootstrapping Language-Image Pre-training -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -BLIP-2 :cite:`mm-models-blip2` adopts a two-phase training strategy focusing on learning key visual information and adapting visual encoding structure to LLMs. - -- Model Structure: - - Visual Encoder: Combines a pre-trained image encoder with a Querying Transformer (Q-Former). - - Bridging: The Q-Former acts as the bridge between the image encoder and the Large Language Model (LLM). - -- Training: - 1. Phase 1: Focuses on tasks like Image-Text Contrastive Learning, Image-grounded Text Generation, and Image-Text Matching. - 2. Phase 2: Aims at adapting the visual encoding structure's output to LLMs with language modeling as the training task. - -Mini-GPT4: Enhancing Vision-Language Understanding -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Mini-GPT4 :cite:`mm-models-minigpt4` emphasizes the importance of multimodal instruction data for model performance in multimodal open-ended scenarios. - -- Model Structure: - - Visual Encoder: Employs BLIP2’s ViT and Q-Former. - - Text Decoder: Uses Vicuna (a fine-tuned version of LLaMA). - - Connection: A linear mapping layer projects visual features into text representation space. - -- Training: - 1. Cross-modal Learning: Focuses on learning the relationship between vision and language using data from CC+SBU+LAION datasets. - 2. Fine-tuning: Utilizes a multimodal fine-tuning dataset built using ChatGPT to enhance text descriptions generated in phase 1. - -.. note:: - NeMo Megatron has an Enterprise edition which proffers tools for data preprocessing, hyperparameter tuning, containers, scripts for various clouds, and more. With the Enterprise edition, you also garner deployment tools. Apply for `early access here `_ . 
- -For more information, see additional sections in the NeMo multimodal language model docs on the left-hand-side menu or in the list below: +The endeavor to extend Language Models (LLMs) into multimodal domains by integrating additional structures like visual encoders has become a focal point of recent research, especially given its potential to significantly lower the cost compared to training multimodal universal models from scratch. Please refer to `NeMo Framework User Guide for Multimodal Models `_ for detailed support information. .. toctree:: :maxdepth: 1 @@ -101,11 +11,3 @@ For more information, see additional sections in the NeMo multimodal language mo checkpoint neva -References ----------- - -.. bibliography:: ../mm_all.bib - :style: plain - :filter: docname in docnames - :labelprefix: MM-MODELS - :keyprefix: mm-models- diff --git a/docs/source/multimodal/nerf/intro.rst b/docs/source/multimodal/nerf/intro.rst index eca057215a75..1380fe65a54d 100644 --- a/docs/source/multimodal/nerf/intro.rst +++ b/docs/source/multimodal/nerf/intro.rst @@ -1,42 +1,6 @@ NeRF ==== -NeMO NeRF is a collection of models and tools for training 3D and 4D models. - -The library is designed with a modular approach, enabling developers to explore and find the most suitable solutions for their requirements, -and allowing researchers to accelerate their experimentation process. - - -Supported Models ------------------ -NeMo NeRF currently supports the following models: - -+----------------------------------------+------------+ -| Model | Categories | -+========================================+============+ -| `DreamFusion <./dreamfusion.html>`_ | text to 3D | -+----------------------------------------+------------+ - - -Spotlight Models ------------------ - -DreamFusion -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The `DreamFusion `_ model utilizing pre-trained 2D text-to-image diffusion models to create detailed 3D objects from textual descriptions. -This approach overcomes the limitations of traditional 3D synthesis, which typically requires extensive labeled 3D data and sophisticated denoising architectures. -At the core of DreamFusion is the optimization of a Neural Radiance Field (NeRF), a parametric model for rendering 3D scenes. -The optimization process is driven by a loss function based on probability density distillation, which enables the 2D diffusion model to act as an effective prior. -DreamFusion is capable of producing 3D models that are not only accurate representations of the input text but also offer versatility in terms of rendering from any viewpoint, -relighting under diverse lighting conditions, and integration into various 3D environments. Importantly, this method achieves these results without the need for -specific 3D training data or modifications to the existing image diffusion model. - -- Model Structure: - - Text-to-image model: a pretrained text-to-image diffusion model is used to generate a 2D image from a given text. - - NeRF: a neural radiance field (NeRF) that can generate novel views of complex 3D scenes, based on a partial set of 2D images. - - Renderer: A volume rendering layer is used to render the NeRF model from a given viewpoint. - - -For more information, see additional sections in the NeRF docs on the left-hand-side menu or in the list below: +NeMo NeRF is a collection of models and tools for training 3D and 4D models. Please refer to `NeMo Framework User Guide for Multimodal Models `_ for detailed support information. .. 
toctree:: :maxdepth: 1 @@ -44,12 +8,3 @@ For more information, see additional sections in the NeRF docs on the left-hand- datasets configs dreamfusion - -References ----------- - -.. bibliography:: ../mm_all.bib - :style: plain - :filter: docname in docnames - :labelprefix: MM-MODELS - :keyprefix: mm-models- diff --git a/docs/source/multimodal/text2img/intro.rst b/docs/source/multimodal/text2img/intro.rst index 39ce33562d50..9ec793d246fa 100644 --- a/docs/source/multimodal/text2img/intro.rst +++ b/docs/source/multimodal/text2img/intro.rst @@ -1,82 +1,7 @@ Text to Image Models ==================== - -Supported Models ------------------ -NeMo Multimodal currently supports the following models: - -+----------------------------------------+------------+ -| Model | Categories | -+========================================+============+ -| `Stable Diffusion <./sd.html>`_ | Foundation | -+----------------------------------------+------------+ -| `Imagen <./imagen.html>`_ | Foundation | -+----------------------------------------+------------+ -| `DreamBooth <./dreambooth.html>`_ | Finetune | -+----------------------------------------+------------+ -| `ControlNet <./controlnet.html>`_ | Finetune | -+----------------------------------------+------------+ -| `instructPix2Pix <./insp2p.html>`_ | Finetune | -+----------------------------------------+------------+ - - -Text2Img Foundation Models --------------------------- -Text-to-image models are a fascinating category of artificial intelligence models that aim to generate realistic images from textual descriptions. The mainstream text-2-image models can be broadly grouped into: - -#. **Diffusion Based Models**: these models leverage diffusion processes to - generate images from text and may operate in the latent space (Stable Diffusion :cite:`mm-models-rombach2022highresolution`) or directly in the pixel space (Imagen :cite:`mm-models-saharia2022photorealistic`). These models typically use probabilistic models to model the generation process. - They consider the sequential diffusion of information, which helps them generate images in a more coherent and controlled manner. - This approach is known for producing high-quality and diverse images while incorporating textual descriptions. - -#. **Autoregressive Based Models**: like Parti :cite:`mm-models-yu2022scaling` - and Make-A-Scene :cite:`mm-models-gafni2022makeascene`, generate images one pixel or region at a time. - These models take in the text description and gradually build the image pixel by pixel or element by element in - an autoregressive manner. While this approach can produce detailed images, it can be computationally expensive - and may not scale well for high-resolution images. - - -#. **Masked Token Prediction Models**: including MUSE :cite:`mm-models-chang2023muse`, employ masked token prediction-based architectures. - These models learn to map text and image inputs into a shared embedding space. - They use a masked token prediction task during pretraining, allowing them to understand the - relationships between text and images. Given a text prompt, they can retrieve or generate images - that align with the content and context of the text description. - - -Each of these approaches has its strengths and weaknesses, making them suitable for different use cases and scenarios. 
-Diffusion-based models excel in generating diverse and high-quality images, autoregressive models offer fine-grained control, -and masked token prediction-based models are strong at understanding and aligning text and images. -The choice of model depends on the specific requirements of the text-to-image generation task at hand. - - -Approaches to Customize/Extend Text2Img Models ----------------------------------------------- - -Customizing and extending Text2Img models can be essential to tailor these foundation models to -specific applications or creative tasks. Some popular approaches to customize and extend text2img models include: - - -#. **Text-Based Image Editing**: such as instructPix2Pix :cite:`mm-models-insp2p`, involves manipulating or modifying generated images based on - textual descriptions. To customize text2img models for this purpose, one can employ post-processing techniques to - alter the generated images. - -#. **Injecting New Concepts**: including DreamBooth :cite:`mm-models-ruiz2023dreambooth`, can introduce new concepts into text2img models. This is typically done by - adapting foundation models with additional data for finetuning. - -#. **Adding Conditionings to Guide Image Generation**: like ControlNet :cite:`mm-models-zhang2023adding`, allows for greater control and specificity in the generated images. - These conditionings can be based on various factors including specific attributes mentioned in the text (such as colors, sizes, or object properties), - spatial information, style and mood. - -Customizing and extending Text2Img models based on these approaches empowers users to have more control over the generated content, -make images more contextually relevant, and adapt the models to a wide array of creative and practical tasks, -from art creation to content personalization. - -.. note:: - NeMo Megatron has an Enterprise edition which proffers tools for data preprocessing, hyperparameter tuning, containers, scripts for various clouds, and more. With the Enterprise edition, you also garner deployment tools. Apply for `early access here `_ . - - -For more information, see additional sections in the MM Text2Img docs on the left-hand-side menu or in the list below: +NeMo multimodal provides implementations of multiple image-to-text models, including Stable Diffusion, Imagen, DreamBooth, ControlNet, and InstructPix2Pix. Please refer to `NeMo Framework User Guide for Multimodal Models `_ for detailed support information. .. toctree:: :maxdepth: 1 @@ -88,12 +13,3 @@ For more information, see additional sections in the MM Text2Img docs on the lef imagen dreambooth controlnet - -References ----------- - -.. bibliography:: ../mm_all.bib - :style: plain - :filter: docname in docnames - :labelprefix: MM-MODELS - :keyprefix: mm-models- \ No newline at end of file diff --git a/docs/source/multimodal/vlm/intro.rst b/docs/source/multimodal/vlm/intro.rst index 949fb8a11196..2885b27e24a4 100644 --- a/docs/source/multimodal/vlm/intro.rst +++ b/docs/source/multimodal/vlm/intro.rst @@ -1,68 +1,7 @@ Vision-Language Foundation ========================== -Humans naturally process information using multiple senses like sight and sound. Similarly, multi-modal learning aims to create models that handle different types of data, such as images, text, and audio. There's a growing trend in models that combine vision and language, like OpenAI's CLIP. These models excel in tasks like aligning image and text features, image captioning and visual question-answering. 
Their ability to generalize without specific training offers many practical uses. - -Supported Models ------------------ -NeMo Multimodal currently supports the following models: - -+-----------------------------------+----------+-------------+------+-------------------------+------------------+ -| Model | Training | Fine-Tuning | PEFT | Evaluation | Inference | -+===================================+==========+=============+======+=========================+==================+ -| `CLIP <./clip.html>`_ | ✓ | - | - | zero-shot imagenet | similarity score | -+-----------------------------------+----------+-------------+------+-------------------------+------------------+ - -Spotlight Models ------------------ - -Vision-Language models are at the forefront of multimodal learning, showcasing impressive abilities in tasks that require a combination of visual and textual comprehension. Let's take a quick look at some key models driving progress in this field: - -#. **Contrastive Learning Based Models**: At the forefront is CLIP :cite:`mm-models-radford2021clip`, which harnesses contrastive learning to jointly fine-tune a text and image encoder, facilitating a gamut of downstream tasks. CLIP's success has spurred further research, leading to models like ALIGN :cite:`mm-models-saharia2022photorealistic` and DeCLIP :cite:`mm-models-li2021declip`. - -#. **Holistic Foundation Models**: FLAVA :cite:`mm-models-singh2022flava` aspires to craft a universal model adept at vision, language, and multimodal tasks. Through a unified architecture, it vies to excel across a spectrum of tasks, embodying the essence of a true foundation model. - -#. **Bootstrapping Techniques**: BLIP :cite:`mm-models-blip2` employs a pioneering framework that shines in both understanding-based and generation-based vision-language tasks. By bootstrapping captions from noisy web data, it exhibits remarkable generalization across a plethora of vision-language challenges. - -Anatomy of Vision-Language Models ----------------------------------- - -At their core, vision-language models fundamentally consist of three main parts: - -1. **Image Encoder:** Extracts features from images. -2. **Text Encoder:** Extracts features from textual data. -3. **Fusion Strategy:** Merges the information gleaned from both encoders. - -These models have undergone a significant transformation. Earlier models used manually designed image descriptors and pre-trained word vectors. Nowadays, models primarily utilize transformer architectures for both image and text encoding, learning features together or separately. The pre-training objectives of these models are carefully designed to suit a wide range of tasks. - -Contrastive Learning: Bridging Vision and Language ---------------------------------------------------- - -Contrastive learning has burgeoned as a pivotal pre-training objective, especially for vision-language models. Models like CLIP, CLOOB, ALIGN, and DeCLIP have harnessed contrastive learning to bridge the chasm between vision and language. They accomplish this by jointly learning a text encoder and an image encoder using a contrastive loss, typically on extensive datasets encompassing {image, caption} pairs. - -The quintessence of contrastive learning is to map images and texts to a shared feature realm. Here, the distance between the embeddings of congruent image-text pairs is minimized, while it's maximized for incongruent pairs. 
For instance, CLIP employs the cosine distance between text and image embeddings, while models like ALIGN and DeCLIP have crafted their own distance metrics to cater to the intricacies of their datasets. - -CLIP and Beyond ---------------- - -The CLIP (Contrastive Language-Image Pre-training) model has notably served as a linchpin for various models and applications within the realms of deep learning and computer vision, and also within the NeMo toolkit. Below is an elucidation on how the CLIP model extends its influence into other models and domains: - -1. **Use Cases in Vision Tasks:** - * **Classification:** CLIP can be harnessed for classification tasks, accepting arbitrary text labels for zero-shot classification on video frames or images. - * **Semantic Image Search:** Constructing a semantic image search engine with CLIP showcases its capability to generate embeddings for semantic content analysis and similarity search. - -2. **Image Similarity and Clustering:** - * In a practical scenario, CLIP's embeddings were leveraged for an image similarity search engine, showcasing its effectiveness in generating useful representations for visual similarity scenarios, even without being specifically trained for such tasks. - -3. **Foundation for Multimodal Language Models:** - * Large language models with visual capabilities, such as LLaVA, Flamingo, Kosmos-1, and Kosmos-2, have leaned on CLIP's architecture. In these models, images are encoded using a visual encoder derived from CLIP. - -4. **Foundation Diffusion Models:** - * Models like Stable Diffusion and Imagen have tapped into the prowess of the text encoder from CLIP to condition their processes based on text prompts. This integration exemplifies the adaptability and influence of the CLIP encoder in the broader AI landscape, especially in the domain of diffusion models. - -.. note:: - NeMo Megatron has an Enterprise edition which proffers tools for data preprocessing, hyperparameter tuning, containers, scripts for various clouds, and more. With the Enterprise edition, you also garner deployment tools. Apply for `early access here `_ . - +Humans naturally process information using multiple senses like sight and sound. Similarly, multimodal learning aims to create models that handle different data types, such as images, text, and audio. There's a growing trend in models that combine vision and language, like OpenAI's CLIP. These models excel at tasks like aligning image and text features, image captioning, and visual question-answering. Their ability to generalize without specific training offers many practical uses. Please refer to `NeMo Framework User Guide for Multimodal Models `_ for detailed support information. .. toctree:: :maxdepth: 1 @@ -71,12 +10,3 @@ The CLIP (Contrastive Language-Image Pre-training) model has notably served as configs checkpoint clip - -References ----------- - -.. 
bibliography:: ../mm_all.bib - :style: plain - :filter: docname in docnames - :labelprefix: MM-MODELS - :keyprefix: mm-models- \ No newline at end of file From 60af0825aa5a067353e97a1a7877d78b96fa1400 Mon Sep 17 00:00:00 2001 From: Gerald Shen <119401249+gshennvm@users.noreply.github.com> Date: Thu, 7 Mar 2024 17:20:43 -0800 Subject: [PATCH 004/140] run val only if val dataloader exists (#8605) Signed-off-by: Gerald Shen Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> --- .../nlp/models/language_modeling/megatron_base_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 15df152b79c2..cd5587351ecd 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -333,8 +333,8 @@ def _reconfigure_val_batches(self): self.trainer.limit_val_batches *= get_num_microbatches() else: assert isinstance(self.trainer.limit_val_batches, float) - # Don't reconfigure if limit_val_batches is 0.0 - if self.trainer.limit_val_batches == 0.0: + # Don't reconfigure if limit_val_batches is 0.0 or if there's no val dataloader + if self.trainer.limit_val_batches == 0.0 or self._validation_dl is None: return # len(self._validation_dl) returns len as num of microbatches val_len_in_micro_batches = len(self._validation_dl) From 593e6621cb89a681890dfdeaa88d25a724ddc5c0 Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Fri, 8 Mar 2024 11:26:28 -0800 Subject: [PATCH 005/140] Upgrade to PTL 2.1 and 2.2 (#8030) * Add the following changes for PTL 2.1 1) Remove LightningModuleWrapperBase around model as its not required with PTL 2.1 2) Make precision as None when using precision plugin in MegatronTrainerBuilder 3) Change dataloader_iter API for some megatron model Signed-off-by: Abhishree * Change dataloader_iter API and remove val_iterator_done 1) Change dataloader_iter API according to PTl 2.1 for bert and gpt model 2) Comment self._val_iterator_done for all megatron models Signed-off-by: Abhishree * Override format_checkpoint_nae and fix dataloader_iter API Signed-off-by: Abhishree * Update PTL version in requirements Signed-off-by: Abhishree * Add the following changes for PTL 2.1 1) Remove LightningModuleWrapperBase around model as its not required with PTL 2.1 2) Make precision as None when using precision plugin in MegatronTrainerBuilder 3) Change dataloader_iter API for some megatron model Signed-off-by: Abhishree * Change dataloader_iter API and remove val_iterator_done 1) Change dataloader_iter API according to PTl 2.1 for bert and gpt model 2) Comment self._val_iterator_done for all megatron models Signed-off-by: Abhishree * Override format_checkpoint_nae and fix dataloader_iter API Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove unused import and comment val_iterator_done Signed-off-by: Abhishree * Override _link_checkpoint Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Temporarily comment out CPU Unit tests Signed-off-by: Abhishree * Remove precision arg from Trainer in convert_hf_llama_to_nemo.py Signed-off-by: Abhishree * Fix dataloader_iter API for megatron_lm_encoder_decoder_model.py 
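For orientation while reading the many dataloader_iter hunks below, a simplified sketch of the recurring change (the function and argument names here are illustrative, not an actual NeMo method): at these call sites, next(dataloader_iter) now yields a (batch, batch_idx, dataloader_idx) triple rather than the batch alone, so the indices are unpacked and discarded.

```python
# Simplified illustration of the dataloader_iter unpacking pattern this patch
# applies across the megatron models; `model` here is any callable, not a
# specific NeMo class.
def fwd_output_and_loss_func(dataloader_iter, model):
    # Previously: batch = next(dataloader_iter)
    batch, _batch_idx, _dataloader_idx = next(dataloader_iter)
    output = model(batch)
    return output
```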
Signed-off-by: Abhishree * Temporarily disable NMT Training TP=2 test Signed-off-by: Abhishree * Fix val_step, test_step func API MegatronLMEncoderDecoderModel Signed-off-by: Abhishree * Enable NMT training TP=2 test Signed-off-by: Abhishree * Disable some unit tests Signed-off-by: Abhishree * Comment CI tests Signed-off-by: Abhishree * Comment resume part of BART Signed-off-by: Abhishree * Uncomment few lines from JenkinsFile Signed-off-by: Abhishree * Return len of dataloader in microbatches Signed-off-by: Abhishree * Fix _link_checkpoint 1) Add inject_model_parallel_rank to _link_checkpoint 2) Override super._link_checkpoint to remove condition check for rank 0 Signed-off-by: Abhishree * Check if using dist ckpt in _link_checkpoint Signed-off-by: Abhishree * Remove batch_idx arg from validation_step megatron_gpt_sft_model.py Signed-off-by: Abhishree * Use PTL bug fix branch Test unit tests with PTL bug fix https://github.com/Lightning-AI/pytorch-lightning/pull/19344/files Signed-off-by: Abhishree * Temporarily disable test_ema_saved_state in test_ema.py Signed-off-by: Abhishree * Skip test_beam_decoding_preserve_alignments in test_rnnt_decoding.py Signed-off-by: Abhishree * Use PTL with fs.lexists Signed-off-by: Abhishree * Comment _link_checkpoint related overrides In order to test with PTL without symbolic links Signed-off-by: Abhishree * Return only batch for dataloader_iter in DFT model Signed-off-by: Abhishree * Modify get_batch in GPTModel Signed-off-by: Abhishree * Add condition checks for batch extraction from dataloader_iter Signed-off-by: Abhishree * Add missing condition check for batch extraction in GPTModel Signed-off-by: Abhishree * Add condition check for dataloader_iter extraction in MegatronLMEncoderDecoder Signed-off-by: Abhishree * Comment test_invalid_checkpoints_removed_from_topk in test_exp_manager.py Signed-off-by: Abhishree * Fix test invalid ckpts in test_exp_manager.py Also uncomment some of the commented out tests in JenkinsFile and test_ema.py Signed-off-by: Abhishree * Fix bug in test_invalid_checkpoints_removed_from_topk Signed-off-by: Abhishree * Fix validation step of GPTModel for finetuning case with multi dataloaders Signed-off-by: Abhishree * Fix test_step_outputs for SFT in GPTMOdel Signed-off-by: Abhishree * Pass dataloader_idx for val_step of GPTModel and remove unwanted code 1) Pass dataloader_idx to val_step of GPTModel as its required for GPTSFTModel in case multi dataloaders to append the outputs correctly val/test_step_output 2) Remove val_iterator_done check from all megatron GPT models Signed-off-by: Abhishree * Add condition check for extraction of batch in T5SFTModel & LMEncoderDecoder Signed-off-by: Abhishree * Add condition check for extracting batch in MegatronNMTModel Also uncomment GPT PP=2 and NMT tests from JenkinsFIle Signed-off-by: Abhishree * Fix typo and uncomment multimodel tests Signed-off-by: Abhishree * Change to new dataloader_iter API for MultiModal Signed-off-by: Abhishree * Fix new dataloader_api for MegatronLatenDiffusion Model Signed-off-by: Abhishree * Store and restore precision value in MegatronGPTSFTModel Signed-off-by: Abhishree * Temporarily comment Multimodal Stable Diffusion Train Signed-off-by: Abhishree * Update JenkinsFile for multimodal with latest main Signed-off-by: Abhishree * Upgrade PTL to version 2.2 in reqs Signed-off-by: Abhishree * Install PTL 2.2 from fork Signed-off-by: Abhishree * Add strict arg to load_model_state_dict func in NLPDDPStrategy Signed-off-by: Abhishree * Delete 
megatron_t5_adapter_tuning.py, megatron_t5_ia3_tuning.py These files were added in the branch by mistake Signed-off-by: Abhishree * Delete megatron_t5_prompt_learning.py that got added by mistake Signed-off-by: Abhishree * Add appropriate comments, code clean up Signed-off-by: Abhishree * Remove PTL installation from JenkinsFile Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update PTL version to be >= 2.2.1 Signed-off-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> --------- Signed-off-by: Abhishree Signed-off-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- Jenkinsfile | 35 ++++++----- .../megatron_bart_pretraining.py | 3 + .../megatron_change_num_partitions.py | 3 + .../megatron_retro_fine_tune.py | 3 + .../megatron_retro_mutransfer_pretrain.py | 3 + .../megatron_retro_pretraining.py | 3 + .../megatron_t5_lm_adaptation_finetune.py | 3 + .../megatron_t5_seq2seq_eval.py | 3 + .../megatron_t5_seq2seq_finetune.py | 3 + .../tuning/megatron_gpt_finetuning.py | 5 ++ .../tuning/megatron_gpt_sft.py | 3 + .../megatron_nmt_training.py | 3 + .../models/multimodal_llm/neva/neva_model.py | 18 +++--- .../text_to_image/controlnet/controlnet.py | 8 +-- .../text_to_image/dreambooth/dreambooth.py | 12 ++-- .../models/text_to_image/imagen/imagen.py | 12 ++-- .../stable_diffusion/ldm/ddpm.py | 12 ++-- .../clip/megatron_clip_models.py | 14 ++--- .../language_modeling/megatron_bert_model.py | 21 ++++--- .../language_modeling/megatron_gpt_model.py | 49 ++++++++++----- .../megatron_gpt_prompt_learning_model.py | 22 +++---- .../megatron_gpt_sft_model.py | 40 ++++++------- .../megatron_lm_encoder_decoder_model.py | 60 +++++++++---------- .../megatron_retrieval_model.py | 2 +- .../megatron_t5_adapter_model.py | 8 +-- .../megatron_t5_sft_model.py | 30 +++++----- .../machine_translation/megatron_nmt_model.py | 25 ++++---- .../nlp/parts/megatron_trainer_builder.py | 1 + nemo/collections/nlp/parts/nlp_overrides.py | 10 ++-- .../megatron_vit_classification_models.py | 14 ++--- nemo/utils/exp_manager.py | 4 +- requirements/requirements_lightning.txt | 2 +- .../convert_hf_llama_to_nemo.py | 3 +- tests/core/test_exp_manager.py | 4 +- 34 files changed, 243 insertions(+), 198 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index ecd78365c787..cfd5853a6882 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -126,17 +126,17 @@ pipeline { } } - stage('L0: Unit Tests CPU') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - steps { - sh 'CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat' - } - } + stage('L0: Unit Tests CPU') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + steps { + sh 'CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat' + } + } stage('L2: Multimodal Imagen Train') { when { @@ -4082,7 +4082,6 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" } } - // @athitten Remove /home/TestData/nlp/megatron_sft/trec.jsonl for validation and test file until we have support for multiple dataloaders in lightning 2.0 stage('L2: Megatron GPT Finetuning PP=2') { when { anyOf { @@ -4114,13 +4113,13 @@ 
assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.data.train_ds.num_workers=0 \ model.data.test_ds.micro_batch_size=1 \ model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ model.data.test_ds.names=[quarel] \ model.data.validation_ds.micro_batch_size=1 \ model.data.validation_ds.global_batch_size=1 \ model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ + model.data.validation_ds.names=[quarel,trec]" sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ trainer.devices=2 \ trainer.log_every_n_steps=1 \ @@ -4143,13 +4142,13 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.data.train_ds.num_workers=0 \ model.data.test_ds.micro_batch_size=1 \ model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ model.data.test_ds.names=[quarel] \ model.data.validation_ds.micro_batch_size=1 \ model.data.validation_ds.global_batch_size=1 \ model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ + model.data.validation_ds.names=[quarel,trec]" sh "rm -rf examples/nlp/language_modeling/gpt_sft_results" } } diff --git a/examples/nlp/language_modeling/megatron_bart_pretraining.py b/examples/nlp/language_modeling/megatron_bart_pretraining.py index c2ba020a4a21..447c34426602 100644 --- a/examples/nlp/language_modeling/megatron_bart_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bart_pretraining.py @@ -60,6 +60,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(PipelineMixedPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/language_modeling/megatron_change_num_partitions.py b/examples/nlp/language_modeling/megatron_change_num_partitions.py index e135835292a3..436661e01b5d 100644 --- a/examples/nlp/language_modeling/megatron_change_num_partitions.py +++ b/examples/nlp/language_modeling/megatron_change_num_partitions.py @@ -935,6 +935,9 @@ def main(): plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + precision = None trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), 
accelerator="cpu", precision=precision) if tp_size < 0 or pp_size < 0: diff --git a/examples/nlp/language_modeling/megatron_retro_fine_tune.py b/examples/nlp/language_modeling/megatron_retro_fine_tune.py index aa7de6fda582..1577faa69a2b 100644 --- a/examples/nlp/language_modeling/megatron_retro_fine_tune.py +++ b/examples/nlp/language_modeling/megatron_retro_fine_tune.py @@ -99,6 +99,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(MixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py b/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py index 81a71650dc42..abe7006448e2 100644 --- a/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py +++ b/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py @@ -63,6 +63,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(MixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/language_modeling/megatron_retro_pretraining.py b/examples/nlp/language_modeling/megatron_retro_pretraining.py index c1393863da57..909260856eef 100644 --- a/examples/nlp/language_modeling/megatron_retro_pretraining.py +++ b/examples/nlp/language_modeling/megatron_retro_pretraining.py @@ -62,6 +62,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(MixedPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py b/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py index e2af0b89ac48..0777d1f40819 100644 --- a/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py @@ -61,6 +61,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index 4c11e10d99c5..ba8ea6492da3 100644 --- 
a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -94,6 +94,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(MixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index 3204ba2f6d76..13be61f5b1c5 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -174,6 +174,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py b/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py index aaa087a46623..1e6f680fad7e 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py @@ -56,7 +56,12 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + # cfg.trainer.precision becomes None in TrainerBuilder if precision_plugins exist since both precision plugins and precision + # can't exist in PTL >= 2.1, hence storing precision value from cfg.trainer.precision as its used for future steps like in merge_cfg_with func. + precision = cfg.trainer.precision trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() + # Restore the precision value after Trainer is built. 
+ cfg.trainer.precision = precision exp_manager(trainer, cfg.exp_manager) model_cfg = MegatronGPTSFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg) diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py b/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py index 44d0737ad44e..fbaacbb7bff4 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py @@ -199,6 +199,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/examples/nlp/machine_translation/megatron_nmt_training.py b/examples/nlp/machine_translation/megatron_nmt_training.py index 38b993479b3c..7946500f92e9 100644 --- a/examples/nlp/machine_translation/megatron_nmt_training.py +++ b/examples/nlp/machine_translation/megatron_nmt_training.py @@ -66,6 +66,9 @@ def main(cfg) -> None: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index f0137fd28722..44ab4785e8de 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -613,16 +613,16 @@ def forward(self, tokens, text_position_ids, attention_mask, labels, media=None) output_tensor = self.model(**forward_args) return output_tensor - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step=None): - return MegatronGPTModel.fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step) + def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): + return MegatronGPTModel.fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step) - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ We pass the dataloader iterator function to the micro-batch scheduler. The input batch to each micro-batch is fetched using the dataloader function in the micro-batch fwd function. 
""" - return MegatronGPTModel.training_step(self, dataloader_iter, batch_idx) + return MegatronGPTModel.training_step(self, dataloader_iter) def get_forward_output_and_loss_func(self, validation_step=False, tuning=False): def loss_func(output_tensor, loss_mask): @@ -634,7 +634,7 @@ def loss_func(output_tensor, loss_mask): return loss_for_ub, dict(avg=reduced_loss[0].unsqueeze(0)) def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) if parallel_state.get_pipeline_model_parallel_world_size() == 1: for k in batch.keys(): if self.get_attention_mask_from_fusion: @@ -690,7 +690,7 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ def get_forward_output_only_func(self): def fwd_output_only_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) extra_arg = {} ( tokens, @@ -744,8 +744,8 @@ def id_func(output_tensor): return fwd_output_only_func - def validation_step(self, dataloader_iter, batch_idx): - return MegatronGPTModel.validation_step(self, dataloader_iter, batch_idx) + def validation_step(self, dataloader_iter): + return MegatronGPTModel.validation_step(self, dataloader_iter) def on_validation_epoch_end(self): if not self.validation_step_outputs: @@ -775,7 +775,7 @@ def on_validation_epoch_start(self): pass def test_step(self, batch, batch_idx): - return self.validation_step(batch, batch_idx) + return self.validation_step(batch) def test_epoch_end(self, outputs): averaged_loss = average_losses_across_data_parallel_group(outputs) diff --git a/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py b/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py index 36329c3b7d0f..3f59eb66c81a 100644 --- a/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py +++ b/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py @@ -678,7 +678,7 @@ def on_train_batch_start(self, batch, batch_idx, dataloader_idx=0): batch[self.cfg.first_stage_key] = batch[self.cfg.first_stage_key].cuda(non_blocking=True) self.model.on_train_batch_start(batch, batch_idx) - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): tensor_shape = None # Placeholder # handle asynchronous grad reduction @@ -726,7 +726,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): return loss_mean, loss_dict - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -738,7 +738,7 @@ def training_step(self, dataloader_iter, batch_idx): # we zero grads here because we also call backward in the apex fwd/bwd functions self._optimizer.zero_grad() - loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, False) if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): self.allreduce_sequence_parallel_gradients() @@ -827,7 +827,7 @@ def process_batch(batch): return [x, *c_list] def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) batch = process_batch(batch) batch = [x.cuda(non_blocking=True) for x in batch] if 
len(self.conditioning_keys) == 0: diff --git a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py index 704f8b39371a..317cdf5d6364 100644 --- a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py +++ b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py @@ -241,7 +241,7 @@ def forward(self, batch): output_tensor = self.model(batch) return output_tensor - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): tensor_shape = None # Placeholder # handle asynchronous grad reduction @@ -290,7 +290,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): return loss_mean, loss_dict - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -303,7 +303,7 @@ def training_step(self, dataloader_iter, batch_idx): # we zero grads here because we also call backward in the apex fwd/bwd functions self._optimizer.zero_grad() - loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, False) torch.distributed.broadcast(loss_mean, get_last_rank()) @@ -344,8 +344,8 @@ def training_step(self, dataloader_iter, batch_idx): ) return loss_mean - def validation_step(self, dataloader_iter, batch_idx): - loss, val_loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, True) + def validation_step(self, dataloader_iter): + loss, val_loss_dict = self.fwd_bwd_step(dataloader_iter, True) self.log_dict(val_loss_dict, prog_bar=False, logger=True, on_step=False, on_epoch=True, batch_size=1) @@ -394,7 +394,7 @@ def process_batch(batch): return images, cond def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) batch = process_batch(batch) batch = [x.cuda(non_blocking=True) for x in batch] loss = model(batch) diff --git a/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py b/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py index 90487eac61dc..4fa6cd230e03 100644 --- a/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py +++ b/nemo/collections/multimodal/models/text_to_image/imagen/imagen.py @@ -248,7 +248,7 @@ def process_batch(batch): return [x_start, text_embed, text_mask, x_lowres] def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) batch = process_batch(batch) batch = [x.cuda(non_blocking=True) for x in batch] loss, loss_dict = model(*batch) @@ -326,7 +326,7 @@ def setup_test_data(self, cfg): self._test_ds, batch_size=self._micro_batch_size, num_workers=cfg.num_workers, pin_memory=True, ) - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): tensor_shape = None # handle asynchronous grad reduction @@ -377,7 +377,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): return loss_mean, loss_dict - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -390,7 +390,7 @@ def 
training_step(self, dataloader_iter, batch_idx): # we zero grads here because we also call backward in the megatron-core fwd/bwd functions self._optimizer.zero_grad() - loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, False) torch.distributed.broadcast(loss_mean, get_last_rank()) @@ -458,14 +458,14 @@ def _append_sequence_parallel_module_grads(self, module, grads): grad = param.grad grads.append(grad.data) - def validation_step(self, dataloader_iter, batch_idx): + def validation_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size from the dataloader to produce a list of microbatches. The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. """ - loss, val_loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, True) + loss, val_loss_dict = self.fwd_bwd_step(dataloader_iter, True) self.log_dict(val_loss_dict, prog_bar=False, logger=True, on_step=False, on_epoch=True, batch_size=1) return loss diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py index 36dfb74fbfaf..61bb664e43ed 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py @@ -1716,7 +1716,7 @@ def on_train_batch_start(self, batch, batch_idx, dataloader_idx=0): batch[self.cfg.first_stage_key] = batch[self.cfg.first_stage_key].cuda(non_blocking=True) self.model.on_train_batch_start(batch, batch_idx) - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): tensor_shape = None # Placeholder # handle asynchronous grad reduction @@ -1780,7 +1780,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): return loss_mean, loss_dict - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -1793,7 +1793,7 @@ def training_step(self, dataloader_iter, batch_idx): # we zero grads here because we also call backward in the megatron-core fwd/bwd functions self._optimizer.zero_grad() - loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_mean, loss_dict = self.fwd_bwd_step(dataloader_iter, False) # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): @@ -1902,7 +1902,7 @@ def process_batch(batch): return [x, *c_list] def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) batch = process_batch(batch) batch = [x.cuda(non_blocking=True) for x in batch] if len(self.conditioning_keys) == 0: @@ -1928,8 +1928,8 @@ def fwd_output_only_func(batch, model): return fwd_output_only_func - def validation_step(self, dataloader_iter, batch_idx): - loss, val_loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, True) + def validation_step(self, dataloader_iter): + loss, val_loss_dict = self.fwd_bwd_step(dataloader_iter, True) self.log_dict(val_loss_dict, prog_bar=False, logger=True, 
on_step=False, on_epoch=True, batch_size=1) diff --git a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py index ea325a4a2839..fe35ae148026 100644 --- a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py +++ b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py @@ -452,7 +452,7 @@ def forward(self, image, text): output_tensor = self.model(image, text) return output_tensor - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): # handle asynchronous grad reduction no_sync_func = None @@ -523,7 +523,7 @@ def initialize_ub_func(self): ) self.initialize_ub = False - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -557,7 +557,7 @@ def training_step(self, dataloader_iter, batch_idx): for param in module.embedding.parameters(): param.data_ptr() - loss_mean = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_mean = self.fwd_bwd_step(dataloader_iter, False) # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): @@ -649,7 +649,7 @@ def get_forward_output_and_loss_func(self): loss_func = ClipLoss(local_loss=self.cfg.local_loss, gather_with_grad=self.cfg.gather_with_grad,) def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) if parallel_state.get_pipeline_model_parallel_world_size() == 1: images = batch["images"].cuda(non_blocking=True) captions = batch["captions"].cuda(non_blocking=True) @@ -739,7 +739,7 @@ def accuracy(output, target, topk=(1,)): top5 = top5 / n return top1, top5 - def validation_step(self, dataloader_iter, batch_idx): + def validation_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -749,7 +749,7 @@ def validation_step(self, dataloader_iter, batch_idx): if self.initialize_ub: self.initialize_ub_func() - loss = self.fwd_bwd_step(dataloader_iter, batch_idx, True) + loss = self.fwd_bwd_step(dataloader_iter, True) self.validation_step_outputs.append(loss) return loss @@ -785,7 +785,7 @@ def on_validation_epoch_end(self): return averaged_loss def test_step(self, batch, batch_idx): - return self.validation_step(batch, batch_idx) + return self.validation_step(batch) def test_epoch_end(self, outputs): averaged_loss = average_losses_across_data_parallel_group(outputs) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index 49b64268e6b9..29e1d2656cdf 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -210,7 +210,7 @@ def _validate_trainer(self): def get_forward_output_and_loss_func(self): def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None): if parallel_state.get_pipeline_model_parallel_world_size() == 1: - batch = next(dataloader_iter) 
+ batch, batch_idx, dataloader_idx = next(dataloader_iter) tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = ( batch['text'].cuda(non_blocking=True), batch['types'].cuda(non_blocking=True), @@ -220,7 +220,7 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ batch['padding_mask'].cuda(non_blocking=True), ) else: - batch = next(dataloader_iter) + batch, batch_idx, dataloader_idx = next(dataloader_iter) if parallel_state.is_pipeline_first_stage(): tokens = batch['text'].cuda(non_blocking=True) types = batch['types'].cuda(non_blocking=True) @@ -238,6 +238,9 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ sentence_order = batch['is_random'].cuda(non_blocking=True) tokens, types, loss_mask, lm_labels = None, None, None, None + dataloader_iter._dataloader_idx = dataloader_idx + dataloader_iter._batch_idx = batch_idx + if not self.cfg.bert_binary_head: types = None @@ -309,7 +312,7 @@ def forward( return output_tensor - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): self._optimizer.zero_grad() @@ -391,7 +394,7 @@ def training_step(self, dataloader_iter, batch_idx): if loss_scale is not None: self.log('loss_scale', loss_scale, batch_size=1) - if (batch_idx + 1) % self.trainer.accumulate_grad_batches == 0: + if (dataloader_iter._batch_idx + 1) % self.trainer.accumulate_grad_batches == 0: # Reduced loss for logging. self.log('reduced_train_loss', loss_mean[0], prog_bar=True, batch_size=1) if len(loss_mean) > 2: @@ -497,11 +500,7 @@ def allreduce_first_last_embeddings(self): grad = word_embeddings_weight.grad torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group()) - def validation_step(self, dataloader_iter, batch_idx): - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return + def validation_step(self, dataloader_iter): prefix = "test" if self.trainer.testing else "val" if self.cfg.data.dataloader_type == "LDDL": seq_length = dataloader_iter.iterator.get_seqlen() @@ -542,8 +541,8 @@ def on_validation_epoch_end(self): self.log('val_loss', averaged_loss, prog_bar=True, batch_size=1) self.validation_step_outputs.clear() # free memory - def test_step(self, batch, batch_idx): - return self.validation_step(batch, batch_idx) + def test_step(self, dataloader_iter): + return self.validation_step(dataloader_iter) def on_test_epoch_end(self): averaged_loss = average_losses_across_data_parallel_group(self.test_step_outputs) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index a63dfc7c5ce4..ac35af38de64 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -27,6 +27,7 @@ from omegaconf.dictconfig import DictConfig from pkg_resources import packaging from pytorch_lightning.accelerators import CPUAccelerator +from pytorch_lightning.loops.fetchers import _DataFetcherWrapper from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.common.parts.utils import extend_instance @@ -550,7 +551,7 @@ def forward(self, tokens, text_position_ids, attention_mask, labels): output_tensor = self.model(tokens, text_position_ids, attention_mask, labels=labels) return output_tensor - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step=None): + def 
fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): # handle asynchronous grad reduction no_sync_func = None @@ -634,7 +635,7 @@ def initialize_ub_func(self): ) self.initialize_ub = False - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ We pass the dataloader iterator function to the micro-batch scheduler. The input batch to each micro-batch is fetched using the dataloader function @@ -673,7 +674,7 @@ def training_step(self, dataloader_iter, batch_idx): for param in module.embedding.parameters(): param.data_ptr() - loss_mean = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_mean = self.fwd_bwd_step(dataloader_iter, False) if self.cfg.get('fp8', False): self.prev_step_training = self.training @@ -925,7 +926,13 @@ def get_batch(self, data_iterator, tuning): # Broadcast data. if data_iterator is not None: - data = next(data_iterator) + # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple + # from the dataloader_iter are already extracted in the child class validation steps. In that case extract only the batch + # from the data_iterator + if isinstance(data_iterator, _DataFetcherWrapper): + data, _, _ = next(data_iterator) + else: + data = next(data_iterator) else: data = None @@ -1077,7 +1084,13 @@ def loss_func(output_tensor): def get_forward_output_only_func(self): def fwd_output_only_func(dataloader_iter, model): - batch = next(dataloader_iter) + # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple + # from the dataloader_iter are already extracted in the child class validation steps. In that case extract only the batch + # from the data_iterator + if isinstance(dataloader_iter, _DataFetcherWrapper): + batch, _, _ = next(dataloader_iter) + else: + batch = next(dataloader_iter) extra_arg = {} if len(batch) == 3: batch = [x.cuda() for x in batch] @@ -1127,17 +1140,13 @@ def id_func(output_tensor): return fwd_output_only_func - def validation_step(self, dataloader_iter, batch_idx): + def validation_step(self, dataloader_iter, dataloader_idx=0): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size from the dataloader to produce a list of microbatches. The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. """ - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return mode = 'test' if self.trainer.testing else 'val' # Initialize userbuffer communicators.
if self.initialize_ub: @@ -1153,12 +1162,24 @@ def validation_step(self, dataloader_iter, batch_idx): else: first_val_step = None - loss = self.fwd_bwd_step(dataloader_iter, batch_idx, True, first_val_step) + loss = self.fwd_bwd_step(dataloader_iter, True, first_val_step) if isinstance(self.model, list): for model_module in self.model: model_module.train() - self.validation_step_outputs.append(loss) if mode == 'val' else self.test_step_outputs.append(loss) + + if mode == 'val': + # Append with the correct dataloader_idx in case of multiple dataloaders + if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: + self.validation_step_outputs[dataloader_idx].append(loss) + else: + self.validation_step_outputs.append(loss) + else: + if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: + self.test_step_outputs[dataloader_idx].append(loss) + else: + self.test_step_outputs.append(loss) + return loss def on_validation_epoch_end(self): @@ -1194,8 +1215,8 @@ def on_validation_epoch_end(self): return averaged_loss - def test_step(self, batch, batch_idx): - return self.validation_step(batch, batch_idx) + def test_step(self, dataloader_iter): + return self.validation_step(dataloader_iter) def on_test_epoch_end(self): averaged_loss = average_losses_across_data_parallel_group(self.test_step_outputs) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index daa0c6dd02fa..617a585ef3a9 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -309,7 +309,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): The iterator of microbatches is then piped through the pipeline using Core's fwd/bwd functions. 
""" # Get seq length of batch - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) _, seq_length = batch[0].shape data_iter = get_iterator_k_split(batch, get_num_microbatches()) @@ -337,10 +337,10 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): return loss_mean - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): # we zero grads here because we also call backward in the megatron-core fwd/bwd functions self._optimizer.zero_grad() - batch = next(dataloader_iter) + batch, batch_idx, _ = next(dataloader_iter) loss_mean = self.fwd_bwd_step(itertools.chain([batch]), batch_idx, forward_only=False) self.allreduce_gradients() @@ -373,13 +373,9 @@ def optimizer_zero_grad(self, *args, **kwargs): """ return - def validation_step(self, dataloader_iter, batch_idx): - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return + def validation_step(self, dataloader_iter): mode = 'test' if self.trainer.testing else 'val' - batch = next(dataloader_iter) + batch, batch_idx, _ = next(dataloader_iter) gbs = self.cfg.get('validation_global_batch_size', self.cfg.global_batch_size) self._reconfigure_and_process_inference_batch(batch[0].size(0), gbs) loss_mean = self.fwd_bwd_step(itertools.chain([batch]), batch_idx, forward_only=True) @@ -503,8 +499,8 @@ def on_validation_epoch_end(self): self._reconfigure_batch_sizes(gbs, mbs) self.validation_step_outputs.clear() # free memory - def test_step(self, dataloader_iter, batch_idx): - return self.validation_step(dataloader_iter, batch_idx) + def test_step(self, dataloader_iter): + return self.validation_step(dataloader_iter) def on_test_epoch_end(self): averaged_loss = average_losses_across_data_parallel_group(self.test_step_outputs) @@ -661,7 +657,7 @@ def set_input_tensor(self, input_tensor): def get_forward_output_and_loss_func(self): def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) batch = [x.cuda(non_blocking=True) for x in batch] input_ids, labels, loss_mask, position_ids, attention_mask, taskname_ids = batch output_tensor = model(input_ids, position_ids, attention_mask, taskname_ids, labels, inference=False) @@ -684,7 +680,7 @@ def get_forward_output_only_func(self): """ def fwd_output_only_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) extra_arg = {} ( tokens, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 3777047780f2..331f977a3265 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -18,6 +18,7 @@ import torch from omegaconf import DictConfig, ListConfig +from pytorch_lightning.loops.fetchers import _DataFetcherWrapper from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.common.metrics import MetricStringToTorchMetric @@ -184,13 +185,6 @@ def setup(self, stage=None): if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: self._test_dl = self.setup_eval_dataloader(self._test_ds, self.cfg.data.test_ds) - # Raise error if using multiple dataloaders - if type(self._validation_dl) == list and len(self._validation_dl) > 1: - raise NotImplementedError('Lightning 2.0 does not support multiple dataloaders 
with dataloader_iter') - - if type(self._test_dl) == list and len(self._test_dl) > 1: - raise NotImplementedError('Lightning 2.0 does not support multiple dataloaders with dataloader_iter') - # when using pipeline model parallel the final stage need to initialize word embeddings self.initialize_last_rank_embeddings() @@ -327,8 +321,13 @@ def _determine_log_key(self, data_config, dataloader_idx, metric_name, mode): else: return base_key + f"dataloader{dataloader_idx}" - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step=None): - batch = next(dataloader_iter) + def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): + # Return only batch if batch, batch_idx, dataloader_idx are extracted as a tuple in the previous func + # call like validation_step otherwise return tuple (in which case dataloader_iter is still a PTL _DataFetcherWrapper object) + if isinstance(dataloader_iter, _DataFetcherWrapper): + batch, _, _ = next(dataloader_iter) + else: + batch = next(dataloader_iter) log_token_counts = self.cfg.get('log_token_counts', False) if log_token_counts: @@ -399,24 +398,21 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step= return loss_mean - def validation_step(self, dataloader_iter, batch_idx, dataloader_idx=0): - return self.inference_step(dataloader_iter, batch_idx, 'validation', dataloader_idx) + def validation_step(self, dataloader_iter): + return self.inference_step(dataloader_iter, 'validation') - def test_step(self, dataloader_iter, batch_idx, dataloader_idx=0): - # Add try except since dataloader_iter in PTL 2.0 doesnt catch the end of iterables - return self.inference_step(dataloader_iter, batch_idx, 'test', dataloader_idx) + def test_step(self, dataloader_iter): + return self.inference_step(dataloader_iter, 'test') - def inference_step(self, dataloader_iter, batch_idx, mode, dataloader_idx=0): - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return - batch = next(dataloader_iter) + def inference_step(self, dataloader_iter, mode): + batch, batch_idx, dataloader_idx = next(dataloader_iter) data_cfg = self.cfg.data.validation_ds if mode == 'validation' else self.cfg.data.test_ds self._reconfigure_and_process_inference_batch(batch, data_cfg) # Meta data from dataset metadata = batch.get('metadata', [{}] * len(batch['tokens'])) - loss = super().validation_step(itertools.chain([batch]), batch_idx) + # Pass dataloader_idx, as it's needed in val_step of GPTModel to append the loss correctly to self.val/test_step_outputs + # in case of multi dataloaders + loss = super().validation_step(itertools.chain([batch]), dataloader_idx) if data_cfg.get("write_predictions_to_file", False) or data_cfg.metric.name != 'loss': # We need _inference_config to get generation params @@ -460,7 +456,7 @@ def inference_step(self, dataloader_iter, batch_idx, mode, dataloader_idx=0): def inference_epoch_end(self, outputs, mode, data_cfg): # Parent class will handle logging of the loss.
- if not outputs: + if not outputs or not outputs[0]: return if isinstance(outputs[0], dict): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 384acd599e40..38c887304f7a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -21,6 +21,7 @@ from omegaconf import OmegaConf, open_dict from omegaconf.dictconfig import DictConfig from pytorch_lightning.accelerators import CPUAccelerator +from pytorch_lightning.loops.fetchers import _DataFetcherWrapper from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import ( @@ -338,7 +339,7 @@ def _execute_fwd_bwd_function(self, data_iterator, forward_only, tensor_shape, d return mean_loss_dict - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): """ Dataloader produces a global batch which is turned into a list of microbatches. The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. @@ -353,7 +354,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): decoder_seq_length=self.max_decoder_seq_length, ) - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -365,7 +366,7 @@ def training_step(self, dataloader_iter, batch_idx): # we zero grads here because we also call backward in the megatron fwd/bwd functions self._optimizer.zero_grad() - loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_dict = self.fwd_bwd_step(dataloader_iter, False) if self.with_distributed_adam: # synchronize asynchronous grad reductions @@ -566,7 +567,13 @@ def _process_batch(self, global_batch: Dict[str, torch.Tensor]) -> List[torch.Te def get_forward_output_and_loss_func(self): def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple + # from the dataloader_iter are already extracted in the child class or previous functions. In that case extract only the batch + # from the data_iterator + if isinstance(dataloader_iter, _DataFetcherWrapper): + batch, _, _ = next(dataloader_iter) + else: + batch = next(dataloader_iter) # convert to list if not already converted. if isinstance(batch, dict): # convert to list if not already converted.
@@ -679,7 +686,11 @@ def _get_forward_output_only_func(self, arg_names, output_name, **kwargs): """ def fwd_output_only_func(dataloader_iter, model): - batch = next(dataloader_iter) + # Extract batch, batch_idx, dataloader_idx only if dataloader_iter is an object of PTL's _DataFetcherWrapper + if isinstance(dataloader_iter, _DataFetcherWrapper): + batch, _, _ = next(dataloader_iter) + else: + batch = next(dataloader_iter) batch = [x.cuda(non_blocking=True) if torch.is_tensor(x) else x for x in batch] # map batch and shared args into forward args @@ -699,48 +710,31 @@ def id_func(output_tensor): ########## - def _test_validation_step(self, step_outputs, dataloader_iter, batch_idx, dataloader_idx=0): + def _test_validation_step(self, dataloader_iter): """ Shared code for validation and test step """ - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return - loss_dict = self.fwd_bwd_step(dataloader_iter, batch_idx, True) - step_outputs.append(loss_dict) + loss_dict = self.fwd_bwd_step(dataloader_iter, True) return loss_dict - def validation_step(self, dataloader_iter, batch_idx, dataloader_idx=0): + def validation_step(self, dataloader_iter): """ return_values - if given, returns a dictionary with given keys and corresponding values """ + outputs = self._test_validation_step(dataloader_iter=dataloader_iter) if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - step_outputs = self.validation_step_outputs[dataloader_idx] + self.validation_step_outputs[dataloader_iter.dataloader_idx].append(outputs) else: - step_outputs = self.validation_step_outputs - - return self._test_validation_step( - step_outputs=step_outputs, - dataloader_iter=dataloader_iter, - batch_idx=batch_idx, - dataloader_idx=dataloader_idx, - ) + self.validation_step_outputs.append(outputs) - def test_step(self, dataloader_iter, batch_idx, dataloader_idx=0): - if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - step_outputs = self.test_step_outputs[dataloader_idx] + def test_step(self, dataloader_iter): + outputs = self._test_validation_step(dataloader_iter=dataloader_iter) + if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: + self.test_step_outputs[dataloader_iter.dataloader_idx].append(outputs) else: - step_outputs = self.test_step_outputs - - return self._test_validation_step( - step_outputs=step_outputs, - dataloader_iter=dataloader_iter, - batch_idx=batch_idx, - dataloader_idx=dataloader_idx, - ) + self.test_step_outputs.append(outputs) def _test_validation_epoch_end(self, step_outputs, prefix): """ diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py index d10c9f27f6cb..ebe936a8178a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py @@ -325,7 +325,7 @@ def validation_step(self, batch, batch_idx): if prefix == 'val': self.validation_step_outputs.append(reduced_loss) else: - self.test_step_outputs.apped(reduced_loss) + self.test_step_outputs.append(reduced_loss) return reduced_loss def on_validation_epoch_end(self): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py index d1332831ef1d..31eb4519ded2 100644 --- 
a/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py @@ -146,12 +146,8 @@ def compute_accuracy(self, enc_input, enc_mask, encoder_input, labels): 'enc_inputs': processed_inputs, } - def validation_step(self, dataloader_iter, batch_idx, inference=False): - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return - batch = next(dataloader_iter) + def validation_step(self, dataloader_iter): + batch, batch_idx, _ = next(dataloader_iter) enc_input, dec_input, labels, loss_mask, enc_mask, dec_mask, position_ids, taskname_ids = batch mode = self.training diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py index 22483731a534..0b32530668be 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py @@ -17,6 +17,7 @@ import torch from omegaconf import DictConfig, ListConfig +from pytorch_lightning.loops.fetchers import _DataFetcherWrapper from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.common.data import ConcatMapDataset @@ -26,7 +27,6 @@ from nemo.collections.nlp.data.language_modeling.megatron.t5_sft_dataset import T5SFTDataset from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model, T5Sentinel from nemo.collections.nlp.modules.common.megatron.utils import get_iterator_k_split - from nemo.collections.nlp.parts.mixins.nlp_adapter_mixins import NLPAdapterModelMixin from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.utils import AppState, logging @@ -288,12 +288,18 @@ def _reconfigure_and_process_inference_batch(self, batch, ds_config): data_parallel_size=parallel_state.get_data_parallel_world_size(), ) - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): """ Dataloader produces a global batch which is turned into a list of microbatches. The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. """ - batch = next(dataloader_iter) + # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple + # from the dataloader_iter are already extracted in the child class. In that case extract only the batch + # from the data_iterator + if isinstance(dataloader_iter, _DataFetcherWrapper): + batch, _, _ = next(dataloader_iter) + else: + batch = next(dataloader_iter) if isinstance(batch, dict): # convert to list if not already converted. batch = self._process_batch(batch) @@ -312,14 +318,10 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): decoder_seq_length=decoder_seq_length, ) - def inference_step(self, dataloader_iter, batch_idx: int, mode: str, dataloader_idx=0): - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return + def inference_step(self, dataloader_iter, mode: str): # Regular finetuning datasets will return a list of dicts for each microbatch. # But T0 datasets will return a single dict for the global batch.
- batch = next(dataloader_iter) + batch, batch_idx, dataloader_idx = next(dataloader_iter) batch_has_lang_information = isinstance(batch, list) and len(batch[0]) == 7 data_cfg = self.cfg.data.validation_ds if mode == 'validation' else self.cfg.data.test_ds @@ -327,7 +329,7 @@ def inference_step(self, dataloader_iter, batch_idx: int, mode: str, dataloader_ # NOTE: There could be extra keys in the processed_batch dictionary such as "langs" for XNLI, # this will be ignored. - loss = self.fwd_bwd_step(itertools.chain([batch]), batch_idx, forward_only=True) + loss = self.fwd_bwd_step(itertools.chain([batch]), forward_only=True) predicted_token_ids, _ = self.decode( tokens_enc=batch['text_enc'], @@ -589,16 +591,16 @@ def write_predictions_to_file(self, outputs, output_file_path_prefix): for i, p, l in zip(outputs['inputs'], outputs['preds'], outputs['labels']): f_json.write(json.dumps({'input': i, 'pred': p, 'label': l}) + '\n') - def validation_step(self, dataloader_iter, batch_idx, dataloader_idx=0): - return self.inference_step(dataloader_iter, batch_idx, 'validation', dataloader_idx) + def validation_step(self, dataloader_iter): + return self.inference_step(dataloader_iter, 'validation') def on_validation_epoch_end(self): _ = self.inference_epoch_end(self.validation_step_outputs, 'validation', self.cfg.data.validation_ds) # Commenting as on_validation_epoch_end was a no-op in PTL 1.9 # return super().on_validation_epoch_end() - def test_step(self, dataloader_iter, batch_idx, dataloader_idx=0): - return self.inference_step(dataloader_iter, batch_idx, 'test', dataloader_idx) + def test_step(self, dataloader_iter): + return self.inference_step(dataloader_iter, 'test') def on_test_epoch_end(self): _ = self.inference_epoch_end(self.test_step_outputs, 'test', self.cfg.data.test_ds) diff --git a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py index 5deac0c43e67..952c76ce929e 100644 --- a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py +++ b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py @@ -21,6 +21,7 @@ import torch from omegaconf.dictconfig import DictConfig from omegaconf.listconfig import ListConfig +from pytorch_lightning.loops.fetchers import _DataFetcherWrapper from pytorch_lightning.trainer.trainer import Trainer from sacrebleu import corpus_bleu @@ -286,12 +287,18 @@ def _build_vocab(self): tensor_model_parallel_size=self._cfg.get('tensor_model_parallel_size', 1), ) - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): """ Dataloader produces a global batch which is turned into a list of microbatches. The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. """ - batch = next(dataloader_iter) + # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple + # from the dataloader_iter are already extracted in the child class or previous functions. In that case extract only the batch + # from the data_iterator + if isinstance(dataloader_iter, _DataFetcherWrapper): + batch, _, _ = next(dataloader_iter) + else: + batch = next(dataloader_iter) if isinstance(batch, dict): # convert to list if not already converted.
batch = self._process_batch(batch) @@ -310,13 +317,9 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): decoder_seq_length=decoder_seq_length, ) - def eval_step(self, dataloader_iter, batch_idx, dataloader_idx=0): - # Check if iterator is exhausted - dataloader_iter, done = self._val_iterator_done(dataloader_iter) - if done: - return + def eval_step(self, dataloader_iter): # Need to squeze dim 0 for old NMT datasets since things are pre-batched and we ask the dataloader for batch size 1. - batch = next(dataloader_iter) + batch, _, dataloader_idx = next(dataloader_iter) batch = [x.squeeze(dim=0) if x.ndim == 3 else x for x in batch] batch = self.process_global_batch_for_text_translation_datasets(batch) @@ -330,7 +333,7 @@ def eval_step(self, dataloader_iter, batch_idx, dataloader_idx=0): data_parallel_size=parallel_state.get_data_parallel_world_size(), ) # This returns the averaged loss across data-parallel groups. - reduced_loss = self.fwd_bwd_step(itertools.chain([batch]), batch_idx, True) + reduced_loss = self.fwd_bwd_step(itertools.chain([batch]), True) tokens_enc, labels, enc_mask = batch['text_enc'], batch['labels'], batch['enc_mask'] @@ -400,12 +403,12 @@ def postprocess_outputs(self, outputs, tokenizer, processor): return results - def validation_step(self, dataloader_iter, batch_idx, dataloader_idx=0): + def validation_step(self, dataloader_iter): """ Lightning calls this inside the validation loop with the data from the validation dataloader passed in as `batch`. """ - return self.eval_step(dataloader_iter, batch_idx, dataloader_idx) + return self.eval_step(dataloader_iter) def _setup_eval_dataloader_from_config(self, cfg: DictConfig, dataset): rank = parallel_state.get_data_parallel_rank() diff --git a/nemo/collections/nlp/parts/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py index 0cd1563b2849..055671219fb8 100644 --- a/nemo/collections/nlp/parts/megatron_trainer_builder.py +++ b/nemo/collections/nlp/parts/megatron_trainer_builder.py @@ -114,6 +114,7 @@ def _plugins(self) -> list: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + self.cfg.trainer.precision = None if self.cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 48fe9034ad25..66fa99ffefd1 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -33,7 +33,6 @@ from pytorch_lightning.callbacks.progress.tqdm_progress import _update_n from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.loops.fetchers import _DataFetcher -from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.plugins import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import MixedPrecisionPlugin @@ -219,7 +218,7 @@ def configure_ddp(self): hasattr(self.model, 'with_distributed_adam') and self.model.with_distributed_adam ): # do not use DDP if using megatron amp O2 or distributed optimizer - self._model = _LightningModuleWrapperBase(self.model) + self._model = self.model else: app_state = AppState() @@ -236,7 +235,7 @@ def configure_ddp(self): # self.pre_configure_ddp() # device_ids = 
self.determine_ddp_device_ids() self._model = DistributedDataParallel( - _LightningModuleWrapperBase(self.model), + self.model, process_group=parallel_state.get_data_parallel_group(with_context_parallel=True), **self._ddp_kwargs, ) @@ -360,7 +359,8 @@ def save_checkpoint( if self.is_global_zero or app_state.data_parallel_rank == 0: self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options) - def load_model_state_dict(self, checkpoint: Mapping[str, Any]) -> None: + # PTL 2.2 supports non strict loading of the ckpt with the strict arg (https://github.com/Lightning-AI/pytorch-lightning/pull/19404) + def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = True) -> None: # if using distributed checkpointing, the state dict logic is at the model level if ( hasattr(self.lightning_module, 'sharded_state_dict') @@ -390,7 +390,7 @@ def load_model_state_dict(self, checkpoint: Mapping[str, Any]) -> None: new_state_dict[new_key] = checkpoint['state_dict'][key] checkpoint['state_dict'] = new_state_dict - self.lightning_module.load_state_dict(checkpoint["state_dict"]) + self.lightning_module.load_state_dict(checkpoint["state_dict"], strict=strict) def _fix_tensors_device(self, ckpt: Dict) -> Dict: """ Ensure checkpoint tensors are on the correct device.""" diff --git a/nemo/collections/vision/models/megatron_vit_classification_models.py b/nemo/collections/vision/models/megatron_vit_classification_models.py index 2ced8c8ecc08..c27c37c2b917 100644 --- a/nemo/collections/vision/models/megatron_vit_classification_models.py +++ b/nemo/collections/vision/models/megatron_vit_classification_models.py @@ -275,7 +275,7 @@ def forward(self, tokens): output_tensor = self.model(tokens) return output_tensor - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): + def fwd_bwd_step(self, dataloader_iter, forward_only): # handle asynchronous grad reduction no_sync_func = None @@ -351,7 +351,7 @@ def initialize_ub_func(self): ) self.initialize_ub = False - def training_step(self, dataloader_iter, batch_idx): + def training_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -367,7 +367,7 @@ def training_step(self, dataloader_iter, batch_idx): # we zero grads here because we also call backward in the megatron-core fwd/bwd functions self._optimizer.zero_grad() - loss_mean, _ = self.fwd_bwd_step(dataloader_iter, batch_idx, False) + loss_mean, _ = self.fwd_bwd_step(dataloader_iter, False) # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): @@ -477,7 +477,7 @@ def loss_func(labels, output_tensor): return loss, {"loss": averaged_loss[0], "accuracy": averaged_loss[1]} def fwd_output_and_loss_func(dataloader_iter, model): - batch = next(dataloader_iter) + batch, _, _ = next(dataloader_iter) if parallel_state.get_pipeline_model_parallel_world_size() == 1: batch = [x.cuda(non_blocking=True) for x in batch] tokens, labels = batch @@ -506,7 +506,7 @@ def fwd_output_only_func(batch, model): return fwd_output_only_func - def validation_step(self, dataloader_iter, batch_idx): + def validation_step(self, dataloader_iter): """ Our dataloaders produce a micro-batch and then we fetch a number of microbatches depending on the global batch size and model parallel size @@ -519,7 +519,7 @@ def validation_step(self, 
dataloader_iter, batch_idx): if self.initialize_ub: self.initialize_ub_func() - loss, accuracy = self.fwd_bwd_step(dataloader_iter, batch_idx, True) + loss, accuracy = self.fwd_bwd_step(dataloader_iter, True) self.validation_step_outputs.append((loss, accuracy)) if mode == 'val' else self.test_step_outputs.append( (loss, accuracy) @@ -554,7 +554,7 @@ def on_validation_epoch_end(self): return averaged_loss def test_step(self, batch, batch_idx): - return self.validation_step(batch, batch_idx) + return self.validation_step(batch) def on_test_epoch_end(self): pass diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index db45701385e8..7f915b82c820 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -1034,10 +1034,10 @@ class SkipResumeTrainingValidationLoop(_TrainingEpochLoop): the training state before validation has run. """ - def _should_check_val_fx(self) -> bool: + def _should_check_val_fx(self, data_fetcher) -> bool: if self.restarting and self.global_step % self.trainer.val_check_batch == 0: return False - return super()._should_check_val_fx() + return super()._should_check_val_fx(data_fetcher) def clean_exp_ckpt(exp_log_dir: Union[str, Path], remove_ckpt: bool = True, remove_nemo: bool = False): diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 7adea60957fe..ee9423b9115c 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -1,6 +1,6 @@ hydra-core>1.3,<=1.3.2 omegaconf<=2.3 -pytorch-lightning>=2.0,<=2.0.7 +pytorch-lightning>=2.2.1 torchmetrics>=0.11.0 transformers>=4.36.0 wandb diff --git a/scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py index 597d6f2ccc74..e50f7fa71f2d 100644 --- a/scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py @@ -133,7 +133,8 @@ def convert(args): nemo_config.precision = precision print(f"nemo_config: {nemo_config}") - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + # Remove precision arg, since with PTL >= 2.1 both precision and precision plugin cannot exist together. 
+ trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) hidden_size = hf_config["hidden_size"] head_num = hf_config["num_attention_heads"] diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py index 8073a75e14ca..8883d6514119 100644 --- a/tests/core/test_exp_manager.py +++ b/tests/core/test_exp_manager.py @@ -946,7 +946,9 @@ def test_invalid_checkpoints_removed_from_topk(self, tmp_path): test_trainer2.fit(model) ckpt_filenames = {f.name for f in checkpoints_dir.rglob("*.ckpt") if f.is_file()} - assert len(ckpt_filenames) == 4 # 3 top + 1 last + # 3 top + 1 last + 1 resume ckpt since PTL >= 2.1 ensures to never delete the resume ckpt + # (https://github.com/Lightning-AI/pytorch-lightning/pull/18750) + assert len(ckpt_filenames) == 5 assert 'epoch=9-last.ckpt' in ckpt_filenames assert 'epoch=8.ckpt' in ckpt_filenames assert 'epoch=7.ckpt' in ckpt_filenames From 6be016c450bd680dc32d2d5179aea2dc56471c47 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Fri, 8 Mar 2024 11:30:23 -0800 Subject: [PATCH 006/140] Update docs for NeMo Framework (#8596) * Update docs version Signed-off-by: smajumdar * Update docs for NeMo Framework Signed-off-by: smajumdar * Update docs for NeMo Framework Signed-off-by: smajumdar --------- Signed-off-by: smajumdar --- docs/source/_static/css/custom.css | 8 ++- docs/source/conf.py | 6 +- docs/source/index.rst | 73 +++++++++++++---------- docs/source/starthere/intro.rst | 8 --- docs/source/starthere/migration-guide.rst | 4 +- docs/source/vision/intro.rst | 4 +- requirements/requirements_docs.txt | 10 ++-- 7 files changed, 60 insertions(+), 53 deletions(-) diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css index cf0ad0ff2d7f..2dae2661b353 100644 --- a/docs/source/_static/css/custom.css +++ b/docs/source/_static/css/custom.css @@ -1,3 +1,5 @@ +@import url("theme.css"); + body { font-size: 100%; font-family: 'NVIDIA Sans', sans-serif; @@ -40,13 +42,17 @@ p { } /* Link Colors */ +/* a { - color: #76b900; + color: #76b900; } +/* +/* a:visited { color: #218219; } +*/ .container-xl { margin-right: unset; diff --git a/docs/source/conf.py b/docs/source/conf.py index 0596b15e3de5..6d086cb42e9f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -203,6 +203,10 @@ # html_logo = html_theme_options["logo_path"] +# html_sidebars = { +# "**": ["navbar-logo.html", "search-field.html", "sbt-sidebar-nav.html"] +# } + # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. @@ -222,7 +226,7 @@ html_title = 'NVIDIA NeMo' html_theme_options = { - 'logo_only': True, + 'logo_only': False, 'display_version': True, # 'prev_next_buttons_location': 'bottom', # 'style_external_links': False, diff --git a/docs/source/index.rst b/docs/source/index.rst index 9d66d693000e..822431a9108a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,6 +1,12 @@ NVIDIA NeMo Framework Developer Docs ==================================== +.. include:: starthere/intro.rst + + +Index of NeMo Framework Developer Docs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + .. toctree:: :maxdepth: 2 :caption: Getting Started @@ -9,18 +15,19 @@ NVIDIA NeMo Framework Developer Docs starthere/intro starthere/tutorials starthere/best-practices - starthere/migration-guide + .. 
toctree:: - :maxdepth: 3 - :caption: Multimodal (MM) - :name: Multimodal + :maxdepth: 2 + :caption: NeMo Core + :name: core - multimodal/mllm/intro - multimodal/vlm/intro - multimodal/text2img/intro - multimodal/nerf/intro - multimodal/api + core/core + core/exp_manager + core/neural_types + core/export + core/adapters/intro + core/api .. toctree:: @@ -34,10 +41,11 @@ NVIDIA NeMo Framework Developer Docs nlp/megatron_onnx_export nlp/api + .. toctree:: :maxdepth: 2 - :caption: Speech Processing - :name: Speech Processing + :caption: Speech AI + :name: Speech AI asr/intro asr/speech_classification/intro @@ -46,6 +54,19 @@ NVIDIA NeMo Framework Developer Docs asr/ssl/intro asr/speech_intent_slot/intro + +.. toctree:: + :maxdepth: 3 + :caption: Multimodal (MM) + :name: Multimodal + + multimodal/mllm/intro + multimodal/vlm/intro + multimodal/text2img/intro + multimodal/nerf/intro + multimodal/api + + .. toctree:: :maxdepth: 1 :caption: Text To Speech (TTS) @@ -55,37 +76,16 @@ NVIDIA NeMo Framework Developer Docs .. toctree:: :maxdepth: 2 - :caption: Vision + :caption: Vision (CV) :name: vision vision/intro - -.. toctree:: - :maxdepth: 2 - :caption: NeMo Core - :name: core - - core/core - core/exp_manager - core/neural_types - core/export - core/adapters/intro - core/api - .. toctree:: :maxdepth: 2 :caption: Common :name: Common - text_processing/intro - -.. toctree:: - :maxdepth: 2 - :caption: Text Processing - :name: Text Processing - - text_processing/g2p/g2p common/intro @@ -95,3 +95,10 @@ NVIDIA NeMo Framework Developer Docs :name: Speech Tools tools/intro + +.. toctree:: + :maxdepth: 2 + :caption: Upgrade Guide + :name: Upgrade Guide + + starthere/migration-guide \ No newline at end of file diff --git a/docs/source/starthere/intro.rst b/docs/source/starthere/intro.rst index 185350bad3ab..77a1ca0255a1 100644 --- a/docs/source/starthere/intro.rst +++ b/docs/source/starthere/intro.rst @@ -98,14 +98,6 @@ See the two introductory videos below for a high level overview of NeMo. -**NVIDIA NeMo: Toolkit for Conversational AI at PyData Yerevan 2022** - -.. raw:: html - -
- -
- .. _installation: Installation diff --git a/docs/source/starthere/migration-guide.rst b/docs/source/starthere/migration-guide.rst index 15b4940172c3..1d9816493a5b 100644 --- a/docs/source/starthere/migration-guide.rst +++ b/docs/source/starthere/migration-guide.rst @@ -1,5 +1,5 @@ -Migration guide to use lightning 2.0 -===================================== +Upgrade guide to use lightning 2.0 +================================== .. # define a hard line break for html .. |br| raw:: html diff --git a/docs/source/vision/intro.rst b/docs/source/vision/intro.rst index 6df5881e1121..4f4462404b90 100644 --- a/docs/source/vision/intro.rst +++ b/docs/source/vision/intro.rst @@ -1,5 +1,5 @@ -Foundation Vision Models in NeMo -================================ +Vision Models +============= NeMo has implemented foundational vision models, establishing a solid base for further exploration into multimodal applications. These foundational vision models can be leveraged in a variety of multimodal applications including multimodal language models and text to image generation tasks, among others. These foundation models not only lay the functional groundwork but also play a crucial role in achieving state-of-the-art performance on NVIDIA GPUs through our custom optimizations. diff --git a/requirements/requirements_docs.txt b/requirements/requirements_docs.txt index 8412c67d4ab2..ff3ec5202b0e 100644 --- a/requirements/requirements_docs.txt +++ b/requirements/requirements_docs.txt @@ -1,14 +1,12 @@ boto3 -Jinja2<3.1 +Jinja2 latexcodec numpy -# sphinx-book-theme is incompatible with pydata-sphinx-theme>0.13.2 -# https://github.com/executablebooks/sphinx-book-theme/issues/711 -pydata-sphinx-theme==0.13.1 -Sphinx>=4.0,<6,!=5.0.0 +pydata-sphinx-theme +Sphinx sphinx-book-theme sphinx-copybutton sphinxcontrib-bibtex sphinxext-opengraph -urllib3<2.0.0 +urllib3 wrapt From 5a3450dbdf65654fee1310dbcc205347537b3143 Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Fri, 8 Mar 2024 15:28:50 -0500 Subject: [PATCH 007/140] Update results.rst for Canary Inference (#8562) * Update results.rst for Canary Inference Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Update results.rst for Canary Inference Signed-off-by: Krishna Puvvada --------- Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada --- docs/source/asr/results.rst | 51 +++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/docs/source/asr/results.rst b/docs/source/asr/results.rst index b38a661a0ea9..05f91dde88ae 100644 --- a/docs/source/asr/results.rst +++ b/docs/source/asr/results.rst @@ -133,10 +133,56 @@ Often times, we want to transcribe a large number of files at once (maybe from a # process a batch of 32 results (or less if last batch does not contain 32 elements) .... +For more information, see `nemo.collections.asr.modules <./api.html#modules>`__. For more information on the general ``Transcription API``, please take a look at :class:`~nemo.collections.asr.parts.mixins.transcription.TranscriptionMixin`. The audio files should be 16KHz mono-channel wav files. ----- -For more information, see `nemo.collections.asr.modules <./api.html#modules>`__. For more information on the general ``Transcription API``, please take a look at :class:`~nemo.collections.asr.parts.mixins.transcription.TranscriptionMixin`. 
The audio files should be 16KHz mono-channel wav files.
+Inference with Multi-task Models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Multi-task models that use structured prompts require additional task tokens as input, in which case it is recommended to use a manifest as input. Below is an example of using the `nvidia/canary-1b` model:
+
+.. code-block:: python
+    from nemo.collections.asr.models import EncDecMultiTaskModel
+
+    # load model
+    canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
+
+    # update decode params
+    decode_cfg = canary_model.cfg.decoding
+    decode_cfg.beam.beam_size = 1
+    canary_model.change_decoding_strategy(decode_cfg)
+
+    # run transcribe
+    predicted_text = canary_model.transcribe(
+        "",
+        batch_size=16, # batch size to run the inference with
+    )
+
+Here the manifest file should be a json file where each line has the following format:
+
+.. code-block:: bash
+    {
+        "audio_filepath": "/path/to/audio.wav", # path to the audio file
+        "duration": None, # duration of the audio in seconds, set to `None` to use full audio
+        "taskname": "asr", # use "ast" for speech-to-text translation
+        "source_lang": "en", # language of the audio input, set `source_lang`==`target_lang` for ASR
+        "target_lang": "en", # language of the text output
+        "pnc": "yes", # whether to have PnC output, choices=['yes', 'no']
+        "answer": "na", # set to non-dummy strings to calculate WER/BLEU scores
+    }
+
+Note that using a manifest allows specifying the task configuration for each audio file individually. If we want to use the same task configuration for all the audio files, it can be specified in the `transcribe` method directly.
+
+.. code-block:: python
+    canary_model.transcribe(
+        audio=[list of audio files],
+        batch_size=4, # batch size to run the inference with
+        task="asr", # use "ast" for speech-to-text translation
+        source_lang="en", # language of the audio input, set `source_lang`==`target_lang` for ASR
+        target_lang="en", # language of the text output
+        pnc=True, # whether to have PnC output, choices=[True, False]
+    )

 Inference on long audio
 ^^^^^^^^^^^^^^^^^^^^^^^
@@ -180,6 +226,7 @@ Sometimes, the downsampling module at the earliest stage of the model can take m

     # Speedup conv subsampling factor to speed up the subsampling module.
     asr_model.change_subsampling_conv_chunking_factor(1) # 1 = auto select
+
 .. note::
     Only certain models which use depthwise separable convolutions in the downsampling layer support this operation.
     Please try it out on your model and see if it is supported.
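As an illustration of the note above, a minimal end-to-end sketch (not part of this patch; the checkpoint name `stt_en_conformer_ctc_large` is only an assumed example of a model whose downsampling layer supports this operation):

```python
from nemo.collections.asr.models import ASRModel

# Load an ASR model; substitute the checkpoint you are actually evaluating.
asr_model = ASRModel.from_pretrained("stt_en_conformer_ctc_large")

# Let NeMo pick a chunking factor for the subsampling convolutions automatically,
# which lowers peak memory when transcribing very long recordings.
asr_model.change_subsampling_conv_chunking_factor(1)  # 1 = auto select

transcripts = asr_model.transcribe(["/path/to/long_audio.wav"], batch_size=1)
print(transcripts[0])
```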
@@ -388,4 +435,4 @@ Code-Switching :file: data/benchmark_code_switching.csv :align: left :widths: 40, 10, 50 - :header-rows: 1 \ No newline at end of file + :header-rows: 1 From eeb0dd7ae1b3edb87770fcd8b984e6202ba1b2e5 Mon Sep 17 00:00:00 2001 From: Krishna Puvvada <93558329+krishnacpuvvada@users.noreply.github.com> Date: Fri, 8 Mar 2024 16:15:43 -0800 Subject: [PATCH 008/140] bug fix in long-form transcription for canary (#8614) Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada --- nemo/collections/asr/parts/utils/streaming_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index d90fe2be981e..71c945b66255 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -1590,8 +1590,8 @@ def get_input_tokens(self, sample: dict): ) tokens = canary_prompt( tokenizer=self.asr_model.tokenizer, - text="none", - language=sample['target_lang'], + text=None, + language=None, source_language=sample['source_lang'], target_language=sample['target_lang'], taskname=sample['taskname'], @@ -1619,7 +1619,7 @@ def _get_batch_preds(self, keep_logits=False): tokens = self.input_tokens.to(device).repeat(feat_signal.size(0), 1) tokens_len = torch.tensor([tokens.size(1)] * tokens.size(0), device=device).long() - batch_input = (feat_signal, feat_signal_len, tokens, tokens_len) + batch_input = (feat_signal, feat_signal_len, None, None, tokens, tokens_len) predictions = self.asr_model.predict_step(batch_input, has_processed_signal=True) self.all_preds.extend(predictions) del predictions From 438db620bffdf4e2d4cef6368d0e86be2a02b7c3 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 8 Mar 2024 16:19:15 -0800 Subject: [PATCH 009/140] Fixes gpt mcore conversion to account for _extra_state that may be present (#8618) --- scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py b/scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py index e152736734f6..8e2c2d350855 100644 --- a/scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py +++ b/scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py @@ -292,7 +292,9 @@ def run_sanity_checks(nemo_file, mcore_file, cpu_only=False, ignore_if_missing=t logging.info("✅ Weights match") # check for unexpected weights in state dict - assert len(nemo_state_dict) == 0, f"❌ unexpected items in nemo_state_dict: {nemo_state_dict}" + assert ( + len([k for k in nemo_state_dict if not k.endswith('_extra_state')]) == 0 + ), f"❌ unexpected items in nemo_state_dict: {nemo_state_dict}" assert ( len([k for k in mcore_state_dict if not k.endswith('_extra_state')]) == 0 ), f"❌ unexpected items in mcore_state_dict: {mcore_state_dict}" From 49c10c881c835725ae24ed115b6aefd2cb595e8e Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 11 Mar 2024 11:50:44 -0400 Subject: [PATCH 010/140] Fix LoRA SP no redundant gather + linear_fc1 lora logic (#8621) * remove LoRA SP no redundant comm for all linear layers Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert to scatter in adapter module instead of scatter after add Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Chen Cui 
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../modules/common/megatron/adapters/mcore_mixins.py | 12 ++++++++++-- .../common/megatron/adapters/parallel_adapters.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index 368d2cc52ae0..3eb63e96c3a3 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -248,16 +248,23 @@ def mcore_register_adapters(self): self.set_accepted_adapter_types( [LoraHto4HAdapterConfig._target_, Lora4HtoHAdapterConfig._target_, MLPInfusedAdapterConfig._target_] ) # only self attn (packed qkv) for now + self.linear_fc1.return_layernorm_output = True # need layernorm output for lora mlp + if self.config.sequence_parallel and hasattr(self.linear_fc1, "return_layernorm_output_gathered"): + # for LoRA SP, TE v1.5 can return layernorm output gathered so there is no need + # to perform the redundant gather in the adapter module. + self.linear_fc1.return_layernorm_output_gathered = True def forward(self, hidden_states): # [s, b, 4 * h/p] - intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states) + linear_fc1_output, bias_parallel = self.linear_fc1(hidden_states) + + intermediate_parallel, layernorm_output = linear_fc1_output # LoRA logic if self.is_adapter_available(): lora_linear_fc1_adapter = self.get_adapter_module(AdapterName.LORA_Hto4H_ADAPTER) if lora_linear_fc1_adapter and self.adapter_cfg[AdapterName.LORA_Hto4H_ADAPTER]['enabled']: - lora_output = lora_linear_fc1_adapter(hidden_states) + lora_output = lora_linear_fc1_adapter(layernorm_output) intermediate_parallel = intermediate_parallel + lora_output if self.config.bias_activation_fusion: @@ -294,6 +301,7 @@ def glu(x): if lora_linear_fc2_adapter and self.adapter_cfg[AdapterName.LORA_4HtoH_ADAPTER]['enabled']: lora_output = lora_linear_fc2_adapter(intermediate_parallel) output = output + lora_output + return output, output_bias diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index d31125945f73..8c34f528f2d9 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -237,7 +237,7 @@ def __init__( # revert config change in case it is read elsewhere model_parallel_config.sequence_parallel = self._sequence_parallel - if self._sequence_parallel: + if self._sequence_parallel and not input_is_parallel: from importlib.metadata import version from pkg_resources import packaging From 1f6191ef0cccc1bf7e256448b85d59a9c43fb553 Mon Sep 17 00:00:00 2001 From: Zeeshan Patel Date: Mon, 11 Mar 2024 11:12:42 -0700 Subject: [PATCH 011/140] fixed pp eval for sft/lora (#8616) Co-authored-by: Chen Cui --- .../language_modeling/megatron_gpt_sft_model.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 331f977a3265..325f039d461b 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -423,12 
+423,15 @@ def inference_step(self, dataloader_iter, mode): self._inference_config['tokens_to_generate'] = data_cfg.get('tokens_to_generate') output = self.predict_step(batch, batch_idx, dataloader_idx) - inputs_text = [self.tokenizer.ids_to_text(c.tolist()) for c in batch['contexts']] - labels_text = [self.tokenizer.ids_to_text(a.tolist()) for a in batch['answers']] - preds_text = [ - self.tokenizer.ids_to_text(t[l.item() :][: data_cfg.get('tokens_to_generate')]) - for t, l in zip(output['token_ids'], batch['context_lengths']) - ] + if output: + inputs_text = [self.tokenizer.ids_to_text(c.tolist()) for c in batch['contexts']] + labels_text = [self.tokenizer.ids_to_text(a.tolist()) for a in batch['answers']] + preds_text = [ + self.tokenizer.ids_to_text(t[l.item() :][: data_cfg.get('tokens_to_generate')]) + for t, l in zip(output['token_ids'], batch['context_lengths']) + ] + else: + inputs_text, labels_text, preds_text = [], [], [] else: inputs_text, labels_text, preds_text = [], [], [] From f005f1323eaf9a23ad6dc4bc326dc95bf0002e8d Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Mon, 11 Mar 2024 20:40:36 -0700 Subject: [PATCH 012/140] Set precision None in megatron_ckpt_to_nemo.py (#8630) Signed-off-by: Abhishree --- examples/nlp/language_modeling/megatron_ckpt_to_nemo.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py index c4c0394e3892..c58ae7f156eb 100644 --- a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py @@ -142,6 +142,9 @@ def convert(local_rank, rank, world_size, args): hysteresis=cfg.model.get('hysteresis', 2), ) plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size From ac0396f37350bf98f2887c224428e387d82dc155 Mon Sep 17 00:00:00 2001 From: Chris Alexiuk <161380339+chrisalexiuk-nvidia@users.noreply.github.com> Date: Tue, 12 Mar 2024 01:36:25 -0400 Subject: [PATCH 013/140] Minor Updates to GPT Training Documentation Example (#8629) Minor copy and instruction changes to improve tutorial viability. Signed-off-by: Chris Alexiuk <161380339+chrisalexiuk-nvidia@users.noreply.github.com> Co-authored-by: Eric Harper --- .../nlp/nemo_megatron/gpt/gpt_training.rst | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst b/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst index 4c0a09b7f6ea..986e7be30a00 100644 --- a/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst +++ b/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst @@ -6,7 +6,10 @@ GPT is a decoder-only Transformer model. Quick start ^^^^^^^^^^^ -Steps below demonstrate training of a GPT style model with NeMo +The steps below demonstrate training of a GPT-style model with NeMo + +.. 
note::
+    This example is best completed using the latest NeMo Framework NGC Container

 Data download & pre-processing
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -16,7 +19,7 @@ Data download & pre-processing

 **Step 1: Download data**

-The step below will download Wikipedia data (around 20GB) and can take some several hours.
+The step below will download Wikipedia data (around 20GB) and can take several hours.

 .. code-block:: bash

@@ -35,12 +38,13 @@ Now, ``train_data.jsonl`` will contain our training data in the json line format

 **Step 3: Train tokenizer**

-Below we will condider 2 options for training data tokenizers: Using pre-built HuggingFace BPE and training and using your own Google Sentencepiece tokenizer.
-Note that only second option allows you to experiment with vocabulary size.
+Below we will consider 2 options for training data tokenizers: Using pre-built HuggingFace BPE and training and using your own Google Sentencepiece tokenizer.
+
+Note that only the second option allows you to experiment with vocabulary size.

 *Option 1:* Using HuggingFace GPT2 tokenizer files.

-With this option we will just download pre-built vocabulary and merge files for BPE tokenizer.
+With this option, we will download a pre-built vocabulary and merge the files for the BPE tokenizer.

 .. code-block:: bash

@@ -50,7 +54,7 @@ With this option we will just download pre-built vocabulary and merge files for

 *Option 2:* Using `Google Sentencepiece `_ tokenizer library.

-It comes as dependency with NeMo, so if you have installed NeMo it should already be installed.
+It comes as a dependency with NeMo, so if you have installed NeMo it should already be installed.
 Note that training the tokenizer model will also take some time.

 .. code-block:: bash

@@ -66,11 +70,11 @@ Note that training tokenizer model will also take some time.
     --pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 \
     --split_digits true

-After this is done (will take a while), you'll have two files: ```spm_32k_wiki.model and spm_32k_wiki.vocab`` which correspond to model and vocabulary.
+After this is done (will take a while), you'll have two files: ``spm_32k_wiki.model`` and ``spm_32k_wiki.vocab``, corresponding to the model and vocabulary.

 **Step 4: Convert training data into memory map format**

-This format makes training more efficient, especially with many nodes and GPUs. This step will also tokenize data using tokenizer model from Step 3.
+This format makes training more efficient, especially with many nodes and GPUs. This step will also tokenize data using the tokenizer model from Step 3.

 *Option 1:* Using HuggingFace GPT2 tokenizer files.

@@ -106,15 +110,15 @@ Train GPT-style Model
 ~~~~~~~~~~~~~~~~~~~~~

 Once you have prepared training data and tokenizer, you are ready to train the model.
-The configuration we present below has about 124M parameters and it should fit on a single 16GB GPU if using float16.
+The configuration we present below has about 124M parameters and should fit on a single 16GB GPU using float16.

 Let's go!!!

 *Option 1:* Using HuggingFace GPT2 tokenizer files.

 .. code-block:: bash

-    python /home/okuchaiev/repos/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \
-    --config-path=/home/okuchaiev/repos/NeMo/examples/nlp/language_modeling/conf \
+    python /examples/nlp/language_modeling/megatron_gpt_pretraining.py \
+    --config-path=/examples/nlp/language_modeling/conf \
     --config-name=megatron_gpt_config \
     trainer.devices=1 \
     trainer.num_nodes=1 \
@@ -166,8 +170,8 @@ Let's go!!!

 ..
code-block:: bash - python /home/okuchaiev/repos/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - --config-path=/home/okuchaiev/repos/NeMo/examples/nlp/language_modeling/conf \ + python /examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + --config-path=/examples/nlp/language_modeling/conf \ --config-name=megatron_gpt_config \ trainer.devices=1 \ trainer.num_nodes=1 \ @@ -215,7 +219,7 @@ Let's go!!! exp_manager.checkpoint_callback_params.always_save_nemo=False -Next, simply launch Tensorboard to monitor training like so: +Next, you can launch Tensorboard to monitor training like so: .. code-block:: bash From e46f71117011739603f38eacdbc3acd7e7904074 Mon Sep 17 00:00:00 2001 From: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Date: Mon, 11 Mar 2024 23:04:57 -0700 Subject: [PATCH 014/140] remove include intro from docs index (#8636) Signed-off-by: Elena Rastorgueva --- docs/source/index.rst | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 822431a9108a..9b62174ecbe2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,26 +1,44 @@ NVIDIA NeMo Framework Developer Docs ==================================== -.. include:: starthere/intro.rst +NVIDIA NeMo Framework is an end-to-end, cloud-native framework to build, customize, and deploy generative AI models anywhere. +`NVIDIA NeMo Framework `_ has separate collections for: + +* :doc:`Large Language Models (LLMs) ` + +* :doc:`Automatic Speech Recognition (ASR) ` + +* :doc:`Multimodal (MM) Models ` + +* :doc:`Text-to-Speech (TTS) ` + +* :doc:`Computer Vision (CV) ` + +Each collection consists of prebuilt modules that include everything needed to train on your data. +Every module can easily be customized, extended, and composed to create new generative AI +model architectures. + +For quick guides and tutorials, see the "Getting started" section below. -Index of NeMo Framework Developer Docs -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. toctree:: :maxdepth: 2 :caption: Getting Started :name: starthere + :titlesonly: starthere/intro starthere/tutorials starthere/best-practices +For more information, browse the developer docs for your area of interest in the contents section below or on the left sidebar. .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: NeMo Core :name: core + :titlesonly: core/core core/exp_manager @@ -31,9 +49,10 @@ Index of NeMo Framework Developer Docs .. toctree:: - :maxdepth: 3 + :maxdepth: 1 :caption: Large Language Models (LLMs) :name: Large Language Models + :titlesonly: nlp/nemo_megatron/intro nlp/models @@ -43,9 +62,10 @@ Index of NeMo Framework Developer Docs .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Speech AI :name: Speech AI + :titlesonly: asr/intro asr/speech_classification/intro @@ -56,9 +76,10 @@ Index of NeMo Framework Developer Docs .. toctree:: - :maxdepth: 3 + :maxdepth: 1 :caption: Multimodal (MM) :name: Multimodal + :titlesonly: multimodal/mllm/intro multimodal/vlm/intro @@ -71,6 +92,7 @@ Index of NeMo Framework Developer Docs :maxdepth: 1 :caption: Text To Speech (TTS) :name: Text To Speech + :titlesonly: tts/intro @@ -78,6 +100,7 @@ Index of NeMo Framework Developer Docs :maxdepth: 2 :caption: Vision (CV) :name: vision + :titlesonly: vision/intro @@ -85,14 +108,16 @@ Index of NeMo Framework Developer Docs :maxdepth: 2 :caption: Common :name: Common + :titlesonly: common/intro .. 
toctree:: - :maxdepth: 3 + :maxdepth: 2 :caption: Speech Tools :name: Speech Tools + :titlesonly: tools/intro @@ -100,5 +125,6 @@ Index of NeMo Framework Developer Docs :maxdepth: 2 :caption: Upgrade Guide :name: Upgrade Guide + :titlesonly: starthere/migration-guide \ No newline at end of file From 6daf5e88e7016d537cede484cefbeecec2c391fe Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Tue, 12 Mar 2024 15:31:05 -0400 Subject: [PATCH 015/140] Fix for relative file paths when presort_manifest==True (#8639) Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> --- examples/asr/transcribe_speech.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 6d6006e939e5..e85a15be81d4 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -40,6 +40,7 @@ transcribe_partial_audio, write_transcription, ) +from nemo.collections.common.parts.preprocessing.manifest import get_full_path from nemo.core.config import hydra_runner from nemo.utils import logging @@ -331,6 +332,7 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis if cfg.presort_manifest: with NamedTemporaryFile("w", suffix=".json", delete=False) as f: for item in read_and_maybe_sort_manifest(cfg.dataset_manifest, try_sort=True): + item["audio_filepath"] = get_full_path(item["audio_filepath"], cfg.dataset_manifest) print(json.dumps(item), file=f) cfg.dataset_manifest = f.name remove_path_after_done = f.name From bab2a39467db2a489c5104af264f0d9575a6db6d Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Tue, 12 Mar 2024 14:50:06 -0700 Subject: [PATCH 016/140] Gemma uses openai_gelu approx (#8638) Signed-off-by: yaoyu-33 --- examples/nlp/language_modeling/conf/megatron_gemma_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gemma_config.yaml b/examples/nlp/language_modeling/conf/megatron_gemma_config.yaml index cda2162002d3..bdc5e2057886 100644 --- a/examples/nlp/language_modeling/conf/megatron_gemma_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gemma_config.yaml @@ -76,7 +76,7 @@ model: activation: 'geglu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] - openai_gelu: False # Use OpenAI's GELU instead of the default GeLU + openai_gelu: True # Use OpenAI's GELU instead of the default GeLU normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. position_embedding_type: 'rope' # Position embedding type. Options ['learned_absolute', 'rope'] rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this. 
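For context on the `openai_gelu` switch flipped above: per this commit, Gemma was trained with the tanh ("OpenAI"/GPT-2 style) approximation of GELU rather than the exact erf-based form, so weight-converted checkpoints should use the approximate variant. A small PyTorch sketch (illustrative only, not part of the patch) showing the two variants side by side:

```python
import math
import torch

def gelu_exact(x: torch.Tensor) -> torch.Tensor:
    # Exact GELU: x * Phi(x), where Phi is the standard normal CDF (erf form).
    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))

def gelu_tanh_approx(x: torch.Tensor) -> torch.Tensor:
    # Tanh approximation selected by openai_gelu=True in the config above.
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x.pow(3))))

x = torch.linspace(-4.0, 4.0, steps=9)
# The difference is small but nonzero, enough to shift logits of a converted checkpoint.
print((gelu_exact(x) - gelu_tanh_approx(x)).abs().max())
```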
From cb3f2bc748c5f4545a94f7aa32ec0e6576af9b7c Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Wed, 13 Mar 2024 16:28:56 +0100 Subject: [PATCH 017/140] AMMO Integration with Llama2 Post-Training Quantization Example and Tests (#8444) * AMMO integration with Llama2 PTQ example and tests Signed-off-by: Jan Lasek * Jenkins megatron_llama_quantization.py test setup Signed-off-by: Jan Lasek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * License headers Signed-off-by: Jan Lasek * Add AMMO to requirements_nlp.txt with --extra-index-url for pip install Signed-off-by: Jan Lasek * Bump AMMO version to latest Signed-off-by: Jan Lasek * Guards workaround on spec definition Signed-off-by: Jan Lasek * Save artifacts and tokenizer config at once Signed-off-by: Jan Lasek * Extend nemo.utils package with new tools Signed-off-by: Jan Lasek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Reorganize & reformat Signed-off-by: Jan Lasek * Tests for FP8 and INT4 AWQ Signed-off-by: Jan Lasek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add load_config helper function Signed-off-by: Jan Lasek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Unused import removal Signed-off-by: Jan Lasek * Fix FP8 Jenkins test Signed-off-by: Jan Lasek * Fix TP=2 test cont'd: no need to use mpirun Signed-off-by: Jan Lasek * Allow for patches in AMMO versioning Signed-off-by: Jan Lasek * Drop AWQ test for now (need to debug) Signed-off-by: Jan Lasek * Allow for patches in AMMO versioning cont'd Signed-off-by: Jan Lasek * Use AMMO spec from MCore as it has been published Signed-off-by: Jan Lasek * Make AMMO optional dependency and properly import guard it Signed-off-by: Jan Lasek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add Llama2 AWQ test and update some paths Signed-off-by: Jan Lasek * Enable specifying quantization.algorithm=null for baseline accuracy checks Signed-off-by: Jan Lasek * Enable exporting qnemo tarball or just to a directory Signed-off-by: Jan Lasek * Drop AWQ testing for now Signed-off-by: Jan Lasek * Test case for export.inference_tensor_parallel=2 Signed-off-by: Jan Lasek * Flag to export TRT-LLM config.json Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Dockerfile | 4 +- Jenkinsfile | 65 +++++- .../conf/megatron_llama_quantization.yaml | 38 +++ .../megatron_llama_quantization.py | 93 ++++++++ .../language_modeling/megatron_gpt_model.py | 2 + nemo/export/__init__.py | 13 ++ nemo/export/quantize/__init__.py | 15 ++ nemo/export/quantize/quantizer.py | 218 ++++++++++++++++++ nemo/utils/distributed.py | 23 ++ nemo/utils/model_utils.py | 49 ++++ tests/setup/__main__.py | 42 ++++ tests/setup/data/create_sample_jsonl.py | 58 +++++ tests/setup/models/create_hf_model.py | 94 ++++++++ 13 files changed, 710 insertions(+), 4 deletions(-) create mode 100644 examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml create mode 100644 examples/nlp/language_modeling/megatron_llama_quantization.py create mode 100644 nemo/export/__init__.py create mode 100644 nemo/export/quantize/__init__.py create mode 100644 nemo/export/quantize/quantizer.py create mode 100644 tests/setup/__main__.py create mode 100644 
tests/setup/data/create_sample_jsonl.py create mode 100644 tests/setup/models/create_hf_model.py diff --git a/Dockerfile b/Dockerfile index 90c84ea07627..970c34a690d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -66,7 +66,7 @@ WORKDIR /workspace/ # We leave it here in case we need to work off of a specific commit in main RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout ad53b1e38689a0ceed75ade7821f4e6c7554abb4 && \ + git checkout 36e9b6bf3d8034b10c9bbd9fc357c2df2bd1515c && \ pip install . # Performance optimizations for distributed optimizer: https://github.com/NVIDIA/apex/pull/1771 @@ -132,6 +132,8 @@ RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-chec RUN pip install flash-attn # install numba for latest containers RUN pip install numba>=0.57.1 +# install ammo +RUN pip install nvidia-ammo~=0.7.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir # copy nemo source into a scratch image FROM scratch as nemo-src diff --git a/Jenkinsfile b/Jenkinsfile index cfd5853a6882..100a0bd4a6ad 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -91,11 +91,17 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout 5f9c870f9f24b482509699d206a9dbb00958f6fc && \ + git checkout 36e9b6bf3d8034b10c9bbd9fc357c2df2bd1515c && \ pip install .' } } + stage('AMMO installation') { + steps { + sh 'pip install nvidia-ammo~=0.7.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir' + } + } + stage('PyTorch Lightning version') { steps { sh 'python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"' @@ -390,6 +396,12 @@ pipeline { } } + stage('Setup test data and models') { + steps { + sh 'python -m tests.setup --save_dir /home/TestData/nlp' + } + } + // TODO: this requires TE >= v0.11 which is not available in 23.06. // please uncomment this test once mcore CI is ready. 
stage('L2: Community LLM Checkpoints tests') { @@ -405,9 +417,8 @@ pipeline { steps { sh 'CUDA_VISIBLE_DEVICES=0 python scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py \ --in-file=/home/TestData/nlp/megatron_llama/llama-ci-hf \ - --out-file=/home/TestData/nlp/megatron_llama/ci.nemo \ + --out-file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ --precision=16' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' } } stage('StarCoder') { @@ -439,6 +450,54 @@ pipeline { } } + stage('L2: Nemo PTQ') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('Llama2 - Export Only') { + steps { + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ + quantization.algorithm=null \ + model_save=/home/TestData/nlp/megatron_llama/ci_baseline' + sh 'rm -rf /home/TestData/nlp/megatron_llama/ci_baseline' + } + } + stage('Llama2 - INT8 SQ') { + steps { + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=int8_sq \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo' + } + } + stage('Llama2 - FP8') { + steps { + sh 'python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \ + tensor_model_parallel_size=2 \ + trainer.devices=2 \ + quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ + quantization.algorithm=fp8 \ + quantization.num_calib_size=8 \ + inference.batch_size=2 \ + export.inference_tensor_parallel=2 \ + model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo' + sh 'rm -f /home/TestData/nlp/megatron_llama/ci_fp8.qnemo' + } + } + } + } + stage('L2: ASR dev run') { when { anyOf { diff --git a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml new file mode 100644 index 000000000000..f3803dc4e69c --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml @@ -0,0 +1,38 @@ +inference: + greedy: false # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + add_BOS: true # add the bos token at the begining of the prompt + tokens_to_generate: 30 # The minimum length of the sequence to be generated. + all_probs: false # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
+ compute_logprob: false # a flag used to compute logprob of all the input text, a very special case of running inference, default False + batch_size: 64 # batch size for inference + max_context_length: 512 # max length of the context, input sequence will be truncated if it is longer than this + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: false # logger provided by exp_manager + precision: bf16 # 16, 32, or bf16 + enable_checkpointing: false + +quantization: + quantize_bmm1: false + algorithm: fp8 # int8_sq, fp8, int8, int4_awq, null + calib_dataset: cnn_dailymail # pileval, wikitext, cnn_dailymail + num_calib_size: 512 # number of samples used for calibration + +export: + decoder_type: llama # gptnext, gpt2, llama + inference_tensor_parallel: 1 # Default using 1 TP for inference + dtype: 16 # Default precision data type + export_tensorrt_llm_config: true # export config to build TRT-LLM engine directly + +model_file: llama2-7b-fp16.nemo # Nemo file path +model_save: llama2-7b-fp8.qnemo # Path where the quantized model will be saved +tensor_model_parallel_size: 1 +pipeline_model_parallel_size: 1 diff --git a/examples/nlp/language_modeling/megatron_llama_quantization.py b/examples/nlp/language_modeling/megatron_llama_quantization.py new file mode 100644 index 000000000000..16fb5ae9c13b --- /dev/null +++ b/examples/nlp/language_modeling/megatron_llama_quantization.py @@ -0,0 +1,93 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.multiprocessing as mp +from datasets import load_dataset + +from nemo.core.config import hydra_runner +from nemo.export.quantize import Quantizer + +mp.set_start_method("spawn", force=True) + +""" +Nemo quantization example script. + +Please consult nemo.export.quantize.Quantizer class +and examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml config on available quantization methods, +models supported as well as how to set up data and inference for calibration (with defaults recommended). 
+ +Example usage: +``` +python examples/nlp/language_modeling/megatron_llama_quantization.py \ + model_file=llama2-7b-fp16.nemo \ + model_save=llama2-7b-fp8.qnemo \ + quantization.algorithm=fp8 \ + export.decoder_type=llama \ + export.inference_tensor_parallel=1 +``` +""" + + +def get_calib_dataloader(data="cnn_dailymail", batch_size=64, calib_size=512, max_sequence_length=512): + if data == "pileval": + dataset = load_dataset("json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train") + text_column = "text" + elif data == "wikitext": + dataset = load_dataset("wikitext", "wikitext-103-v1", split="train") + text_column = "text" + elif data == "cnn_dailymail": + dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") + text_column = "article" + else: + # Assume a local JSON dataset with a column named "text" + dataset = load_dataset("json", data_files=data, split="train") + text_column = "text" + calib_size = max(min(len(dataset), calib_size), batch_size) + for i in range(calib_size // batch_size): + batch = dataset[i * batch_size : (i + 1) * batch_size][text_column] + for j in range(len(batch)): + batch[j] = batch[j][:max_sequence_length] + yield batch + + +@hydra_runner(config_path="conf", config_name="megatron_llama_quantization") +def main(cfg) -> None: + if not torch.cuda.is_available(): + raise EnvironmentError("GPU is required for the inference.") + + quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.export, cfg.trainer) + + # Quantization algorithm can be set to None. This is useful for baseline precision + # accuracy validation. In this case only weights export step will be performed: + if cfg.quantization.algorithm is not None: + dataloader = get_calib_dataloader( + cfg.quantization.calib_dataset, + cfg.inference.batch_size, + cfg.quantization.num_calib_size, + cfg.inference.max_context_length, + ) + dataloader = [data for data in dataloader] + else: + dataloader = None + + model = quantizer.quantize( + cfg.model_file, dataloader, cfg.tensor_model_parallel_size, cfg.pipeline_model_parallel_size + ) + + quantizer.export(model, cfg.model_save) + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index ac35af38de64..f883f1c1fc7c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -91,6 +91,7 @@ from megatron.core import InferenceParams, parallel_state, tensor_parallel from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset + from megatron.core.deploy.gpt.model_specs import get_gpt_layer_ammo_spec from megatron.core.models.gpt import GPTModel as MCoreGPTModel from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.pipeline_parallel.schedules import get_forward_backward_func @@ -140,6 +141,7 @@ def get_specs(spec_name, num_experts=None): "": get_gpt_layer_with_transformer_engine_spec(num_experts), "megatron_falcon_gpt": get_falcon_layer_spec(), "megatron_gpt_full_te_layer_autocast": get_gpt_full_te_layer_autocast_spec(), + "ammo": get_gpt_layer_ammo_spec(), } if spec_name not in name_spec_dict: raise ValueError(f"Spec name '{spec_name}' is not recognized.") diff --git a/nemo/export/__init__.py 
b/nemo/export/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/export/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/export/quantize/__init__.py b/nemo/export/quantize/__init__.py new file mode 100644 index 000000000000..87812e621bb6 --- /dev/null +++ b/nemo/export/quantize/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .quantizer import Quantizer diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py new file mode 100644 index 000000000000..1ae375e6cfe7 --- /dev/null +++ b/nemo/export/quantize/quantizer.py @@ -0,0 +1,218 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import tarfile +from contextlib import nullcontext +from typing import List, Optional + +import torch.distributed as dist +from megatron.core import parallel_state +from omegaconf import OmegaConf +from omegaconf.omegaconf import DictConfig, open_dict +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision +from nemo.utils import logging +from nemo.utils.distributed import temporary_directory +from nemo.utils.get_rank import is_global_rank_zero +from nemo.utils.model_utils import load_config, save_artifacts + +try: + import ammo.torch.quantization as atq + from ammo.torch.export import export_model_config + + HAVE_AMMO = True + +except (ImportError, ModuleNotFoundError) as e: + HAVE_AMMO = False + HAVE_AMMO_ERROR = e + + +class Quantizer: + + """ + Post-training quantization of Nemo checkpoints. 
+ + PTQ converts selected model layers to low-precision format (e.g., INT4, FP8) for efficient serving. + The process consist of several steps: + + 1. Loading a Nemo model from disk using appropriate parallelism strategy + 2. Calibrating the model to obtain appropriate algorithm-specific scaling factors + 3. Producing output directory or .qnemo tarball with model config (json), + quantized weights (safetensors) and tokenizer config (yaml). + + The output directory (or .qnemo file) produced is intended to be consumed by TensorRT-LLM toolbox + for efficient inference. This can be achieved using Nemo inference containers. + + Currently supported and tested model family is Llama2. Model type needs to be specified in + the quantization command with decoder_type parameter on exporting (see below). Quantizing other + model families is experimental and might not be fully supported. + + Available quantization methods are listed in QUANT_CFG_CHOICES dictionary below. + Please consult AMMO documentation for details. You can also inspect different choices in + examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml for quantization algorithms and + calibration data as well as recommended settings. + + Quantization algorithm can also be conveniently set to 'null' to perform only weights export step + for TensorRT-LLM deployment. This is useful to getting baseline results for a full-precision model. + """ + + def __init__( + self, + quantization_config: DictConfig, + inference_config: DictConfig, + export_config: DictConfig, + trainer_config: DictConfig, + ): + if not HAVE_AMMO: + raise RuntimeError("nvidia-ammo>=0.7 is needed to use Quantizer") from HAVE_AMMO_ERROR + QUANT_CFG_CHOICES = { + "int8": atq.INT8_DEFAULT_CFG, + "int8_sq": atq.INT8_SMOOTHQUANT_CFG, + "fp8": atq.FP8_DEFAULT_CFG, + "int4_awq": atq.INT4_AWQ_CFG, + "w4a8_awq": atq.W4A8_AWQ_BETA_CFG, + } + SUPPORTED_DTYPE = [16, "16", "bf16"] # Default precision for non-quantized layers + assert export_config.dtype in SUPPORTED_DTYPE + assert quantization_config.algorithm is None or quantization_config.algorithm in QUANT_CFG_CHOICES + self.quantization_config = quantization_config + self.inference_config = inference_config + self.export_config = export_config + self.trainer_config = trainer_config + if quantization_config.algorithm is not None: + atq_config = QUANT_CFG_CHOICES[quantization_config.algorithm] + if quantization_config.algorithm != "fp8": + # disable quantization for the last output layer + atq_config = copy.deepcopy(atq_config) + atq_config["quant_cfg"]["*.output_layer.*"] = {"enable": False} + self.atq_config = atq_config + else: + self.atq_config = None + + def _load_model( + self, + model_file: str, + tensor_model_parallel_size: Optional[int] = None, + pipeline_model_parallel_size: Optional[int] = None, + ): + """Load model using AMMO layer spec for quantization.""" + model_cfg = self._load_and_modify_config(model_file, tensor_model_parallel_size, pipeline_model_parallel_size) + + trainer = Trainer(strategy=NLPDDPStrategy(), **self.trainer_config) + connector = NLPSaveRestoreConnector() + + model = MegatronGPTModel.restore_from( + restore_path=model_file, trainer=trainer, override_config_path=model_cfg, save_restore_connector=connector, + ) + model.freeze() + + try: + model.model.module.language_model.encoder.activations_checkpoint_method = None + except AttributeError: + pass + + self._check_ddp_initialized(model) + + if is_global_rank_zero(): + print(model) + + return model + + def _check_ddp_initialized(self, 
model): + if parallel_state.is_unitialized(): + + def dummy(): + return + + if model.trainer.strategy.launcher is not None: + model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) + model.trainer.strategy.setup_environment() + + def _load_and_modify_config( + self, + model_file: str, + tensor_model_parallel_size: Optional[int] = None, + pipeline_model_parallel_size: Optional[int] = None, + ): + model_cfg = load_config(model_file) + + with open_dict(model_cfg): + model_cfg.activations_checkpoint_method = None + model_cfg.activations_checkpoint_granularity = None + if tensor_model_parallel_size is not None: + model_cfg.tensor_model_parallel_size = tensor_model_parallel_size + if pipeline_model_parallel_size is not None: + model_cfg.pipeline_model_parallel_size = pipeline_model_parallel_size + # Only custom AMMO spec is supported for PTQ: this custom spec is largely based on local Megatron-LM + # layer definitions to avoid Transformer Engine implementations that are currently not supported. + model_cfg.name = "ammo" + + return model_cfg + + def quantize( + self, + model_file: str, + dataloader: Optional[List[List[str]]], + tensor_model_parallel_size: Optional[int] = None, + pipeline_model_parallel_size: Optional[int] = None, + ): + """Quantize model checkpoint using given dataloader and optional custom parallelism settings.""" + model = self._load_model(model_file, tensor_model_parallel_size, pipeline_model_parallel_size) + + if self.quantization_config.algorithm is None: + return model + + model.set_inference_config(OmegaConf.to_container(self.inference_config)) + + def forward_loop(): + for i, batch in enumerate(dataloader): + if is_global_rank_zero(): + print(f"Calibrating batch {i}") + model.predict_step(batch, i) + + model = atq.quantize(model, self.atq_config, forward_loop) + return model + + def export(self, model, model_save: str): + """Export model to '.qnemo' format for TensorRT-LLM engine build.""" + torch_dtype = torch_dtype_from_precision(self.export_config.dtype) + + # Setup model export handling: temporary directory for + # '.qnemo' tarball or directly write to model_save + save_qnemo = model_save.endswith(".qnemo") + if save_qnemo: + export_handler = temporary_directory() + else: + export_handler = nullcontext(enter_result=model_save) + + with export_handler as export_dir: + export_model_config( + model=model, + decoder_type=self.export_config.decoder_type, + dtype=torch_dtype, + export_dir=export_dir, + inference_tensor_parallel=self.export_config.inference_tensor_parallel, + export_tensorrt_llm_config=self.export_config.export_tensorrt_llm_config, + ) + dist.barrier() # Wait until all ranks complete export_model_config step + if is_global_rank_zero(): + logging.info(f"Exporting quantized weights, model artifacts, and tokenizer config to {model_save}...") + save_artifacts(model, export_dir) + if save_qnemo: + with tarfile.open(model_save, "w:gz") as tar: + tar.add(export_dir, arcname="./") diff --git a/nemo/utils/distributed.py b/nemo/utils/distributed.py index b0d24de3e5b4..ee6c107b1d85 100644 --- a/nemo/utils/distributed.py +++ b/nemo/utils/distributed.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import contextlib import os +import tempfile import torch +import torch.distributed as dist from nemo.utils import logging +from nemo.utils.get_rank import is_global_rank_zero try: from megatron.core import parallel_state @@ -100,3 +104,22 @@ def gather_objects(partial_results_list, main_rank=None): results_list.extend(r) return results_list + + +@contextlib.contextmanager +def temporary_directory(): + """Create a shared temporary directory across ranks in distributed setup. + + This function assumes that the distributed setup has been already + correctly initialized. It is intended to be used only in single-node + setup so that all ranks can access the directory created.""" + + if is_global_rank_zero(): + tmp_dir = [tempfile.TemporaryDirectory()] + else: + tmp_dir = [None] + dist.broadcast_object_list(tmp_dir) + yield tmp_dir[0].name + # We use barrier below to make sure that rank zero won't exit + # and delete tmp_dir while other ranks may still use it + dist.barrier() diff --git a/nemo/utils/model_utils.py b/nemo/utils/model_utils.py index b2a6abbf54aa..8889f13d5b98 100644 --- a/nemo/utils/model_utils.py +++ b/nemo/utils/model_utils.py @@ -12,9 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import contextlib import copy import importlib import os +import shutil +import tarfile +import tempfile from dataclasses import dataclass, is_dataclass from enum import Enum from functools import lru_cache @@ -61,6 +65,18 @@ class ArtifactItem: hashed_path: Optional[str] = None +def load_config(model_file: str) -> DictConfig: + """Load model config from extracted directory or '.nemo' tarball.""" + if os.path.isfile(model_file): + with tempfile.TemporaryDirectory() as tmp, tarfile.open(model_file, "r:") as tar: + tar.extract("./model_config.yaml", path=tmp) + model_config = OmegaConf.load(os.path.join(tmp, "model_config.yaml")) + else: + model_config = OmegaConf.load(os.path.join(model_file, "model_config.yaml")) + + return model_config + + def resolve_dataset_name_from_cfg(cfg: 'DictConfig') -> Optional[str]: """ Parses items of the provided sub-config to find the first potential key that @@ -636,3 +652,36 @@ def ckpt_to_dir(filepath: Union[str, Path]) -> Path: checkpoint_dir = filepath.with_name(filepath.stem) return checkpoint_dir + + +def save_artifacts(model, output_dir: str, use_abspath: bool = False) -> None: + """Save all model artifacts and tokenizer config to a given output directory.""" + app_state = AppState() + model_file = app_state.model_restore_path + model_cfg = copy.deepcopy(model.cfg) + + # Setup model file handling context: directory or tarball + if os.path.isfile(model_file): + model_file_handler = tarfile.open + kwargs = {"name": model_file, "mode": "r:"} + elif os.path.isdir(model_file): + model_file_handler = contextlib.nullcontext + kwargs = {} + else: + raise FileNotFoundError(model_file) + + # Copy or extract artifacts depending on the context + with model_file_handler(**kwargs) as maybe_tar: + for arti_name, arti_item in model.artifacts.items(): + _, arti_file = arti_item.path.split("nemo:") + arti_path = os.path.join(output_dir, arti_name) + if maybe_tar is not None: + maybe_tar.extract(f"./{arti_file}", path=output_dir) + os.rename(os.path.join(output_dir, arti_file), arti_path) + else: + shutil.copy(os.path.join(model_file, arti_file), arti_path) + # Store artifact path as basename by default. 
Otherwise save absolute path but bear in mind + # that in this case output directory should be permanent for correct artifact recovery later + arti_path = os.path.abspath(arti_path) if use_abspath else os.path.basename(arti_path) + OmegaConf.update(model_cfg, arti_name, arti_path) + OmegaConf.save(model_cfg.tokenizer, os.path.join(output_dir, "tokenizer_config.yaml")) diff --git a/tests/setup/__main__.py b/tests/setup/__main__.py new file mode 100644 index 000000000000..289a2537e2f2 --- /dev/null +++ b/tests/setup/__main__.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +from .data.create_sample_jsonl import create_sample_jsonl +from .models.create_hf_model import create_hf_model + +print("Setup test data and models...") + +parser = argparse.ArgumentParser("Setup test data and models.") +parser.add_argument("--save_dir", required=True, help="Root save directory for artifacts") +parser.add_argument("--overwrite", action="store_true", help="Overwrite existing files and directories") +args = parser.parse_args() + +print(f"Arguments are: {vars(args)}") + +os.makedirs(args.save_dir, exist_ok=True) + +create_sample_jsonl( + output_file=os.path.join(args.save_dir, "test_quantization", "test.json"), overwrite=args.overwrite, +) + +create_hf_model( + model_name_or_path="/home/TestData/nlp/meta-llama/Llama-2-7b-hf", + output_dir=os.path.join(args.save_dir, "megatron_llama/llama-ci-hf"), + config_updates={"hidden_size": 256, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}, + overwrite=args.overwrite, +) +print("Setup done.") diff --git a/tests/setup/data/create_sample_jsonl.py b/tests/setup/data/create_sample_jsonl.py new file mode 100644 index 000000000000..00f789548f81 --- /dev/null +++ b/tests/setup/data/create_sample_jsonl.py @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import os + +""" +Create sample JSONL file for functional testing. Each line contains a dictionary +with a single element "text" for storing data. 
+""" + + +def create_sample_jsonl(output_file: str, overwrite: bool = False): + """Create sample JSONL.""" + if os.path.isfile(output_file) and not overwrite: + print(f"File {output_file} exists and overwrite flag is not set so exiting.") + return + + texts = [ + "Sample data for functional tests", + "Once upon a time, in the middle of a dense forest, there was a small house, where lived a pretty little girl " + "named Little Red Riding Hood.", + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore " + "magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea " + "commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat " + "nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit " + "anim id est laborum...", + "Next please!", + "¡H E L L O W O R L D!", + "Korzystając z okazji chciałbym pozdrowić całą moją rodzinę i przyjaciół", + ] + print(f"Writing {len(texts)} line(s) to {output_file}...") + os.makedirs(os.path.dirname(output_file), exist_ok=True) + with open(output_file, mode="w", encoding="utf-8") as f: + for text in texts: + json.dump({"text": text}, f) + f.write("\n") + print("OK.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Create sample JSONL file.") + parser.add_argument("--output_file", help="Output file name") + parser.add_argument("--overwrite", action="store_true", help="Overwrite file if it exists") + args = parser.parse_args() + create_sample_jsonl(args.output_file) diff --git a/tests/setup/models/create_hf_model.py b/tests/setup/models/create_hf_model.py new file mode 100644 index 000000000000..9f57d5996dfc --- /dev/null +++ b/tests/setup/models/create_hf_model.py @@ -0,0 +1,94 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import os + +from typing import Any, Dict, Optional + +import transformers + +""" +Create a randomly initialized HuggingFace model for testing purposes. + +Model can be specified by name or path for creating its config and tokenizer using +HuggingFace transformers AutoConfig and AutoTokenizer functions. + +Parameter config_updates can be used to override specific model config fields to make +it smaller, for example, by changing number of layers or hidden layers dimensionality, +making it adequate for testing purposes. This parameter should be specified as +a dictionary that can be parsed using json.loads method. 
+ +Example usage for Llama2 model (requires HF login): +``` +python tests/setup/models/create_tiny_hf_model.py \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --output_dir tiny_llama2_hf \ + --config_updates '{"hidden_size": 128, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}' +``` +""" + + +def get_hf_model_class(hf_config): + """Get HuggingFace model class from config.""" + if len(hf_config.architectures) > 1: + print(f"More than one model architecture available, choosing 1st: {hf_config.architectures}") + model_name = hf_config.architectures[0] + model_class = getattr(transformers, model_name) + return model_class + + +def create_hf_model( + model_name_or_path: str, output_dir: str, config_updates: Optional[Dict[str, Any]] = None, overwrite: bool = False +): + """Create HuggingFace model with optional config updates.""" + if os.path.isdir(output_dir) and not overwrite: + print(f"Output directory {output_dir} exists and overwrite flag is not set so exiting.") + return + + hf_config = transformers.AutoConfig.from_pretrained(model_name_or_path) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path) + model_class = get_hf_model_class(hf_config) + + if config_updates is not None: + hf_config.update(config_updates) + print(hf_config) + + model = model_class(hf_config) + print(model) + + os.makedirs(output_dir, exist_ok=True) + print(f"Saving model to {output_dir}...") + tokenizer.save_pretrained(output_dir) + model.save_pretrained(output_dir) + print("OK.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Create a HuggingFace model (random initialization) for testing purposes.") + parser.add_argument( + "--model_name_or_path", required=True, help="Model name or local path with model config and tokenizer", + ) + parser.add_argument( + "--output_dir", required=True, help="Output directory", + ) + parser.add_argument( + "--config_updates", type=json.loads, help="Parameter updates in JSON format to overwrite for model config", + ) + parser.add_argument( + "--overwrite", action="store_true", help="Overwrite file if it exists", + ) + args = parser.parse_args() + create_hf_model(args.model_name_or_path, args.output_dir, args.config_updates) From fba71c0977a747444de3e08f0e38d812128ddf00 Mon Sep 17 00:00:00 2001 From: Ao Tang Date: Wed, 13 Mar 2024 14:14:36 -0400 Subject: [PATCH 018/140] fix FIM RNG issue (#8513) * fix FIM RNG issue * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix FIMDataset * fix seed ref * fim fix Signed-off-by: dimapihtar * add fim test Signed-off-by: dimapihtar * remove files Signed-off-by: dimapihtar * remove swp Signed-off-by: dimapihtar * remove import Signed-off-by: dimapihtar * fix syntax Signed-off-by: dimapihtar * fix Jenkins Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar --- Jenkinsfile | 55 ++++++++++++++++++- .../megatron/gpt_fim_dataset.py | 9 +-- .../language_modeling/megatron_gpt_model.py | 2 +- 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 100a0bd4a6ad..602c78890262 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -5273,7 +5273,60 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' } } - + stage('L2: Megatron FIM Dataset') { + when { + anyOf { + branch 'main' + 
changeRequest target: 'main' + } + } + failFast true + steps { + sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=1 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + ++model.name=megatron_gpt_full_te_layer_autocast \ + model.mcore_gpt=True \ + model.tensor_model_parallel_size=1 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.normalization=layernorm1p \ + model.bias_activation_fusion=True \ + model.bias_dropout_add_fusion=True \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=null \ + model.activations_checkpoint_granularity=null \ + model.activations_checkpoint_num_layers=null \ + model.data.data_prefix='[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document]' \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ + ++model.data.add_fim=True \ + ++model.data.fim.extra_tokens.prefix='fim_prefix' \ + ++model.data.fim.extra_tokens.middle='fim_middle' \ + ++model.data.fim.extra_tokens.suffix='fim_suffix' \ + ++model.data.fim.extra_tokens.pad='fim_pad' \ + ++model.data.fim.extra_tokens.eod='endoftext'" + sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" + } + } + stage('L2: Megatron Mock Data Generation') { when { anyOf { diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py index 49a34a368fdc..17576bea4c75 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py @@ -29,13 +29,11 @@ class GPTFIMDatasetConfig(GPTDatasetConfig): """Configuration object for Megatron Core GPT FIM datasets Attributes: - tokenizer: model tokenizer fim: fill in the middle parameters config """ - def __init__(self, tokenizer, fim, **kwargs): + def __init__(self, fim, **kwargs): super().__init__(**kwargs) - self.tokenizer = tokenizer self.fim = fim @@ -58,12 +56,15 @@ class GPTFIMDataset(GPTDataset): def __init__( self, indexed_dataset: MMapIndexedDataset, + dataset_path: str, indexed_indices: np.ndarray, num_samples: int, index_split: Split, config: GPTFIMDatasetConfig, ) -> None: - super().__init__(indexed_dataset, indexed_indices, num_samples, index_split, config) + super().__init__(indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config) + + self.indexed_dataset = indexed_dataset def _query_document_sample_shuffle_indices(self, idx: int) -> Tuple[np.ndarray, np.ndarray]: """Get the text (token ids) and document ids for a given index diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py 
b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index f883f1c1fc7c..79d48269d3a6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1286,7 +1286,7 @@ def build_train_valid_test_datasets(self): kwargs["split"] = self.cfg.data.splits_string if self.cfg.data.get('add_fim', False): - dataset_config = GPTFIMDatasetConfig(self.tokenizer, self.cfg.data.fim, **kwargs) + dataset_config = GPTFIMDatasetConfig(self.cfg.data.fim, **kwargs) self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( GPTFIMDataset, train_valid_test_num_samples, dataset_config, From 48b8204d57e59c8790aaa6eaa20384b046b1a574 Mon Sep 17 00:00:00 2001 From: Aditya Malte Date: Wed, 13 Mar 2024 15:43:58 -0700 Subject: [PATCH 019/140] Add support to perform "inference-only" without loading training data (#8640) * Add support to perform "inference-only" without loading training data Hi, Currently, the MegatronSBERT model cannot run inference. Essentially, a user may not be able to simply load a trained .nemo checkpoint and run inference (forward()) function on it. This patch adds a try/except block to handle cases where training data is not specified Signed-off-by: Aditya Malte * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Aditya Malte Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../megatron_sbert_model.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_sbert_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_sbert_model.py index 0d312845db58..a9bb7fd40017 100644 --- a/nemo/collections/nlp/models/information_retrieval/megatron_sbert_model.py +++ b/nemo/collections/nlp/models/information_retrieval/megatron_sbert_model.py @@ -391,15 +391,23 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.cross_entropy_loss = torch.nn.CrossEntropyLoss(label_smoothing=cfg.get('label_smoothing', 0.0)) softmax_temp = cfg.get('softmax_temp', 0.05) self.scale = 1.0 / softmax_temp - train_file_path = self.cfg.data.data_prefix - with open(train_file_path) as f: - train_data = json.load(f) - - random_seed = 42 - set_seed(random_seed) - random.shuffle(train_data) - - self.train_data = train_data + try: + train_file_path = self.cfg.data.data_prefix + with open(train_file_path) as f: + train_data = json.load(f) + + random_seed = 42 + set_seed(random_seed) + random.shuffle(train_data) + + self.train_data = train_data + logging.warning("Model is running in training mode") + except: + logging.warning( + "Model is running inference mode as training data is not specified, or could not be loaded" + ) + random_seed = 42 + set_seed(random_seed) def model_provider_func(self, pre_process, post_process): cfg = self.cfg From 1baaff7567508127dd3778eed77e185b8cf10dc0 Mon Sep 17 00:00:00 2001 From: Andrei Andrusenko <52885736+andrusenkoau@users.noreply.github.com> Date: Thu, 14 Mar 2024 19:32:48 +0400 Subject: [PATCH 020/140] Add ASR context-biasing tutorial (#8462) * add ctcws tutorial Signed-off-by: andrusenkoau * clear sell outputs Signed-off-by: andrusenkoau * fixes Signed-off-by: andrusenkoau * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes Signed-off-by: andrusenkoau * fixes Signed-off-by: 
andrusenkoau * fixes Signed-off-by: andrusenkoau --------- Signed-off-by: andrusenkoau Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../context_biasing/context_graph_ctc.py | 22 +- tutorials/asr/ASR_Context_Biasing.ipynb | 947 ++++++++++++++++++ tutorials/asr/README.md | 1 + 3 files changed, 963 insertions(+), 7 deletions(-) create mode 100644 tutorials/asr/ASR_Context_Biasing.ipynb diff --git a/nemo/collections/asr/parts/context_biasing/context_graph_ctc.py b/nemo/collections/asr/parts/context_biasing/context_graph_ctc.py index 5c9c3924625d..bcfcdf2435f1 100644 --- a/nemo/collections/asr/parts/context_biasing/context_graph_ctc.py +++ b/nemo/collections/asr/parts/context_biasing/context_graph_ctc.py @@ -179,22 +179,26 @@ def draw(self, title: Optional[str] = None, symbol_table: Optional[Dict[int, str "size": "8.5,11", "center": "1", "orientation": "Portrait", - "ranksep": "0.4", + "ranksep": "0.30", "nodesep": "0.25", } if title is not None: graph_attr["label"] = title + default_edge_attr = { + "fontsize": "12", + } + default_node_attr = { "shape": "circle", "style": "bold", - "fontsize": "14", + "fontsize": "12", } final_state_attr = { "shape": "doublecircle", "style": "bold", - "fontsize": "14", + "fontsize": "12", } dot = graphviz.Digraph(name="Context Graph", graph_attr=graph_attr) @@ -221,14 +225,18 @@ def draw(self, title: Optional[str] = None, symbol_table: Optional[Dict[int, str if node.index != current_node.index: output, input, arc = str(current_node.index), str(node.index), f"{label}" if (output, input, arc) not in printed_arcs: - dot.edge(output, input, label=arc) + if arc == self.blank_token: + dot.edge(output, input, label=self.blank_token, color="blue", **default_edge_attr) + else: + dot.edge(output, input, label=arc) queue.append(node) else: output, input, arc = str(current_node.index), str(current_node.index), f"{label}" if (output, input, arc) not in printed_arcs: - dot.edge( - output, input, label=arc, color="green", - ) + if arc == self.blank_token: + dot.edge(output, input, label=self.blank_token, color="blue", **default_edge_attr) + else: + dot.edge(output, input, label=arc, color="green") printed_arcs.add((output, input, arc)) return dot diff --git a/tutorials/asr/ASR_Context_Biasing.ipynb b/tutorials/asr/ASR_Context_Biasing.ipynb new file mode 100644 index 000000000000..f001ce3d65a2 --- /dev/null +++ b/tutorials/asr/ASR_Context_Biasing.ipynb @@ -0,0 +1,947 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "17b3cbf8", + "metadata": {}, + "source": [ + "# Context-Biasing for ASR models with CTC-based Word Spotter" + ] + }, + { + "cell_type": "markdown", + "id": "1156d1d1", + "metadata": {}, + "source": [ + "This tutorial aims to show how to improve the recognition accuracy of specific words in NeMo framework\n", + "for CTC and Trasducer (RNN-T) ASR models by using the fast context-biasing method with CTC-based Word Spotter.\n", + "\n", + "## Tutorial content:\n", + "* Intro in the context-biasing problem\n", + "* Description of the proposed CTC-based Words Spotter (CTC-WS) method\n", + "* Practical part 1 (base):\n", + " * Download data set and ASR models\n", + " * Build context-biasing list\n", + " * Evaluate recognition results with and without context-biasing\n", + " * Improve context-biasing results with alternative transcriptions\n", + "* Practical part 2 (advanced):\n", + " * Visualization of context-biasing graph\n", + " * Running CTC-based Word Spotter only\n", + " * Merge greedy decoding results with 
spotted context-biasing candidates\n", + " * Results analysis\n", + "* Summary" + ] + }, + { + "cell_type": "markdown", + "id": "431edfbf", + "metadata": {}, + "source": [ + "## Context-biasing: intro\n", + "\n", + "ASR models often struggle to recognize words that were absent or had few examples in the training data.\n", + "This problem is especially acute due to the emergence of new names and titles in a rapidly developing world.\n", + "Users need to be able to recognize these new words.\n", + "Context-biasing methods attempt to solve this problem by assuming that we have a list of words and phrases (context-biasing list) in advance\n", + "for which we want to improve recognition accuracy.\n", + "\n", + "One of the directions of context-biasing methods is based on the `deep fusion` approach.\n", + "These methods require intervention into the ASR model and its training process.\n", + "The main disadvantage of these methods is that they require a lot of computational resources and time to train the model.\n", + "\n", + "Another direction consists of methods based on the `shallow fusion` approach. In this case, only the decoding process is modified.\n", + "During the beam-search decoding, the hypothesis is rescored depending on whether the current word is present in the context-biasing list or an external language model.\n", + "The beam-search decoding may be computationally expensive, especially for the models with a large vocabulary and context-biasing list.\n", + "This problem is considerably worsened in the case of the Transducer (RNN-T) model since beam-search decoding involves multiple Decoder (Prediction) and Joint networks calculations.\n", + "Moreover, the context-biasing recognition is limited by the model prediction pool biased toward training data. In the case of rare or new words, the model may not have a hypothesis for the desired word from the context-biasing list whose probability we want to amplify." + ] + }, + { + "cell_type": "markdown", + "id": "ae0bfd60", + "metadata": {}, + "source": [ + "## CTC-based Word Spotter\n", + "\n", + "\n", + "This tutorial considers a fast context-biasing method using a CTC-based Word Spotter (CTC-WS).\n", + "The method involves decoding CTC log probabilities with a context graph built for words and phrases from the context-biasing list.\n", + "The spotted context-biasing candidates (with their scores and time intervals) are compared by scores with words from the greedy\n", + "CTC decoding results to improve recognition accuracy and prevent false accepts of context-biasing (Figure 1). \n", + " \n", + " \n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "c7e45bf2", + "metadata": {}, + "source": [ + "
\n", + " \"CTC-WS\" \n", + "
Figure 1. High-level representation of the proposed context-biasing method with CTC-WS in the case of a CTC model. Detected words (gpu, nvidia, cuda) are compared with words from the greedy CTC results in the overlapping intervals according to the accumulated scores to prevent false-accept replacements.
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "ba163f41", + "metadata": {}, + "source": [ + "\n", + "\n", + " \n", + "A [Hybrid Transducer-CTC](https://arxiv.org/abs/2312.17279) model (a shared encoder trained together with CTC and Transducer output heads) enables the use of the CTC-WS method for the Transducer model.\n", + "Context-biasing candidates obtained by CTC-WS are also filtered by the scores with greedy CTC predictions and then merged with greedy Transducer results.\n", + "\n", + "The CTC-WS method allows using pretrained NeMo models (`CTC` or `Hybrid Transducer-CTC`) for context-biasing recognition without model retraining (Figure 2).\n", + "The method shows inspired results for context-biasing with only a little additional work time and computational resources.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "c05b16d8", + "metadata": {}, + "source": [ + "
\n", + " \"CTC-WS\" \n", + "
Figure 2. Scheme of the context-biasing method with CTC-based Word Spotter. CTC-WS uses CTC log probabilities to detect context-biasing candidates. Obtained candidates are filtered by CTC word alignment and then merged with CTC or RNN-T word alignment to get the final text result.
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "ac0ec822", + "metadata": {}, + "source": [ + "# Installing dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69c86a4f", + "metadata": {}, + "outputs": [], + "source": [ + "BRANCH = 'main'\n", + "\n", + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies.\n", + "\"\"\"\n", + "\n", + "import os\n", + "# either provide a path to local NeMo repository with NeMo already installed or git clone\n", + "\n", + "# option #1: local path to NeMo repo with NeMo already installed\n", + "NEMO_DIR_PATH = os.path.dirname(os.path.dirname(os.path.abspath('')))\n", + "\n", + "# check if Google Colab is being used\n", + "try:\n", + " import google.colab\n", + " IN_COLAB = True\n", + "except (ImportError, ModuleNotFoundError):\n", + " IN_COLAB = False\n", + "\n", + "# option #2: download NeMo repo\n", + "if IN_COLAB or not os.path.exists(os.path.join(NEMO_DIR_PATH, \"nemo\")):\n", + " ## Install dependencies\n", + " !apt-get install sox libsndfile1 ffmpeg\n", + "\n", + " !git clone -b $BRANCH https://github.com/NVIDIA/NeMo\n", + " %cd NeMo\n", + " !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", + " NEMO_DIR_PATH = os.path.abspath('')\n", + "\n", + "import sys\n", + "sys.path.insert(0, NEMO_DIR_PATH)" + ] + }, + { + "cell_type": "markdown", + "id": "5260d4fa", + "metadata": {}, + "source": [ + "## Practical part 1 (base)\n", + "In this part, we will consider the base usage of the CTC-WS method for pretrained NeMo models.\n", + "\n", + "### Data preparation.\n", + "We will use a subset of the GTC data set. The data set contains 10 audio files with NVIDIA GTC talks. \n", + "The primary data set feature is the computer science and engineering domain, which has a large number of unique terms and product names (NVIDIA, GPU, GeForce, Ray Tracing, Omniverse, teraflops, etc.), which is good fit for the context-biasing task. All the text data is normalized and lowercased." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "637f2c6d", + "metadata": {}, + "outputs": [], + "source": [ + "# download data\n", + "!wget https://asr-tutorial-data.s3.eu-north-1.amazonaws.com/context_biasing_data.gz\n", + "!tar -xvzf context_biasing_data.gz\n", + "!apt-get install tree" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6baefc80", + "metadata": {}, + "outputs": [], + "source": [ + "!tree context_biasing_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09fe748b", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.collections.asr.parts.utils.manifest_utils import read_manifest\n", + "\n", + "# data is already stored in nemo data manifest format\n", + "test_nemo_manifest = \"./context_biasing_data/gtc_data_subset_10f.json\"\n", + "test_data = read_manifest(test_nemo_manifest)\n", + "\n", + "for idx, item in enumerate(test_data):\n", + " print(f\"[{idx}]: {item['text']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64ab4764", + "metadata": {}, + "outputs": [], + "source": [ + "import librosa\n", + "import IPython.display as ipd\n", + "\n", + "# load and listen to the audio file example\n", + "example_file = test_data[0]['audio_filepath']\n", + "audio, sample_rate = librosa.load(example_file)\n", + "\n", + "file_id = 0\n", + "print(f\"[TEXT {file_id}]: {test_data[file_id]['text']}\\n\")\n", + "ipd.Audio(example_file, rate=sample_rate)" + ] + }, + { + "cell_type": "markdown", + "id": "a85ea8ec", + "metadata": {}, + "source": [ + "### Load ASR models\n", + "\n", + "For testing the CTC-WS method, we will use the following NeMo models:\n", + " - (CTC): [stt_en_fastconformer_ctc_large](https://huggingface.co/nvidia/stt_en_fastconformer_ctc_large) - a large fast-conformer model trained on English ASR data\n", + " - (Hybrid Transducer-CTC): [stt_en_fastconformer_hybrid_large_streaming_multi](https://huggingface.co/nvidia/stt_en_fastconformer_hybrid_large_streaming_multi) - a large fast-conformer model trained jointly with CTC and Transducer heads on English ASR data. The model is streaming, which means it can process audio in real time. It can cause a slight WER degradation in comparison with the first offline model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d34ee0ba", + "metadata": { + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "from nemo.collections.asr.models import EncDecCTCModelBPE, EncDecHybridRNNTCTCBPEModel\n", + "\n", + "# ctc model\n", + "ctc_model_name = \"stt_en_fastconformer_ctc_large\"\n", + "ctc_model = EncDecCTCModelBPE.from_pretrained(model_name=ctc_model_name)\n", + "\n", + "# hybrid transducer-ctc model\n", + "hybrid_ctc_rnnt_model_name = \"stt_en_fastconformer_hybrid_large_streaming_multi\"" + ] + }, + { + "cell_type": "markdown", + "id": "082208cd", + "metadata": {}, + "source": [ + "### Transcribe \n", + "Let's transcribe the test data and analyze the recognition accuracy of specific words." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74436885", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "test_audio_files = [item['audio_filepath'] for item in test_data]\n", + "recog_results = ctc_model.transcribe(test_audio_files)" + ] + }, + { + "cell_type": "markdown", + "id": "b993d650", + "metadata": {}, + "source": [ + "### Compute per-word recognition statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70f5714b", + "metadata": {}, + "outputs": [], + "source": [ + "import texterrors\n", + "\n", + "word_dict = {} # {word: [num_of_occurrences, num_of_correct_recognition]}\n", + "eps = \"<eps>\" # gap (epsilon) symbol used in the texterrors alignment\n", + "ref_text = [item['text'] for item in test_data]\n", + "\n", + "for idx, ref in enumerate(ref_text):\n", + "    ref = ref.split()\n", + "    hyp = recog_results[idx].split()\n", + "    texterrors_ali = texterrors.align_texts(ref, hyp, False)\n", + "    ali = []\n", + "    for i in range(len(texterrors_ali[0])):\n", + "        ali.append((texterrors_ali[0][i], texterrors_ali[1][i]))\n", + "\n", + "    for pair in ali:\n", + "        word_ref, word_hyp = pair\n", + "        if word_ref == eps:\n", + "            continue\n", + "        if word_ref in word_dict:\n", + "            word_dict[word_ref][0] += 1\n", + "        else:\n", + "            word_dict[word_ref] = [1, 0]\n", + "        if word_ref == word_hyp:\n", + "            word_dict[word_ref][1] += 1\n", + "\n", + "word_candidats = {}\n", + "\n", + "for word in word_dict:\n", + "    gt = word_dict[word][0]\n", + "    tp = word_dict[word][1]\n", + "    if tp/gt < 1.0:\n", + "        word_candidats[word] = [gt, round(tp/gt, 2)]\n", + " \n", + "# print obtained per-word statistic\n", + "word_candidats_sorted = sorted(word_candidats.items(), key=lambda x:x[1][0], reverse=True)\n", + "max_word_len = max([len(x[0]) for x in word_candidats_sorted])\n", + "for item in word_candidats_sorted:\n", + "    print(f\"{item[0]:<{max_word_len}} {item[1][0]}/{item[1][1]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "27a9f88b", + "metadata": {}, + "source": [ + "## Create a context-biasing list\n", + "\n", + "Now, we need to select the words whose recognition we want to improve by CTC-WS context-biasing.\n", + "Usually, we select only nontrivial words with the lowest recognition accuracy.\n", + "Such words should have a character length >= 3 because short words in a context-biasing list may produce high false-positive recognition.\n", + "In this toy example, we will select all the words that look like names with a recognition accuracy less than 1.0.\n", + "\n", + "The structure of the context-biasing file is:\n", + "\n", + "WORD1_TRANSCRIPTION1 \n", + "WORD2_TRANSCRIPTION1 \n", + "...\n", + "\n", + "TRANSCRIPTION here is a word spelling. 
We need this structure to add alternative transcriptions (spellings) for some word. We will cover such a case further." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c27848f0", + "metadata": {}, + "outputs": [], + "source": [ + "cb_words = [\"gpu\", \"nvidia\", \"nvidia's\", \"nvlink\", \"omniverse\", \"cunumeric\", \"numpy\", \"dgx\", \"dgxs\", \"dlss\",\n", + " \"cpu\", \"tsmc\", \"culitho\", \"xlabs\", \"tensorrt\", \"tensorflow\", \"pytorch\", \"aws\", \"chatgpt\", \"pcie\"]\n", + "\n", + "# write context-biasing file \n", + "cb_list_file = \"context_biasing_data/context_biasing_list.txt\"\n", + "with open(cb_list_file, \"w\", encoding=\"utf-8\") as fn:\n", + " for word in cb_words:\n", + " fn.write(f\"{word}_{word}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0e8c800", + "metadata": {}, + "outputs": [], + "source": [ + "!cat {cb_list_file}" + ] + }, + { + "cell_type": "markdown", + "id": "c44fc910", + "metadata": {}, + "source": [ + "## Run context-biasing evaluation\n", + "\n", + "The main script for CTC-WS context-biasing in NeMo is:\\\n", + "`{NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py`\n", + "\n", + "Context-biasing is managed by `apply_context_biasing` parameter [true or false]. \n", + "Other important context-biasing parameters are:\n", + "- `beam_threshold` - threshold for CTC-WS beam pruning\n", + "- `context_score` - per token weight for context biasing\n", + "- `ctc_ali_token_weight` - per token weight for CTC alignment (prevents false acceptances of context-biasing words) \n", + "\n", + "All the context-biasing parameters are selected according to the default values in the script. \n", + "You can tune them according to your data and ASR model (list all the values in the [] separated by commas) \n", + "for example: `beam_threshold=[7.0,8.0,9.0]`, `context_score=[3.0,4.0,5.0]`, `ctc_ali_token_weight=[0.5,0.6,0.7]`. \n", + "The script will run the recognition with all the combinations of the parameters and will select the best one based on WER value." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a2d32e9", + "metadata": {}, + "outputs": [], + "source": [ + "# create directory with experimental results\n", + "import os\n", + "\n", + "exp_dir = \"exp\"\n", + "if not os.path.isdir(exp_dir):\n", + " os.makedirs(exp_dir)\n", + "else:\n", + " print(f\"Directory '{exp_dir}' already exists\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "116f2abe", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# ctc model (no context-biasing)\n", + "\n", + "!python {NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py \\\n", + " nemo_model_file={ctc_model_name} \\\n", + " input_manifest={test_nemo_manifest} \\\n", + " preds_output_folder={exp_dir} \\\n", + " decoder_type=\"ctc\" \\\n", + " acoustic_batch_size=64 \\\n", + " apply_context_biasing=false \\\n", + " context_file={cb_list_file} \\\n", + " beam_threshold=[7.0] \\\n", + " context_score=[3.0] \\\n", + " ctc_ali_token_weight=[0.5]" + ] + }, + { + "cell_type": "markdown", + "id": "674d0af1", + "metadata": {}, + "source": [ + "The results must be:\n", + "\n", + "`Precision`: 1.0000 (1/1) fp:0 (fp - false positive recognition) \n", + "`Recall`: 0.0333 (1/30) \n", + "`Fscore`: 0.0645 \n", + "`Greedy WER/CER` = 35.68%/8.16%\n", + "\n", + "The model could recognize 1 out of 30 words from the context-biasing list.\n", + "Let's enable context-biasing during decoding:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "239da41d", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# ctc model (with context biasing)\n", + "!python {NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py \\\n", + " nemo_model_file={ctc_model_name} \\\n", + " input_manifest={test_nemo_manifest} \\\n", + " preds_output_folder={exp_dir} \\\n", + " decoder_type=\"ctc\" \\\n", + " acoustic_batch_size=64 \\\n", + " apply_context_biasing=true \\\n", + " context_file={cb_list_file} \\\n", + " beam_threshold=[7.0] \\\n", + " context_score=[3.0] \\\n", + " ctc_ali_token_weight=[0.5]" + ] + }, + { + "cell_type": "markdown", + "id": "faa1e73c", + "metadata": {}, + "source": [ + "Now, recognition results are much better:\n", + "\n", + "`Precision`: 1.0000 (21/21) fp:0 \n", + "`Recall`: 0.7000 (21/30) \n", + "`Fscore`: 0.8235 \n", + "`Greedy WER/CER` = 17.09%/4.43%\n", + "\n", + "But we are still able to recognize only 21 out of 30 specific words.\\\n", + "You can see that unrecognized words are mostly abbreviations (`dgxs`, `dlss`, `gpu`, `aws`, etc.) or compound words (`culitho`).\\\n", + "The ASR models tends to recognize such words as a sequence of characters (`\"aws\" -> \"a w s\"`) or subwords (`\"culitho\" -> \"cu litho\"`).\\\n", + "We can try to improve the recognition of such words by adding alternative transcriptions to the context-biasing list." + ] + }, + { + "cell_type": "markdown", + "id": "d72b6391", + "metadata": {}, + "source": [ + "### Alternative transcriptions\n", + "\n", + "wordninja is used to split compound words into simple words according to the default word dictionary." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7e00263", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install wordninja" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46fe91e9", + "metadata": {}, + "outputs": [], + "source": [ + "import wordninja\n", + "\n", + "cb_list_file_modified = cb_list_file + \".abbr_and_ninja\"\n", + "\n", + "with open(cb_list_file, \"r\", encoding=\"utf-8\") as fn1, \\\n", + " open(cb_list_file_modified, \"w\", encoding=\"utf-8\") as fn2:\n", + "\n", + " for line in fn1:\n", + " word = line.strip().split(\"_\")[0]\n", + " new_line = f\"{word}_{word}\"\n", + " # split all the short words into characters\n", + " if len(word) <= 4 and len(word.split()) == 1:\n", + " abbr = ' '.join(list(word))\n", + " new_line += f\"_{abbr}\"\n", + " # split the long words into the simple words (not for phrases)\n", + " new_segmentation = wordninja.split(word)\n", + " if word != new_segmentation[0]:\n", + " new_segmentation = ' '.join(new_segmentation)\n", + " new_line += f\"_{new_segmentation}\"\n", + " fn2.write(f\"{new_line}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da69da45", + "metadata": {}, + "outputs": [], + "source": [ + "!cat {cb_list_file_modified}" + ] + }, + { + "cell_type": "markdown", + "id": "4a21cbf4", + "metadata": {}, + "source": [ + "Run context-biasing with modified context-biasing list:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "913a0f5e", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# ctc models (with context biasing and modified cb list)\n", + "!python {NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py \\\n", + " nemo_model_file={ctc_model_name} \\\n", + " input_manifest={test_nemo_manifest} \\\n", + " preds_output_folder={exp_dir} \\\n", + " decoder_type=\"ctc\" \\\n", + " acoustic_batch_size=64 \\\n", + " apply_context_biasing=true \\\n", + " context_file={cb_list_file_modified} \\\n", + " beam_threshold=[7.0] \\\n", + " context_score=[3.0] \\\n", + " ctc_ali_token_weight=[0.5]" + ] + }, + { + "cell_type": "markdown", + "id": "654751ed", + "metadata": {}, + "source": [ + "Now, the recognition results are:\n", + "\n", + "`Precision`: 1.0000 (28/28) fp:1 \n", + "`Recall`: 0.9333 (28/30) \n", + "`Fscore`: 0.9655 \n", + "`Greedy WER/CER` = 7.04%/2.93%\n", + "\n", + "As you can see, that adding alternative transcriptions to the cb_list file improved the recognition accuracy of the context-biasing words. However, we still miss 2 words. Unfortunately, this algorithm is not a silver bullet.\n", + "\n", + "In some cases, you can improve results by adding alternative transcriptions manually based on the recognition errors of your ASR model for the specific words (for example, `\"nvidia\" -> \"n video\"`). " + ] + }, + { + "cell_type": "markdown", + "id": "b96c4023", + "metadata": {}, + "source": [ + "### Hybrid Transducer-CTC model\n", + "The CTC-WS context-biasing method for Transducer (RNN-T) models is supported only for Hybrid Transducer-CTC model. \n", + "To use Transducer head of the Hybrid Transducer-CTC model, we need to set `decoder_type=\"rnnt\"`. \n", + "Other parameters are the same as for the CTC model because the context-biasing is applied only on the CTC part of the model. 
Spotted context-biasing words will be merged with the greedy decoding results of the Transducer head.\n", + "\n", + "We can use the already prepared context-biasing list because the CTC and Hybrid Transducer-CTC models have almost the same BPE tokenizer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "456e47df", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Transducer model (no context-biasing)\n", + "!python {NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py \\\n", + " nemo_model_file={hybrid_ctc_rnnt_model_name} \\\n", + " input_manifest={test_nemo_manifest} \\\n", + " preds_output_folder={exp_dir} \\\n", + " decoder_type=\"rnnt\" \\\n", + " acoustic_batch_size=64 \\\n", + " apply_context_biasing=false \\\n", + " context_file={cb_list_file_modified} \\\n", + " beam_threshold=[7.0] \\\n", + " context_score=[3.0] \\\n", + " ctc_ali_token_weight=[0.5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "773e11f1", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Transducer model (with context-biasing)\n", + "!python {NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py \\\n", + " nemo_model_file={hybrid_ctc_rnnt_model_name} \\\n", + " input_manifest={test_nemo_manifest} \\\n", + " preds_output_folder={exp_dir} \\\n", + " decoder_type=\"rnnt\" \\\n", + " acoustic_batch_size=64 \\\n", + " apply_context_biasing=true \\\n", + " context_file={cb_list_file_modified} \\\n", + " beam_threshold=[7.0] \\\n", + " context_score=[3.0] \\\n", + " ctc_ali_token_weight=[0.5]" + ] + }, + { + "cell_type": "markdown", + "id": "45a91385", + "metadata": {}, + "source": [ + "CTC-WS context-biasing works for the Transducer model as well as for CTC (`F-score improvement: 0.3784 -> 0.9286`). Differences in the nature of offline and online models may cause differences in results (usually, online models have a tendency to predict tokens earlier, which can affect the difference between the timestamps of CTC and RNN-T models). " + ] + }, + { + "cell_type": "markdown", + "id": "1968e7bc", + "metadata": {}, + "source": [ + "## Practical part 2 (advanced)\n", + "In this section, we will consider the context-biasing process more deeply:\n", + "- Visualization of the context-biasing graph\n", + "- Running CTC-WS with the context-biasing graph\n", + "- Merging the obtained spotted words with greedy decoding results\n", + "- Analysis of the results" + ] + }, + { + "cell_type": "markdown", + "id": "277104b5", + "metadata": {}, + "source": [ + "### Build a context graph (for visualization only)\n", + "The context graph is a composition of a prefix tree (Trie) with the CTC transition topology for words and phrases from the context-biasing list. We use a BPE tokenizer from the target ASR model for word segmentation."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "904ea41b", + "metadata": {}, + "outputs": [], + "source": [ + "from nemo.collections.asr.parts import context_biasing\n", + "\n", + "# get bpe tokenization\n", + "cb_words_small = [\"nvidia\", \"gpu\", \"nvlink\", \"numpy\"]\n", + "context_transcripts = []\n", + "for word in cb_words_small:\n", + " # use text_to_tokens method for viasualization only\n", + " word_tokenization = ctc_model.tokenizer.text_to_tokens(word)\n", + " print(f\"{word}: {word_tokenization}\")\n", + " context_transcripts.append([word, [word_tokenization]])\n", + "\n", + "# build context graph\n", + "context_graph = context_biasing.ContextGraphCTC(blank_id=\"⊘\")\n", + "context_graph.add_to_graph(context_transcripts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7fab1e1", + "metadata": {}, + "outputs": [], + "source": [ + "context_graph.draw()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1c57878", + "metadata": {}, + "outputs": [], + "source": [ + "# install graphviz from source if you have problems with graph picture\n", + "# set instal_graphviz = True\n", + "# this may take about 5-10 minutes\n", + "\n", + "instal_graphviz = False\n", + "\n", + "if instal_graphviz:\n", + " !{NEMO_DIR_PATH}/scripts/installers/install_graphviz.sh" + ] + }, + { + "cell_type": "markdown", + "id": "04a6f4be", + "metadata": {}, + "source": [ + "### Build a real context graph (for decoding)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ba2d8a1", + "metadata": {}, + "outputs": [], + "source": [ + "# get bpe tokenization\n", + "context_transcripts = []\n", + "for word in cb_words:\n", + " word_tokenization = [ctc_model.tokenizer.text_to_ids(x) for x in word]\n", + " context_transcripts.append([word, word_tokenization])\n", + "\n", + "# build context graph\n", + "context_graph = context_biasing.ContextGraphCTC(blank_id=ctc_model.decoding.blank_id)\n", + "context_graph.add_to_graph(context_transcripts)" + ] + }, + { + "cell_type": "markdown", + "id": "71e0e86b", + "metadata": {}, + "source": [ + "### Run CTC-based Word Spotter\n", + "\n", + "The CTC-WS task is to search for words by decoding CTC log probabilities using the context graph. As a result, we obtain a list of detected words with exact start/end frames in the audio file and their overall scores. The relatively small size of the context graph and hypotheses pruning methods allow this algorithm to work very quickly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2bc370b", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from tqdm.notebook import tqdm\n", + "\n", + "# get ctc logprobs\n", + "audio_file_paths = [item['audio_filepath'] for item in test_data]\n", + "\n", + "with torch.no_grad():\n", + " ctc_model.eval()\n", + " ctc_model.encoder.freeze()\n", + " device = next(ctc_model.parameters()).device\n", + " hyp_results = ctc_model.transcribe(audio_file_paths, batch_size=10, return_hypotheses=True)\n", + " ctc_logprobs = [hyp.alignments.cpu().numpy() for hyp in hyp_results]\n", + " blank_idx = ctc_model.decoding.blank_id\n", + " \n", + "# run ctc-based word spotter\n", + "ws_results = {}\n", + "for idx, logits in tqdm(\n", + " enumerate(ctc_logprobs), desc=f\"Eval CTC-based Word Spotter...\", total=len(ctc_logprobs)\n", + "):\n", + " ws_results[audio_file_paths[idx]] = context_biasing.run_word_spotter(\n", + " logits,\n", + " context_graph,\n", + " ctc_model,\n", + " blank_idx=blank_idx,\n", + " beam_threshold=7.0,\n", + " cb_weight=3.0,\n", + " ctc_ali_token_weight=0.5,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bd6645c", + "metadata": {}, + "outputs": [], + "source": [ + "# print CTC-WS hypotheses for the first audio file\n", + "ws_results[audio_file_paths[0]]" + ] + }, + { + "cell_type": "markdown", + "id": "245a66f0", + "metadata": {}, + "source": [ + "### Merge CTC-WS words with greedy CTC decoding results\n", + "\n", + "Use `print_stats=True` to get more information about spotted words and greedy CTC word alignment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "423b2b9e", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "target_transcripts = [item['text'] for item in test_data]\n", + "\n", + "# merge spotted words with greedy results\n", + "for idx, logprobs in enumerate(ctc_logprobs):\n", + " greedy_predicts = np.argmax(logprobs, axis=1)\n", + " if ws_results[audio_file_paths[idx]]:\n", + " # make new text by mearging alignment with ctc-ws predictions:\n", + " print(\"\\n\" + \"********\" * 10)\n", + " print(f\"File name: {audio_file_paths[idx]}\")\n", + " pred_text, raw_text = context_biasing.merge_alignment_with_ws_hyps(\n", + " greedy_predicts,\n", + " ctc_model,\n", + " ws_results[audio_file_paths[idx]],\n", + " decoder_type=\"ctc\",\n", + " blank_idx=blank_idx,\n", + " print_stats=True,\n", + " )\n", + " print(f\"[raw text]: {raw_text}\")\n", + " print(f\"[hyp text]: {pred_text}\")\n", + " print(f\"[ref text]: {target_transcripts[idx]}\")\n", + " else:\n", + " # if no spotted words, use standard greedy predictions\n", + " pred_text = ctc_model.wer.decoding.ctc_decoder_predictions_tensor(greedy_predicts)[0][0]" + ] + }, + { + "cell_type": "markdown", + "id": "fb8b5f51", + "metadata": {}, + "source": [ + "In these logs, you can find detailed context-biasing statistics about each audio file:\n", + "- Audio file name\n", + "- Greedy word alignment\n", + "- List of spotted words\n", + "- Text results:\n", + " - Greedy decoding (raw text)\n", + " - Text after applying context-biasing (hyp text)\n", + " - Ground truth transcription (ref text)\n", + " \n", + "These statistics can be helpful in case of problems with context-biasing word recognition. For example, Transducer models sometimes recognize tokens 1-2 frames earlier than CTC models. To solve this problem, you can shift the start frame of the detected word left in the CTC-WS sources." 
+ ] + }, + { + "cell_type": "markdown", + "id": "11220db2", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "This tutorial demonstrates how to use the CTC-WS context-biasing technique to improve the recognition accuracy of specific words in the case of CTC and Transducer (RNN-T) ASR models. The tutorial includes the methodology for creating the context-biasing list, improving recognition accuracy of abbreviations and compound words, visualization of the context-biasing process, and results analysis.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/asr/README.md b/tutorials/asr/README.md index 77f157acac0c..138e13f58a08 100644 --- a/tutorials/asr/README.md +++ b/tutorials/asr/README.md @@ -34,6 +34,7 @@ In this repository, you will find several tutorials discussing what is Automatic 13) `ASR_Example_CommonVoice_Finetuning`: Learn how to fine-tune an ASR model using CommonVoice to a new alphabet, Esperanto. We walk through the data processing steps of MCV data using HuggingFace Datasets, preparation of the tokenizer, model and then setup fine-tuning. +14) `ASR_Context_Biasing`: This tutorial aims to show how to improve the recognition accuracy of specific words in NeMo framework for CTC and Trasducer (RNN-T) ASR models by using the fast context-biasing method with CTC-based Word Spotter. ---------------- From da2d3c71bea5c862b6fc0a83e035914d3a60c407 Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Thu, 14 Mar 2024 17:22:10 -0400 Subject: [PATCH 021/140] update for manifest loading (#8661) Signed-off-by: stevehuang52 --- .../asr/parts/utils/manifest_utils.py | 19 +++++++++++++++++-- .../asr/parts/utils/transcribe_utils.py | 11 +++++------ .../common/parts/preprocessing/manifest.py | 19 ++++++++++++++++++- 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/nemo/collections/asr/parts/utils/manifest_utils.py b/nemo/collections/asr/parts/utils/manifest_utils.py index 71a35ceb3426..e9f91045c9a2 100644 --- a/nemo/collections/asr/parts/utils/manifest_utils.py +++ b/nemo/collections/asr/parts/utils/manifest_utils.py @@ -30,6 +30,7 @@ segments_manifest_to_subsegments_manifest, write_rttm2manifest, ) +from nemo.utils import logging from nemo.utils.data_utils import DataStoreObject @@ -476,10 +477,24 @@ def read_manifest(manifest: Union[Path, str]) -> List[dict]: f = open(manifest.get(), 'r', encoding='utf-8') except: raise Exception(f"Manifest file could not be opened: {manifest}") - for line in f: - item = json.loads(line) + + errors = [] + for line in f.readlines(): + line = line.strip() + if not line: + continue + try: + item = json.loads(line) + except json.JSONDecodeError: + errors.append(line) + continue data.append(item) f.close() + if errors: + logging.error(f"{len(errors)} Errors encountered while reading manifest file: {manifest}") + for error in errors: + logging.error(f"-- Failed to parse line: `{error}`") + raise RuntimeError(f"Errors encountered while reading manifest file: {manifest}") return data diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py 
b/nemo/collections/asr/parts/utils/transcribe_utils.py index e5cd8d7bbc10..681fab751e5f 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -26,7 +26,7 @@ import nemo.collections.asr as nemo_asr from nemo.collections.asr.metrics.wer import word_error_rate from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCModel, EncDecMultiTaskModel -from nemo.collections.asr.parts.utils import rnnt_utils +from nemo.collections.asr.parts.utils import manifest_utils, rnnt_utils from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchASR, FrameBatchMultiTaskAED from nemo.collections.common.metrics.punct_er import OccurancePunctuationErrorRate from nemo.collections.common.parts.preprocessing.manifest import get_full_path @@ -295,10 +295,9 @@ def prepare_audio_data(cfg: DictConfig) -> Tuple[List[str], bool]: return filepaths, partial_audio -def read_and_maybe_sort_manifest(path: str, try_sort: bool = False) -> list[dict]: +def read_and_maybe_sort_manifest(path: str, try_sort: bool = False) -> List[dict]: """Sorts the manifest if duration key is available for every utterance.""" - with open(path) as f: - items = [json.loads(l) for l in f] + items = manifest_utils.read_manifest(path) if try_sort and all("duration" in item for item in items): items = sorted(items, reverse=True, key=lambda item: item["duration"]) return items @@ -563,8 +562,8 @@ def compute_metrics_per_sample( manifest_path: str, reference_field: str = "text", hypothesis_field: str = "pred_text", - metrics: list[str] = ["wer"], - punctuation_marks: list[str] = [".", ",", "?"], + metrics: List[str] = ["wer"], + punctuation_marks: List[str] = [".", ",", "?"], output_manifest_path: str = None, ) -> dict: diff --git a/nemo/collections/common/parts/preprocessing/manifest.py b/nemo/collections/common/parts/preprocessing/manifest.py index d3cc02fe3c68..1d49bd7c7019 100644 --- a/nemo/collections/common/parts/preprocessing/manifest.py +++ b/nemo/collections/common/parts/preprocessing/manifest.py @@ -15,6 +15,7 @@ import json import os import re +from collections import defaultdict from os.path import expanduser from typing import Any, Callable, Dict, Iterator, List, Optional, Union @@ -70,6 +71,7 @@ def item_iter( if parse_func is None: parse_func = __parse_item + errors = defaultdict(list) k = -1 logging.debug('Manifest files: %s', str(manifests_files)) for manifest_file in manifests_files: @@ -78,12 +80,27 @@ def item_iter( logging.debug('Cached at: %s', str(cached_manifest_file)) with open(expanduser(cached_manifest_file), 'r') as f: for line in f: + line = line.strip() + if not line: + continue k += 1 - item = parse_func(line, manifest_file) + try: + item = parse_func(line, manifest_file) + except json.JSONDecodeError: + errors[str(manifest_file)].append(line) + continue item['id'] = k yield item + if len(errors) > 0: + for filename, lines in errors.items(): + logging.error("=============================================") + logging.error(f"Failed to parse {len(lines)} lines from manifest file: {filename}") + for line in lines: + logging.error(f"-- Failed to parse line: `{line}`") + raise RuntimeError("Failed to parse some lines from manifest files. 
See logs for more details.") + def __parse_item(line: str, manifest_file: str) -> Dict[str, Any]: item = json.loads(line) From 01aedc69d673a89c85d680c7bfe58a10d36f718c Mon Sep 17 00:00:00 2001 From: Rachit Garg Date: Thu, 14 Mar 2024 15:50:36 -0700 Subject: [PATCH 022/140] add the persistent_workers to the dataloader (#8654) Signed-off-by: rachitg Co-authored-by: rachitg --- .../nlp/models/language_modeling/megatron_gpt_sft_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 325f039d461b..7ab00f1af85a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -804,6 +804,7 @@ def build_data_loader(self, dataset, data_cfg, consumed_samples=0): collate_fn=collate_fn, num_workers=data_cfg.num_workers, pin_memory=data_cfg.pin_memory, + persistent_workers=True if data_cfg.num_workers > 0 else False, ) def setup_training_dataloader(self): From c9347b98038b704ee79da56bc7ee0e30e6161900 Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Thu, 14 Mar 2024 15:55:09 -0700 Subject: [PATCH 023/140] LLM Embedding model (#8622) * config update Signed-off-by: arendu * save embeddings and some refac Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * entry point script for dumping embeddings to disk Signed-off-by: arendu * normalize query and pos_doc even if no soft negatives are used Signed-off-by: arendu * yaml for generation script Signed-off-by: arendu * all possible negatives Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates Signed-off-by: arendu * logging Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * need to update docstrings Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * headers and rename Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * log diff and fix cs logging Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * non-standard solution to get wandb logger to have the config Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * check for rank Signed-off-by: arendu * cfg working for multi gpu Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * MCoreMixin chages. 
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* using new commit of meg-LM Signed-off-by: arendu
* default to use all layers for lora Signed-off-by: arendu
* validation only uses hard negatives, val scores are batch agnostic Signed-off-by: arendu
* minor reorg Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* metadata and bug fixes Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* dump embeddings with traceable ids, disabled val logs for the moment Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* val ids Signed-off-by: arendu
* val ids by consumed samples Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* don't gather if not saving embs Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* init global step to allow consumed samples to be called in test time Signed-off-by: arendu
* enable adapters with packed seq Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* Add the following changes for PTL 2.1: 1) Remove LightningModuleWrapperBase around model as it's not required with PTL 2.1 2) Make precision None when using the precision plugin in MegatronTrainerBuilder 3) Change dataloader_iter API for some megatron models Signed-off-by: Abhishree
* Change dataloader_iter API and remove val_iterator_done: 1) Change dataloader_iter API according to PTL 2.1 for BERT and GPT models 2) Comment self._val_iterator_done for all megatron models Signed-off-by: Abhishree
* Override format_checkpoint_name and fix dataloader_iter API Signed-off-by: Abhishree
* Update PTL version in requirements Signed-off-by: Abhishree
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* Remove unused import and comment val_iterator_done Signed-off-by: Abhishree
* Override _link_checkpoint Signed-off-by: Abhishree
* Temporarily disable GPU unit tests Signed-off-by: Abhishree
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* Temporarily comment out CPU unit tests Signed-off-by: Abhishree
* Remove precision arg from Trainer in convert_hf_llama_to_nemo.py Signed-off-by: Abhishree
* Fix dataloader_iter API for megatron_lm_encoder_decoder_model.py Signed-off-by: Abhishree
* Temporarily disable NMT Training TP=2 test Signed-off-by: Abhishree
* Fix val_step, test_step func API of MegatronLMEncoderDecoderModel Signed-off-by: Abhishree
* Enable NMT training TP=2 test Signed-off-by: Abhishree
* Disable some unit tests Signed-off-by: Abhishree
* Comment CI tests Signed-off-by: Abhishree
* Comment resume part of BART Signed-off-by: Abhishree
* Uncomment few lines from JenkinsFile Signed-off-by: Abhishree
* Return len of dataloader in microbatches Signed-off-by: Abhishree
* Fix _link_checkpoint: 1) Add inject_model_parallel_rank to _link_checkpoint 2) Override super._link_checkpoint to remove condition check for rank 0 Signed-off-by: Abhishree
* Check if using dist ckpt in _link_checkpoint Signed-off-by: Abhishree
* Temporarily disable GPT with PP=2 Signed-off-by: Abhishree
* Remove batch_idx arg from validation_step megatron_gpt_sft_model.py Signed-off-by: Abhishree
* Use PTL bug fix branch to test unit tests with the PTL bug fix https://github.com/Lightning-AI/pytorch-lightning/pull/19344/files Signed-off-by: Abhishree
* Temporarily disable test_ema_saved_state in test_ema.py Signed-off-by: Abhishree
* Skip test_beam_decoding_preserve_alignments in test_rnnt_decoding.py Signed-off-by: Abhishree
* Use PTL with fs.lexists Signed-off-by: Abhishree
* Comment _link_checkpoint related overrides in order to test with PTL without symbolic links Signed-off-by: Abhishree
* Return only batch for dataloader_iter in DFT model Signed-off-by: Abhishree
* Modify get_batch in GPTModel Signed-off-by: Abhishree
* Add condition checks for batch extraction from dataloader_iter Signed-off-by: Abhishree
* Add missing condition check for batch extraction in GPTModel Signed-off-by: Abhishree
* Add condition check for dataloader_iter extraction in MegatronLMEncoderDecoder Signed-off-by: Abhishree
* Comment test_invalid_checkpoints_removed_from_topk in test_exp_manager.py Signed-off-by: Abhishree
* Fix test invalid ckpts in test_exp_manager.py; also uncomment some of the commented out tests in JenkinsFile and test_ema.py Signed-off-by: Abhishree
* Fix bug in test_invalid_checkpoints_removed_from_topk Signed-off-by: Abhishree
* Fix validation step of GPTModel for finetuning case with multi dataloaders Signed-off-by: Abhishree
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* multi dataloaders for validation query and docs Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* validation loop made more efficient with 2 dataloaders Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* WIP test set generation Signed-off-by: arendu
* generate working for multi dataloaders Signed-off-by: arendu
* Fix test_step_outputs for SFT in GPTModel Signed-off-by: Abhishree
* Pass dataloader_idx for val_step of GPTModel and remove unwanted code: 1) Pass dataloader_idx to val_step of GPTModel as it's required for GPTSFTModel with multi dataloaders to append the outputs correctly to val/test_step_output 2) Remove val_iterator_done check from all megatron GPT models Signed-off-by: Abhishree
* Add condition check for extraction of batch in T5SFTModel & LMEncoderDecoder Signed-off-by: Abhishree
* Add condition check for extracting batch in MegatronNMTModel; also uncomment GPT PP=2 and NMT tests from JenkinsFile Signed-off-by: Abhishree
* Fix typo and uncomment multimodal tests Signed-off-by: Abhishree
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* default names Signed-off-by: arendu
* Change to new dataloader_iter API for MultiModal Signed-off-by: Abhishree
* Fix new dataloader API for MegatronLatentDiffusion model Signed-off-by: Abhishree
* Store and restore precision value in MegatronGPTSFTModel Signed-off-by: Abhishree
* Temporarily comment Multimodal Stable Diffusion Train Signed-off-by: Abhishree
* Update JenkinsFile for multimodal with latest main Signed-off-by: Abhishree
* Upgrade PTL to version 2.2 in reqs Signed-off-by: Abhishree
* Install PTL 2.2 from fork Signed-off-by: Abhishree
* Add strict arg to load_model_state_dict func in NLPDDPStrategy Signed-off-by: Abhishree
* Delete megatron_t5_adapter_tuning.py and megatron_t5_ia3_tuning.py; these files were added in the branch by mistake Signed-off-by: Abhishree
* Delete megatron_t5_prompt_learning.py that got added by mistake Signed-off-by: Abhishree
* Add appropriate comments, code clean up Signed-off-by: Abhishree
* Remove PTL installation from JenkinsFile Signed-off-by: Abhishree
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* update Signed-off-by: arendu
* llm embeddings with ptl2.2 Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* global in-batch negatives using all-gather Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* remove old files Signed-off-by: arendu
* remove changes in untouched files Signed-off-by: arendu
* inference for embedding model from ckpt Signed-off-by: arendu
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
---------
Signed-off-by: arendu
Signed-off-by: Jiaqi Zeng
Signed-off-by: Adi Renduchintala
Signed-off-by: Abhishree
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jiaqi Zeng
Co-authored-by: Tugrul Konuk
Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Eric Harper --- Jenkinsfile | 356 +++++++------- ...megatron_gpt_embedder_generate_config.yaml | 216 +++++++++ .../megatron_gpt_embedder_tuning_config.yaml | 212 +++++++++ .../megatron_gpt_embedding_finetuning.py | 74 +++ .../megatron_gpt_embedding_generate.py | 135 ++++++ .../conf/megatron_gpt_finetuning_config.yaml | 2 +- .../gpt_embedding_dataset.py | 281 ++++++++++++ .../megatron_gpt_embedding_model.py | 433 ++++++++++++++++++ .../language_modeling/megatron_base_model.py | 2 + .../language_modeling/megatron_gpt_model.py | 35 +- .../megatron_gpt_sft_model.py | 234 +++++----- .../megatron_lm_encoder_decoder_model.py | 2 - .../nlp/parts/mixins/nlp_adapter_mixins.py | 3 +- .../construct_random_negatives.py | 0 .../information_retrieval/get_msmarco.sh | 0 15 files changed, 1699 insertions(+), 286 deletions(-) create mode 100644 examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml create mode 100644 examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml create mode 100644 examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py create mode 100644 examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py create mode 100644 nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py create mode 100644 nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py rename {examples/nlp/information_retrieval => scripts}/construct_random_negatives.py (100%) rename {examples/nlp => scripts}/information_retrieval/get_msmarco.sh (100%) diff --git a/Jenkinsfile b/Jenkinsfile index 602c78890262..b278a53d8213 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -85,6 +85,14 @@ pipeline { } } + stage('Pytorch lightning installation') { + steps { + sh 'git clone -b bug_fix https://github.com/athitten/pytorch-lightning.git && \ + cd pytorch-lightning && \ + PACKAGE_NAME=pytorch pip install -e .' 
+ } + } + // pip package should be working with main, if not we can update the commit here // until the pip package is updated stage('Megatron Core installation') { @@ -147,8 +155,8 @@ pipeline { stage('L2: Multimodal Imagen Train') { when { anyOf { - branch 'r1.23.0' - changeRequest target: 'r1.23.0' + branch 'main' + changeRequest target: 'main' } } failFast true @@ -161,7 +169,6 @@ pipeline { trainer.devices=1 \ ++exp_manager.max_time_per_run=00:00:03:00 \ trainer.max_steps=20 \ - model.conditioning.embed_dim=64 \ model.micro_batch_size=1 \ model.global_batch_size=1 \ model.data.synthetic_data=True \ @@ -173,11 +180,12 @@ pipeline { sh "rm -rf /home/TestData/multimodal/imagen_train" } } + stage('L2: Multimodal Stable Diffusion Train') { when { anyOf { - branch 'r1.23.0' - changeRequest target: 'r1.23.0' + branch 'main' + changeRequest target: 'main' } } failFast true @@ -204,81 +212,79 @@ pipeline { model.unet_config.from_pretrained=null \ model.first_stage_config.from_pretrained=null \ model.unet_config.use_flash_attention=False \ - model.unet_config.attention_resolutions=[1] \ - model.unet_config.channel_mult=[1] \ " sh "pip install 'webdataset>=0.1.48,<=0.1.62'" sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" } } -// stage('L2: Multimodal ControlNet Train') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// steps { -// sh "rm -rf /home/TestData/multimodal/controlnet_train" -// sh "pip install webdataset==0.2.48" -// sh "python examples/multimodal/text_to_image/controlnet/controlnet_train.py \ -// trainer.precision=16 \ -// trainer.num_nodes=1 \ -// trainer.devices=1 \ -// ++exp_manager.max_time_per_run=00:00:03:00 \ -// trainer.max_steps=20 \ -// model.micro_batch_size=1 \ -// model.global_batch_size=1 \ -// model.data.synthetic_data=True \ -// exp_manager.exp_dir=/home/TestData/multimodal/controlnet_train \ -// model.inductor=False \ -// model.image_logger.max_images=0 \ -// model.control_stage_config.params.from_pretrained_unet=null \ -// model.unet_config.from_pretrained=null \ -// model.first_stage_config.from_pretrained=null \ -// model.unet_config.use_flash_attention=False \ -// " -// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" -// sh "rm -rf /home/TestData/multimodal/controlnet_train" -// } -// } -// stage('L2: Multimodal DreamBooth Train') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// steps { -// sh "rm -rf /home/TestData/multimodal/dreambooth_train" -// sh "pip install webdataset==0.2.48" -// sh "python examples/multimodal/text_to_image/dreambooth/dreambooth.py \ -// trainer.precision=16 \ -// trainer.num_nodes=1 \ -// trainer.devices=1 \ -// ++exp_manager.max_time_per_run=00:00:03:00 \ -// trainer.max_steps=20 \ -// model.micro_batch_size=1 \ -// model.global_batch_size=1 \ -// exp_manager.exp_dir=/home/TestData/multimodal/dreambooth_train \ -// model.inductor=False \ -// model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ -// ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ -// ++model.cond_stage_config.max_length=77 \ -// ~model.cond_stage_config.restore_from_path \ -// ~model.cond_stage_config.freeze \ -// ~model.cond_stage_config.layer \ -// model.unet_config.from_pretrained=null \ -// model.first_stage_config.from_pretrained=null \ -// model.data.instance_dir=/home/TestData/multimodal/tiny-dreambooth \ -// 
model.unet_config.use_flash_attention=False \ -// " -// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" -// sh "rm -rf /home/TestData/multimodal/dreambooth_train" -// } -// } + stage('L2: Multimodal ControlNet Train') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + steps { + sh "rm -rf /home/TestData/multimodal/controlnet_train" + sh "pip install webdataset==0.2.48" + sh "python examples/multimodal/text_to_image/controlnet/controlnet_train.py \ + trainer.precision=16 \ + trainer.num_nodes=1 \ + trainer.devices=1 \ + ++exp_manager.max_time_per_run=00:00:03:00 \ + trainer.max_steps=20 \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + model.data.synthetic_data=True \ + exp_manager.exp_dir=/home/TestData/multimodal/controlnet_train \ + model.inductor=False \ + model.image_logger.max_images=0 \ + model.control_stage_config.params.from_pretrained_unet=null \ + model.unet_config.from_pretrained=null \ + model.first_stage_config.from_pretrained=null \ + model.unet_config.use_flash_attention=False \ + " + sh "pip install 'webdataset>=0.1.48,<=0.1.62'" + sh "rm -rf /home/TestData/multimodal/controlnet_train" + } + } + stage('L2: Multimodal DreamBooth Train') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + steps { + sh "rm -rf /home/TestData/multimodal/dreambooth_train" + sh "pip install webdataset==0.2.48" + sh "python examples/multimodal/text_to_image/dreambooth/dreambooth.py \ + trainer.precision=16 \ + trainer.num_nodes=1 \ + trainer.devices=1 \ + ++exp_manager.max_time_per_run=00:00:03:00 \ + trainer.max_steps=20 \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + exp_manager.exp_dir=/home/TestData/multimodal/dreambooth_train \ + model.inductor=False \ + model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ + ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ + ++model.cond_stage_config.max_length=77 \ + ~model.cond_stage_config.restore_from_path \ + ~model.cond_stage_config.freeze \ + ~model.cond_stage_config.layer \ + model.unet_config.from_pretrained=null \ + model.first_stage_config.from_pretrained=null \ + model.data.instance_dir=/home/TestData/multimodal/tiny-dreambooth \ + model.unet_config.use_flash_attention=False \ + " + sh "pip install 'webdataset>=0.1.48,<=0.1.62'" + sh "rm -rf /home/TestData/multimodal/dreambooth_train" + } + } stage('L2: Vision ViT Pretrain TP=1') { when { anyOf { @@ -2725,106 +2731,106 @@ pipeline { } } } - stage('L2: Megatron NMT Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - 
model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ - model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model" - // Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - // if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - sh "python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ - model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model" - sh "rm -rf 
examples/nlp/machine_translation/megatron_nmt_results" - } - } + // stage('L2: Megatron NMT Training TP=2') { + // when { + // anyOf { + // branch 'main' + // changeRequest target: 'main' + // } + // } + // failFast true + // steps { + // sh "python examples/nlp/machine_translation/megatron_nmt_training.py \ + // trainer.devices=2 \ + // trainer.accelerator=gpu \ + // trainer.log_every_n_steps=1 \ + // trainer.val_check_interval=10 \ + // +trainer.limit_val_batches=2 \ + // trainer.accumulate_grad_batches=1 \ + // trainer.max_steps=10 \ + // trainer.precision=16 \ + // trainer.gradient_clip_val=1.0 \ + // exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ + // model.tensor_model_parallel_size=2 \ + // model.seq_length=128 \ + // model.encoder.num_layers=4 \ + // model.encoder.hidden_size=64 \ + // model.encoder.num_attention_heads=8 \ + // model.encoder.activation='swiglu' \ + // model.encoder.masked_softmax_fusion=False \ + // model.encoder.bias_activation_fusion=False \ + // model.encoder.activations_checkpoint_method='block' \ + // model.encoder.activations_checkpoint_num_layers=1 \ + // model.decoder.num_layers=2 \ + // model.decoder.hidden_size=64 \ + // model.decoder.num_attention_heads=8 \ + // model.decoder.activation='swiglu' \ + // model.decoder.masked_softmax_fusion=False \ + // model.decoder.bias_activation_fusion=False \ + // model.decoder.activations_checkpoint_method='block' \ + // model.decoder.activations_checkpoint_num_layers=1 \ + // model.micro_batch_size=2 \ + // model.global_batch_size=4 \ + // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + // model.train_ds.num_workers=1 \ + // model.validation_ds.num_workers=1 \ + // ~model.test_ds \ + // model.train_ds.dataset_type=text_memmap \ + // model.encoder_tokenizer.library=sentencepiece \ + // model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ + // model.decoder_tokenizer.library=sentencepiece \ + // model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model" + // // Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error + // // if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() + // sh "python examples/nlp/machine_translation/megatron_nmt_training.py \ + // trainer.devices=2 \ + // trainer.accelerator=gpu \ + // trainer.log_every_n_steps=1 \ + // trainer.val_check_interval=1 \ + // +trainer.limit_val_batches=2 \ + // trainer.accumulate_grad_batches=1 \ + // trainer.max_steps=10 \ + // trainer.precision=16 \ + // trainer.gradient_clip_val=1.0 \ + // exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ + // model.tensor_model_parallel_size=2 \ + // model.seq_length=128 \ + // model.encoder.num_layers=4 \ + // model.encoder.hidden_size=64 \ + // model.encoder.num_attention_heads=8 \ + // model.encoder.activation='swiglu' \ + // model.encoder.masked_softmax_fusion=False \ + // model.encoder.bias_activation_fusion=False \ + // model.encoder.activations_checkpoint_method='block' \ + // 
model.encoder.activations_checkpoint_num_layers=1 \ + // model.decoder.num_layers=2 \ + // model.decoder.hidden_size=64 \ + // model.decoder.num_attention_heads=8 \ + // model.decoder.activation='swiglu' \ + // model.decoder.masked_softmax_fusion=False \ + // model.decoder.bias_activation_fusion=False \ + // model.decoder.activations_checkpoint_method='block' \ + // model.decoder.activations_checkpoint_num_layers=1 \ + // model.micro_batch_size=2 \ + // model.global_batch_size=4 \ + // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ + // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + // model.train_ds.num_workers=1 \ + // model.validation_ds.num_workers=1 \ + // ~model.test_ds \ + // model.train_ds.dataset_type=text_memmap \ + // model.encoder_tokenizer.library=sentencepiece \ + // model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ + // model.decoder_tokenizer.library=sentencepiece \ + // model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model" + // sh "rm -rf examples/nlp/machine_translation/megatron_nmt_results" + // } + // } stage('L2: Megatron BART Perceiver MIM Training TP=2') { // Testing Megatron hidden transformations when { diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml new file mode 100644 index 000000000000..778dc937efdc --- /dev/null +++ b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml @@ -0,0 +1,216 @@ +name: megatron_gpt_peft_${model.peft.peft_scheme}_tuning + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: 16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${model.data.test_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${model.tensor_model_parallel_size} + always_save_nemo: True + save_best_model: True + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + + global_batch_size: 1 + micro_batch_size: 1 + restore_from_path: ??? 
# Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: False + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null + gradient_as_bucket_view: False + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + temperature: 0.8 + num_soft_negatives: 0 # Number of soft negatives to use for contrastive loss,it should be max(batch_size - 1), 0 means use hard negatives only + + peft: + peft_scheme: "lora" # can be either adapter,ia3, or ptuning + restore_from_path: null + restore_from_ckpt: + checkpoint_dir: null + checkpoint_name: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + target_modules: ['attention_qkv','attention_dense','mlp_fc1','mlp_fc2'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) + adapter_dim: 32 + alpha: ${peft.lora_tuning.adapter_dim} + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. 
null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + selective_tuning: + tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre + + data: + return_output_tensors: True + test_ds: + query_file_names: ??? # Path to a list of JSONL files corresponding to the query data. Data format is identical to validation_ds. + doc_file_names: ??? # Path to a list of JSONL files corresponding to the doc data. Data format is identical to validation_ds. + names: ["queries", "doc"] # Names of the corresponding datasets used to log metrics. + global_batch_size: 1 + micro_batch_size: 1 + shuffle: False + num_workers: 0 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + add_eos: True + add_bos: False + write_embeddings_to_file: True + output_file_path_prefix: "test_embeddings" # Prefix of the file to write predictions to. + index_mapping_dir: null # Path to a directory to write index mapping files. + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + +inference: + greedy: True # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
+ compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + outfile_path: output.txt + compute_attention_mask: True + +# server-related configs +server: False # whether launch the API server +port: 5555 # the port number for the inference server +web_server: False # whether launch the web inference server +share: True # whether create a public URL +username: test # user name for web client +password: test2 # password for web client +web_port: 9889 # the port number of the web server 1058 +chat: False # use the chat interface +chatbot_config: + value: False # whether to inject the value attributes + attributes: + - name: Quality + min: 0 + max: 4 + key: quality + type: int + default: 4 + - name: Toxicity + min: 0 + max: 4 + key: toxcity + type: int + default: 0 + - name: Humor + min: 0 + max: 4 + key: humor + type: int + default: 0 + - name: Creativity + min: 0 + max: 4 + key: creativity + type: int + default: 0 + - name: Violence + min: 0 + max: 4 + key: violence + type: int + default: 0 + - name: Helpfulness + min: 0 + max: 4 + key: helpfulness + type: int + default: 4 + - name: Not_Appropriate + min: 0 + max: 4 + key: not_appropriate + type: int + default: 0 + - name: Language + choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh'] + key: lang + type: list + default: en + + user: User + assistant: Assistant + system: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml new file mode 100644 index 000000000000..efd5271884ed --- /dev/null +++ b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml @@ -0,0 +1,212 @@ +name: megatron_gpt_peft_${model.peft.peft_scheme}_tuning + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: 16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 
0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + + global_batch_size: 128 + micro_batch_size: 4 + restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: False + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null + gradient_as_bucket_view: False + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + temperature: 0.8 + num_soft_negatives: 0 # Number of soft negatives to use for contrastive loss,it should be max(batch_size - 1), 0 means use hard negatives only + use_all_possible_negatives: False # If True, use all possible negatives for contrastive loss, otherwise use num_soft_negatives, if num_soft_negatives is 0, use hard negatives only + + peft: + peft_scheme: "lora" # can be either adapter,ia3, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. 
+ column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + target_modules: ['attention_qkv', 'attention_dense', 'mlp_fc1', 'mlp_fc2'] # + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + selective_tuning: + tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre + + data: + return_output_tensors: True + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: ??? # Path to a list of JSONL files corresponding to the source data. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: True + num_workers: 0 + memmap_workers: 2 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + label_key: 'output' + add_eos: True + add_bos: False + index_mapping_dir: null # Path to a directory to write index mapping files. + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + validation_ds: + query_file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + doc_file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: ["queries", "doc"] # Names of the corresponding datasets used to log metrics. 
+ global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + label_key: ${model.data.train_ds.label_key} + add_eos: ${model.data.train_ds.add_eos} + add_bos: ${model.data.train_ds.add_bos} + write_embeddings_to_file: False + output_file_path_prefix: "validation_embeddings" # Prefix of the file to write predictions to. + index_mapping_dir: null # Path to a directory to write index mapping files. + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + add_eos: ${model.data.train_ds.add_eos} + add_bos: ${model.data.train_ds.add_bos} + write_predictions_to_file: True + output_file_path_prefix: "test_embeddings" # Prefix of the file to write predictions to. + index_mapping_dir: null # Path to a directory to write index mapping files. + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + monitor: val_loss + reduce_on_plateau: false \ No newline at end of file diff --git a/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py b/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py new file mode 100644 index 000000000000..e1fe28cc892f --- /dev/null +++ b/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py @@ -0,0 +1,74 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
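+# Example usage (a minimal sketch; the .nemo checkpoint and JSONL paths below are
+# placeholders and should be replaced with real files):
+#
+#   python examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \
+#       model.restore_from_path=/path/to/base_gpt_model.nemo \
+#       model.peft.peft_scheme=lora \
+#       model.data.train_ds.file_names=[/path/to/train_triplets.jsonl] \
+#       model.data.validation_ds.query_file_names=[/path/to/val_queries.jsonl] \
+#       model.data.validation_ds.doc_file_names=[/path/to/val_docs.jsonl]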
+ +from collections.abc import MutableMapping + +import torch.multiprocessing as mp +from omegaconf.omegaconf import OmegaConf +from pytorch_lightning.loggers import WandbLogger + +from nemo.collections.nlp.models.information_retrieval.megatron_gpt_embedding_model import MegatronGPTEmbeddingModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + +mp.set_start_method("spawn", force=True) + + +def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '.') -> MutableMapping: + items = [] + for k, v in d.items(): + new_key = parent_key + sep + k if parent_key else k + if isinstance(v, MutableMapping): + items.extend(flatten_dict(v, new_key, sep=sep).items()) + else: + items.append((new_key, v)) + return dict(items) + + +@hydra_runner(config_path="conf", config_name="megatron_gpt_embedder_tuning_config") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() + exp_manager(trainer, cfg.exp_manager) + + model_cfg = MegatronGPTEmbeddingModel.merge_cfg_with(cfg.model.restore_from_path, cfg) + if trainer.global_rank == 0: + for logger in trainer.loggers: + if isinstance(logger, WandbLogger): + fd = flatten_dict(dict(model_cfg), sep="/") + logger.experiment.config.update(fd) + model = MegatronGPTEmbeddingModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) + peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] + + if cfg.model.peft.restore_from_path is not None: + # initialize peft weights from a checkpoint instead of randomly + # This is not the same as resume training because optimizer states are not restored. + logging.info("PEFT Weights will be loaded from", cfg.model.peft.restore_from_path) + model.load_adapters(cfg.model.peft.restore_from_path, peft_cfg_cls(model_cfg)) + elif peft_cfg_cls is not None: + logging.info("Adding adapter weights to the model for PEFT") + model.add_adapter(peft_cfg_cls(model_cfg)) + else: + logging.info(f"Running full finetuning since no peft scheme is given.\n{model.summarize()}") + + trainer.fit(model) + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py b/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py new file mode 100644 index 000000000000..8cddcebbab62 --- /dev/null +++ b/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py @@ -0,0 +1,135 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
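+# Example usage for dumping query/document embeddings with a tuned adapter (a minimal
+# sketch; all paths are placeholders):
+#
+#   python examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \
+#       model.restore_from_path=/path/to/base_gpt_model.nemo \
+#       model.peft.restore_from_path=/path/to/embedding_adapter.nemo \
+#       model.data.test_ds.query_file_names=[/path/to/test_queries.jsonl] \
+#       model.data.test_ds.doc_file_names=[/path/to/test_docs.jsonl] \
+#       model.data.test_ds.write_embeddings_to_file=True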
+ + +import asyncio +import os +import threading +from functools import partial + +import torch +import torch.multiprocessing as mp +from omegaconf.omegaconf import OmegaConf, open_dict + +from nemo.collections.nlp.models.information_retrieval.megatron_gpt_embedding_model import MegatronGPTEmbeddingModel +from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer +from nemo.collections.nlp.modules.common.text_generation_utils import generate +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.model_utils import inject_model_parallel_rank + +try: + from megatron.core import parallel_state + + HAVE_MEGATRON_CORE = True +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + +mp.set_start_method("spawn", force=True) + + +def use_inference_server(cfg, model, trainer): + if not HAVE_MEGATRON_CORE: + raise ValueError('Megatron-core needs to be installed to use this feature!') + + from nemo.collections.nlp.modules.common.megatron_web_server import get_chatbot_demo, get_demo + + trainer.test(model, dataloaders=None) + + if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0: + if cfg.web_server: + if cfg.chat: + defaults = { + 'user': cfg.chatbot_config.user, + 'assistant': cfg.chatbot_config.assistant, + 'system': cfg.chatbot_config.system, + } + web_ui = partial( + get_chatbot_demo, + defaults=defaults, + value=cfg.chatbot_config.value, + attributes=cfg.chatbot_config.attributes, + ) + else: + web_ui = get_demo + loop = asyncio.new_event_loop() + thread = threading.Thread( + target=web_ui, daemon=True, args=(cfg.share, cfg.username, cfg.password, cfg.port, cfg.web_port, loop), + ) + thread.start() + server = MegatronServer(model.cuda()) + server.run("0.0.0.0", port=cfg.port) + + while True: + choice = torch.cuda.LongTensor(1) + torch.distributed.broadcast(choice, 0) + if choice[0].item() == 0: + generate(model.cuda()) + + +@hydra_runner(config_path="conf", config_name="megatron_gpt_embedder_generate_config") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f"\n{OmegaConf.to_yaml(cfg)}") + trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() + + if cfg.model.peft.restore_from_path: + model_cfg = MegatronGPTEmbeddingModel.merge_inference_cfg(cfg.model.peft.restore_from_path, cfg) + else: + model_cfg = MegatronGPTEmbeddingModel.merge_inference_cfg(cfg.model.restore_from_path, cfg) + + with open_dict(model_cfg): + model_cfg.data.return_output_tensors = True + model_cfg.post_process = False + + model = MegatronGPTEmbeddingModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) + + if cfg.model.peft.restore_from_path: + model.load_adapters(cfg.model.peft.restore_from_path) + elif cfg.model.peft.restore_from_ckpt.checkpoint_dir and cfg.model.peft.restore_from_ckpt.checkpoint_name: + peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] + checkpoint_path = os.path.join( + cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name + ) + # checkpoint_path is a dir in case of distributed checkpointing + if not os.path.isdir(checkpoint_path): + # legacy checkpoint needs model parallel rank injection + checkpoint_path = inject_model_parallel_rank( + os.path.join( + 
cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name + ) + ) + model.load_adapters(checkpoint_path, peft_cfgs=peft_cfg_cls(model_cfg)) + else: + raise NotImplementedError("distributed checkpointing of PEFT weights is not supported") + + model.freeze() + logging.info(f"Freezing parameters for PEFT eval:\n{model.summarize()}") + + if not cfg.model.get('use_flash_attention', False): + cfg.inference.compute_attention_mask = True + config = OmegaConf.to_container(cfg.inference, resolve=True) + model.set_inference_config(config) + + if not cfg.server: + trainer.test(model) + else: + use_inference_server(cfg, model, trainer) + + +if __name__ == "__main__": + main() diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml index 40347f317fbb..a50b578b95f4 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml @@ -101,7 +101,7 @@ model: position_embedding_strategy: null # used only when weight_tying is True lora_tuning: - target_modules: ['attention_qkv'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) + target_modules: ['attention_qkv','attention_dense','mlp_fc1','mlp_fc2'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) adapter_dim: 32 alpha: ${model.peft.lora_tuning.adapter_dim} adapter_dropout: 0.0 diff --git a/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py b/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py new file mode 100644 index 000000000000..352aff87217b --- /dev/null +++ b/nemo/collections/nlp/data/information_retrieval/gpt_embedding_dataset.py @@ -0,0 +1,281 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from typing import Mapping, Optional
+
+import datasets
+import numpy as np
+import torch
+
+# hack to avoid the "not enough disk space" error in some slurm clusters
+datasets.builder.has_sufficient_disk_space = lambda needed_bytes, directory='.': True
+
+from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
+from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import get_samples_mapping
+from nemo.collections.nlp.data.language_modeling.text_memmap_dataset import JSONLMemMapDataset
+from nemo.core.classes import Dataset
+from nemo.utils import logging
+
+__all__ = ['GPTEmbeddingDataset']
+
+
+class GPTEmbeddingDataset(Dataset):
+    def __init__(
+        self,
+        file_path: str,
+        tokenizer: TokenizerSpec,
+        max_seq_length: int = 1024,
+        min_seq_length: int = 1,
+        add_bos: bool = False,
+        add_eos: bool = True,
+        max_num_samples: int = None,
+        seed: int = 1234,
+        index_mapping_dir: str = None,
+        virtual_tokens: int = 0,
+        memmap_workers: Optional[int] = None,
+        truncation_method: str = 'right',
+        special_tokens: Optional[Mapping[str, str]] = None,  # special tokens, a dictionary of {token_type: token}
+        data_type: str = 'train',  # train, query or doc
+    ):
+        """
+        file_path: Path to a JSONL dataset with (query,pos_doc,neg_doc) triplets in jsonl format.
+        tokenizer: Tokenizer for the dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece).
+        max_seq_length (int): maximum sequence length for each dataset example. Examples will either be truncated to fit this length or dropped if they cannot be truncated.
+        min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements.
+        add_bos (bool): Whether to add a beginning of sentence token to each data example
+        add_eos (bool): Whether to add an end of sentence token to each data example
+        seed: Random seed for data shuffling.
+        max_num_samples: Maximum number of samples to load. This can be > dataset length if you want to oversample data. If None, all samples will be loaded.
+        index_mapping_dir: Directory to save the index mapping to. If None, will write to the same folder as the dataset.
+        truncation_method: Truncation from which position. Options: ['left', 'right']
+        special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}.
Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} + """ + # TODO: lot of copy-paste from GPTSFDDataset, should refactor both to use a common base class (@adithyare) + self.tokenizer = tokenizer + self.file_path = file_path + self.max_seq_length = max_seq_length + self.min_seq_length = min_seq_length + self.add_bos = add_bos + self.add_eos = add_eos + self.max_num_samples = max_num_samples + self.seed = seed + self.index_mapping_dir = index_mapping_dir + self.virtual_tokens = virtual_tokens + self.truncation_method = truncation_method + if special_tokens is None: + self.special_tokens = { + "system_turn_start": "", + "turn_start": "", + "label_start": "", + "end_of_turn": "\n", + "end_of_name": "\n", + } + else: + self.special_tokens = special_tokens + self.data_type = data_type + + self.indexed_dataset = JSONLMemMapDataset( + dataset_paths=[file_path], + tokenizer=None, + header_lines=0, + index_mapping_dir=index_mapping_dir, + workers=memmap_workers, + ) + + # Will be None after this call if `max_num_samples` is None + self.samples_mapping = None + self._build_samples_mapping() + + def _build_samples_mapping(self): + if self.max_num_samples is not None: + self.samples_mapping = get_samples_mapping( + indexed_dataset=self.indexed_dataset, + data_prefix=self.file_path, + num_epochs=None, + max_num_samples=self.max_num_samples, + max_seq_length=self.max_seq_length - 2, + short_seq_prob=0, + seed=self.seed, + name=self.file_path.split('/')[-1], + binary_head=False, + index_mapping_dir=self.index_mapping_dir, + ) + else: + self.samples_mapping = None + + def __len__(self): + if self.max_num_samples is None: + return len(self.indexed_dataset) + else: + assert self.samples_mapping is not None + return len(self.samples_mapping) + + def __getitem__(self, idx): + if isinstance(idx, np.int64): + idx = idx.item() + + if self.samples_mapping is not None: + assert idx < len(self.samples_mapping) + idx, _, _ = self.samples_mapping[idx] + if isinstance(idx, np.uint32): + idx = idx.item() + + assert idx < len(self.indexed_dataset) + # idx may < 0 because we pad_samples_to_global_batch_size, e.g. id = -1 + if idx < 0: + idx = len(self) + idx + auto_gen_idx = True + else: + auto_gen_idx = False + try: + example = self.indexed_dataset[idx] + if auto_gen_idx: + example['__AUTOGENERATED__'] = True + except Exception as e: + logging.error(f"Error while loading example {idx} from dataset {self.file_path}") + raise e + return self._process_example(example) + + def _process_example(self, example): + """ + Create an example by concatenating text and answer. + Truncation is carried out when needed, but it is performed only on the prompt side. + BOS, EOS, and SEP, are added if specified. 
+ """ + metadata = {k: v for k, v in example.items()} + if self.data_type == 'train': + q = self.tokenizer.text_to_ids("query: " + example['query'].strip()) + d = self.tokenizer.text_to_ids("passage: " + example['pos_doc'].strip()) + nd = self.tokenizer.text_to_ids("passage: " + example['neg_doc'].strip()) + elif self.data_type == 'query': + q = self.tokenizer.text_to_ids("query: " + example['query'].strip()) + d, nd = None, None + assert "query_id" in example, "query_id is required for query dataset" + assert "doc_id" in example, "doc_id is required for query dataset" + elif self.data_type == 'doc': + d = self.tokenizer.text_to_ids("passage: " + example['pos_doc'].strip()) + assert "doc_id" in example, "doc_id is required for doc dataset" + q, nd = None, None + else: + raise ValueError(f"Invalid data type: {self.data_type}") + + q = q if q is not None else [] + d = d if d is not None else [] + nd = nd if nd is not None else [] + + if self.virtual_tokens: + # (@adithyare) we are going to insert "pad/eos" tokens in the beginning of the text and context + # these pad/eos tokens are placeholders for virtual tokens for ptuning (if used) + q = [self.tokenizer.eos_id] * self.virtual_tokens + q # type: ignore + d = [self.tokenizer.eos_id] * self.virtual_tokens + d # type: ignore + nd = [self.tokenizer.eos_id] * self.virtual_tokens + nd # type: ignore + + if self.add_bos: + q = [self.tokenizer.bos_id] + q # type: ignore + d = [self.tokenizer.bos_id] + d # type: ignore + nd = [self.tokenizer.bos_id] + nd # type: ignore + + # TODO: (@adithyare) should probably add a warning before truncation + q = q[: self.max_seq_length - 1] + d = d[: self.max_seq_length - 1] + nd = nd[: self.max_seq_length - 1] + + if self.add_eos: + q = q + [self.tokenizer.eos_id] # type: ignore + d = d + [self.tokenizer.eos_id] # type: ignore + nd = nd + [self.tokenizer.eos_id] # type: ignore + + processed_example = { + 'query': q, + 'pos_doc': d, + 'neg_doc': nd, + 'metadata': metadata, + } + + return processed_example + + def _maybe_cast_to_list(self, x): + if isinstance(x, np.ndarray): + return [item.tolist() for item in x] + return x + + def _ceil_to_nearest(self, n, m): + return (n + m - 1) // m * m + + def _collate_item(self, item, max_length, pad_id): + item = self._maybe_cast_to_list(item) + # max_length = max([len(x) for x in item]) if item else 0 + # here [0] should be tokenizer.pad_id + item = [x + [pad_id] * (max_length - len(x)) for x in item] + return item + + @torch.no_grad() + def _create_attention_mask(self, max_length): + """Create `attention_mask`. + Args: + input_ids: A 1D tensor that holds the indices of tokens. 
+ """ + # seq_length = len(input_ids) + # `attention_mask` has the shape of [1, seq_length, seq_length] + attention_mask = torch.tril(torch.ones((max_length, max_length))).unsqueeze(0) + attention_mask = attention_mask < 0.5 + return attention_mask + + def collate_fn(self, batch): + input_ids = [] + metadata = [] + lengths = [] + max_length = -1 + for item in batch: + metadata.append(item['metadata']) + if self.data_type == 'train': + input_ids.append(item['query']) + lengths.append(len(item['query'])) + input_ids.append(item['pos_doc']) + lengths.append(len(item['pos_doc'])) + input_ids.append(item['neg_doc']) + lengths.append(len(item['neg_doc'])) + max_length = max(max_length, len(item['query']), len(item['pos_doc']), len(item['neg_doc'])) + elif self.data_type == 'query': + input_ids.append(item['query']) + lengths.append(len(item['query'])) + max_length = max(max_length, len(item['query'])) + elif self.data_type == 'doc': + input_ids.append(item['pos_doc']) + lengths.append(len(item['pos_doc'])) + max_length = max(max_length, len(item['pos_doc'])) + else: + raise ValueError(f"Invalid data type: {self.data_type}") + + max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 16)) + assert max_length <= self.max_seq_length + + attention_mask = [self._create_attention_mask(max_length) for _ in batch] + attention_mask = torch.stack(attention_mask) + position_ids = [list(range(max_length)) for _ in batch] + position_ids = torch.LongTensor(position_ids) + input_ids = torch.LongTensor( + self._collate_item(input_ids, max_length=max_length, pad_id=self.tokenizer.eos_id) + ) + lengths = torch.LongTensor(lengths) - 1 # subtract 1 to account for the eos token + + processed_batch = { + 'tokens': input_ids, + 'attention_mask': attention_mask, + 'loss_mask': lengths, + 'position_ids': position_ids, + 'metadata': metadata, + } + + return processed_batch diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py new file mode 100644 index 000000000000..91fa4a6f92b5 --- /dev/null +++ b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py @@ -0,0 +1,433 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import itertools +import os + +import numpy as np +import torch +from omegaconf import DictConfig, ListConfig +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.data.information_retrieval.gpt_embedding_dataset import GPTEmbeddingDataset +from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( + get_datasets_weights_and_num_samples, +) +from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset +from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel +from nemo.utils import logging + +try: + from megatron.core import parallel_state + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False +try: + + HAVE_APEX = True +except (ImportError, ModuleNotFoundError): + HAVE_APEX = False + + +def listify(tensor): + l_tensor = [] + for t in tensor: + for rid in range(t.shape[0]): + r = t[rid, :].unsqueeze(0).cpu() + l_tensor.append(r) + return l_tensor + + +class MegatronGPTEmbeddingModel(MegatronGPTSFTModel): + def __init__(self, cfg: DictConfig, trainer: Trainer): + super().__init__(cfg, trainer=trainer) + self.temperature = self.cfg.get('temperature', 0.02) + self.use_all_possible_negatives = self.cfg.get("use_all_possible_negatives", True) + self.global_inbatch_negatives = self.cfg.get("global_inbatch_negatives", True) + assert ( + self.cfg.get("post_process", False) is False + ), "post_process must be False to get hidden states in the loss_func" + + def model_provider_func(self, pre_process, post_process): + # (@adithyare) We need post_process to be False to get hidden states in the loss_func + return super().model_provider_func(pre_process, post_process=False) + + def maybe_setup_test(self): + if ( + hasattr(self.cfg.data, 'test_ds') + and self.cfg.data.test_ds.get('doc_file_names', None) is not None + and self.cfg.data.test_ds.get('query_file_names', None) is not None + ): + self._test_dl = self.setup_eval_dataloader(self._test_ds, self.cfg.data.test_ds) + return + + def maybe_build_test(self): + if ( + hasattr(self.cfg.data, 'test_ds') + and self.cfg.data.test_ds.get('doc_file_names', None) is not None + and self.cfg.data.test_ds.get('query_file_names', None) is not None + ): + logging.info('Building GPT Embedder test datasets.') + # Wrap this in a list since the general finetuning parent class supports multi-validation. + self._test_ds = self._build_dataset(self.cfg.data.test_ds, is_train=False) + + def _build_dataset(self, data_cfg, is_train=True): + packed_sequence = data_cfg.get("packed_sequence", False) + + # Determine if we are using a single dataset or a list of datasets. + if is_train: + # Construct the data prefix list for `get_datasets_weights_and_num_samples()` + # that is of the format [weight1,file_name1,weight2,file_name2,...] + if data_cfg.concat_sampling_probabilities is None or not isinstance( + data_cfg.concat_sampling_probabilities, ListConfig + ): + raise ValueError( + ( + f"concat_sampling_probabilities must be a ListConfig with the same number of files in file_names." 
+ f"Found: {data_cfg.concat_sampling_probabilities}" + ) + ) + + if len(data_cfg.get('concat_sampling_probabilities', None)) != len(data_cfg.file_names): + raise ValueError( + ( + f"concat_sampling_probabilities must be of the same size as file_names.", + f"Provided size {len(data_cfg.concat_sampling_probabilities)}, number of datasets {len(data_cfg.file_names)}", + ) + ) + + data_prefix = [] + for weight, prefix in zip(data_cfg.concat_sampling_probabilities, data_cfg.file_names): + data_prefix.append(weight) + data_prefix.append(prefix) + + if self.trainer.max_steps is None or self.trainer.max_steps <= 0: + raise ValueError( + f'Trainer max_steps must be set to a positive integer. Found {self.trainer.max_steps}' + ) + num_train_samples = [self.trainer.max_steps * data_cfg.global_batch_size] + _, _, num_train_samples_per_dataset = get_datasets_weights_and_num_samples(data_prefix, num_train_samples) + num_train_samples_after_blend = sum([x[0] for x in num_train_samples_per_dataset]) + else: + num_query_samples_per_dataset = [[None]] * len(data_cfg.query_file_names) + num_doc_samples_per_dataset = [[None]] * len(data_cfg.doc_file_names) + + # Check dataset max_seq_legnth and max_position_embeddings size + if ( + self.cfg.get('position_embedding_type', None) in [None, 'learned_absolute'] + and data_cfg.max_seq_length > self.cfg.max_position_embeddings + ): + logging.warning( + f"Set dataset max_seq_length to max_position_embeddings {self.cfg.max_position_embeddings} if using learned_absolute position embedding" + ) + data_cfg.max_seq_length = self.cfg.max_position_embeddings + + # TE requires that the first input dim is divisible by 8 and the second by 16 for fp8 + # When using sequence parallel, sequence will further be split by TP size + pad_seq_length_to_mult = ( + 8 * self.cfg.get('tensor_model_parallel_size', 1) if self.cfg.get('sequence_parallel', False) else 16 + ) + if is_train: + datasets = [] + for file_path, num_samples in zip(data_cfg.file_names, num_train_samples_per_dataset): + dataset = GPTEmbeddingDataset( + file_path=file_path, + tokenizer=self.tokenizer, + max_seq_length=data_cfg.max_seq_length, + min_seq_length=data_cfg.min_seq_length, + add_bos=data_cfg.get('add_bos', False), + add_eos=data_cfg.get('add_eos', True), + max_num_samples=num_samples[0], + seed=data_cfg.get('seed', 1234), + index_mapping_dir=data_cfg.get('index_mapping_dir', None), + virtual_tokens=self.virtual_tokens, + memmap_workers=data_cfg.get( + 'memmap_workers', None + ), # used to set num. of workers to create the memmap index files + truncation_method=data_cfg.get( + 'truncation_method', 'right' + ), # used to choose truncation method. Options: ['random', 'left', 'right'] + special_tokens=self.cfg.data.get( + 'chat_prompt_tokens', None + ), # special tokens for the chat prompts, a dictionary of {token_type: token}. 
Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} + ) + datasets.append(dataset) + if packed_sequence: + raise NotImplementedError("Packed sequence is not supported for MegatronGPTEmbeddingModel") + + dataset = BlendableDataset( + datasets=datasets, weights=data_cfg.concat_sampling_probabilities, size=num_train_samples_after_blend + ) + return dataset + else: + query_dataset = GPTEmbeddingDataset( + file_path=data_cfg.query_file_names[0], + tokenizer=self.tokenizer, + max_seq_length=data_cfg.max_seq_length, + min_seq_length=data_cfg.min_seq_length, + add_bos=data_cfg.get('add_bos', False), + add_eos=data_cfg.get('add_eos', True), + max_num_samples=None, + seed=data_cfg.get('seed', 1234), + index_mapping_dir=data_cfg.get('index_mapping_dir', None), + virtual_tokens=self.virtual_tokens, + memmap_workers=data_cfg.get( + 'memmap_workers', None + ), # used to set num. of workers to create the memmap index files + truncation_method=data_cfg.get( + 'truncation_method', 'right' + ), # used to choose truncation method. Options: ['random', 'left', 'right'] + special_tokens=self.cfg.data.get( + 'chat_prompt_tokens', None + ), # special tokens for the chat prompts, a dictionary of {token_type: token}. Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} + data_type="query", + ) + doc_dataset = GPTEmbeddingDataset( + file_path=data_cfg.doc_file_names[0], + tokenizer=self.tokenizer, + max_seq_length=data_cfg.max_seq_length, + min_seq_length=data_cfg.min_seq_length, + add_bos=data_cfg.get('add_bos', False), + add_eos=data_cfg.get('add_eos', True), + max_num_samples=None, + seed=data_cfg.get('seed', 1234), + index_mapping_dir=data_cfg.get('index_mapping_dir', None), + virtual_tokens=self.virtual_tokens, + memmap_workers=data_cfg.get( + 'memmap_workers', None + ), # used to set num. of workers to create the memmap index files + truncation_method=data_cfg.get( + 'truncation_method', 'right' + ), # used to choose truncation method. Options: ['random', 'left', 'right'] + special_tokens=self.cfg.data.get( + 'chat_prompt_tokens', None + ), # special tokens for the chat prompts, a dictionary of {token_type: token}. 
Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} + data_type="doc", + ) + return [query_dataset, doc_dataset] + + def training_step_fwd_bwd_step_call(self, dataloader_iter, forward_only): + loss_mean, non_loss_tensors = self.fwd_bwd_step(dataloader_iter, forward_only) + avg_pos_cs = non_loss_tensors['avg_pos_cs'][0].item() + avg_neg_cs = non_loss_tensors['avg_neg_cs'][0].item() + diff_cs = non_loss_tensors['diff_cs'][0].item() + self.log("avg_pos_cs", avg_pos_cs, prog_bar=True, rank_zero_only=True, batch_size=1) + self.log("avg_neg_cs", avg_neg_cs, prog_bar=True, rank_zero_only=True, batch_size=1) + self.log("diff_cs", diff_cs, prog_bar=True, rank_zero_only=True, batch_size=1) + return loss_mean + + def inference_step_validation_call(self, batch, batch_idx, data_cfg, dataloader_idx=0): + metadata = batch.get('metadata', [{}] * len(batch['tokens'])) + loss, non_loss_tensors = self.local_validation_step(itertools.chain([dataloader_idx], [batch])) + outputs = { + 'loss': loss, + 'metadata': metadata, # [dict] + 'q_hs': non_loss_tensors['query_hs'], # [batch_size, hidden_size] + 'd_hs': non_loss_tensors['doc_hs'], # [batch_size, hidden_size] + } + return outputs + + def gather_and_maybe_write_predictions(self, output, data_cfg, mode, dataloader_idx=0): + if not data_cfg.get("write_embeddings_to_file", False): + return True + gathered_output_batches = [None for _ in range(parallel_state.get_data_parallel_world_size())] + torch.distributed.all_gather_object( + gathered_output_batches, + [{'q_hs': batch['q_hs'], 'd_hs': batch['d_hs'], 'metadata': batch['metadata'],} for batch in output], + group=parallel_state.get_data_parallel_group(), + ) + + # Remove duplicate examples due to distributed sampler. + deduplicated_outputs = { + 'q_hs': [], + 'd_hs': [], + 'metadata': [], + } + total_size, skipped = 0, 0 + for rank in range(0, parallel_state.get_data_parallel_world_size()): + for batch in gathered_output_batches[rank]: + l_q_hs = listify(batch['q_hs']) + l_d_hs = listify(batch['d_hs']) + l_m = batch['metadata'] + assert len(l_m) == len(l_q_hs) == len(l_d_hs) + for q_hs, d_hs, metadata in zip(l_q_hs, l_d_hs, l_m,): + total_size += 1 + if not metadata.get("__AUTOGENERATED__", False): + deduplicated_outputs['q_hs'].append(q_hs) + deduplicated_outputs['d_hs'].append(d_hs) + deduplicated_outputs['metadata'].append(metadata) + else: + skipped += 1 + + logging.info( + f"{total_size-skipped} deduplicated outputs in dataloader:{dataloader_idx}, (skipped {skipped} autogenerated examples)." + ) + # Compute metric score + metric_name = self.val_metric_name if mode == 'validation' else self.test_metric_name + assert metric_name == "loss", "Only loss is supported for now." 
+ # avg_pos_cs = torch.tensor(deduplicated_outputs['avg_pos_cs']).mean().item() + # avg_neg_cs = torch.tensor(deduplicated_outputs['avg_neg_cs']).mean().item() + # diff_cs = torch.tensor(deduplicated_outputs['diff_cs']).mean().item() + # self.log('val_avg_pos_cs', avg_pos_cs, prog_bar=True, rank_zero_only=True, batch_size=1) + # self.log('val_avg_neg_cs', avg_neg_cs, prog_bar=True, rank_zero_only=True, batch_size=1) + # self.log('val_diff_cs', diff_cs, prog_bar=True, rank_zero_only=True, batch_size=1) + + # Write predictions to file + if self.global_rank == 0 and data_cfg.get("write_embeddings_to_file", False): + logging.info( + f"Total deduplicated inference data size: {total_size} to {len(deduplicated_outputs['metadata'])}" + ) + + # Check if the user provided a prefix path to the file(s) they want to write. + if not hasattr(data_cfg, "output_file_path_prefix") or data_cfg.output_file_path_prefix is None: + raise ValueError( + f"Cannot write predictions to file when output_file_path_prefix is not set or present in the yaml config file." + ) + # (@adithyare) We are not using the log key to write the embeddings to file + filename_log_key = self._determine_log_key(data_cfg, dataloader_idx, None, mode) + consumed_samples = self._compute_consumed_samples_after_training_step() + fldr_path = f"{data_cfg.output_file_path_prefix}/consumed_samples{consumed_samples}/{filename_log_key}" + self.write_embeddings_to_file(deduplicated_outputs, fldr_path, dataloader_idx) + return deduplicated_outputs, total_size + + def write_embeddings_to_file(self, outputs, output_file_path, d_idx): + emb_type = 'query' if d_idx == 0 else 'doc' + hs = torch.cat(outputs['q_hs' if d_idx == 0 else 'd_hs'], dim=0) + hs_npy = hs.float().numpy() + emb_fldr = f"{output_file_path}" + os.makedirs(emb_fldr, exist_ok=True) + with open(f"{output_file_path}/{emb_type}.ids", "w") as f: + for m in outputs['metadata']: + f.write(m[f"{emb_type}_id"] + "\n") + np.save(f"{emb_fldr}/{emb_type}.npy", hs_npy) + return True + + def local_validation_step(self, dataloader_iter): + """ + Our dataloaders produce a micro-batch and then we fetch + a number of microbatches depending on the global batch size and model parallel size + from the dataloader to produce a list of microbatches. + The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. + """ + # Check if iterator is exhausted + # dataloader_iter, done = self._val_iterator_done(dataloader_iter) + # if done: + # return + # Get the dataloader_idx when MegatronGPTSFTModel calls validation_step of MegatronGPTModel + next_item_dataloader = next(dataloader_iter) + if isinstance(next_item_dataloader, int): + dataloader_idx = next_item_dataloader + else: + dataloader_iter = itertools.chain([next_item_dataloader], dataloader_iter) + mode = 'test' if self.trainer.testing else 'val' + # Initialize userbuffer communicators. + if self.initialize_ub: + self.initialize_ub_func() + + if isinstance(self.model, list): + for model_module in self.model: + model_module.eval() + + if self.cfg.get('fp8', False): + first_val_step = self.prev_step_training and not self.training + self.prev_step_training = self.training + else: + first_val_step = None + + loss, non_loss_tensors = self.fwd_bwd_step(dataloader_iter, True, first_val_step) + + if isinstance(self.model, list): + for model_module in self.model: + model_module.train() + + if mode == 'val': + # MegatronGPTSFTModel class supports multiple dataloaders and uses validation_step of MegatronGPTModel. 
+ # Supporting that case with below lines + if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: + self.validation_step_outputs[dataloader_idx].append(loss) + else: + self.validation_step_outputs.append(loss) + else: + if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: + self.test_step_outputs[dataloader_idx].append(loss) + else: + self.test_step_outputs.append(loss) + + return loss, non_loss_tensors + + def constrastive_scores(self, pos_doc_hs, neg_doc_hs, query_hs, bs, use_all_possible_negatives=False): + all_doc_hs = torch.cat([pos_doc_hs, neg_doc_hs], dim=0) # (2bs) x hidden_size + cs = torch.mm(query_hs, all_doc_hs.transpose(0, 1)) # (bs) x (2bs) + pos_cs = cs[:, :bs].diag() + neg_cs = cs[:, bs:].diag() + if use_all_possible_negatives: + labels = torch.arange(bs, device=cs.device).long() + else: + labels = torch.zeros(bs, device=cs.device).long() + cs = torch.cat([pos_cs.unsqueeze(1), neg_cs.unsqueeze(1)], dim=1) + pos_cs = pos_cs.clone().detach().mean() + neg_cs = neg_cs.clone().detach().mean() + return cs, pos_cs, neg_cs, labels + + def inference_loss_func(self, loss_mask, num_valid_tokens_in_ub, eos_tensors): + hs = eos_tensors + hs = torch.nn.functional.normalize(hs, dim=1) + _blank = torch.zeros(1, device=hs.device, dtype=hs.dtype)[0] + return _blank, hs, hs, _blank, _blank, _blank + + def _gather_global_inbatch_representations(self, local_eos_tensor): + local_eos_tensor = local_eos_tensor.contiguous() + global_eos_tensors = [ + torch.zeros_like(local_eos_tensor) for _ in range(parallel_state.get_data_parallel_world_size()) + ] + torch.distributed.all_gather( + global_eos_tensors, local_eos_tensor, group=parallel_state.get_data_parallel_group() + ) + global_eos_tensors[parallel_state.get_data_parallel_rank()] = local_eos_tensor + global_eos_tensors = torch.cat(global_eos_tensors, dim=0) + return global_eos_tensors + + def loss_func(self, loss_mask, num_valid_tokens_in_ub, output_tensor): + idx = torch.arange(output_tensor.shape[1], device=output_tensor.device) + eos_tensors = output_tensor[loss_mask, idx, :] + if self.global_inbatch_negatives and self.trainer.training: + eos_tensors = self._gather_global_inbatch_representations(eos_tensors) + if not self.trainer.training: + return self.inference_loss_func(loss_mask, num_valid_tokens_in_ub, eos_tensors) + bs = eos_tensors.shape[0] // 3 + query_hs = eos_tensors[::3, :] # every third tensor is a query (bs x hidden_size) + pos_doc_hs = eos_tensors[1::3, :] # every third tensor is a positive doc (bs x hidden_size) + neg_doc_hs = eos_tensors[2::3, :] # every third tensor is a negative doc (bs x hidden_size) + + query_hs = torch.nn.functional.normalize(query_hs, dim=1) + pos_doc_hs = torch.nn.functional.normalize(pos_doc_hs, dim=1) + neg_doc_hs = torch.nn.functional.normalize(neg_doc_hs, dim=1) + + cs, pos_cs, neg_cs, labels = self.constrastive_scores( + pos_doc_hs, neg_doc_hs, query_hs, bs, self.use_all_possible_negatives + ) + cs = cs.clamp(-1.0, 1.0) + cs = cs / self.temperature + loss = torch.nn.functional.cross_entropy(cs, labels) + + cp_size = self.cfg.get('context_parallel_size', 1) + if cp_size > 1: + torch.distributed.all_reduce(loss, group=parallel_state.get_context_parallel_group()) + query_hs = query_hs.clone().detach() + pos_doc_hs = pos_doc_hs.clone().detach() + diff_cs = pos_cs - neg_cs + return loss, query_hs, pos_doc_hs, pos_cs, neg_cs, diff_cs diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py 
b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index cd5587351ecd..803bc671a7cf 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -895,6 +895,8 @@ def compute_consumed_samples(self, steps_since_resume=0): def _compute_consumed_samples_after_training_step(self): # Add +1 to account for the current batch, which is not counted yet in `trainer.global_step`. + if not hasattr(self, 'init_global_step'): + self.init_global_step = 0 # in case this method is called before training starts. return self.compute_consumed_samples(self.trainer.global_step + 1 - self.init_global_step) def _extract_consumed_samples_from_ckpt(self, ckpt_path): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 79d48269d3a6..c9aae27eb5ed 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -637,6 +637,14 @@ def initialize_ub_func(self): ) self.initialize_ub = False + def training_step_fwd_bwd_step_call(self, dataloader_iter, forward_only): + """ + This method is called from the training_step method. + It is separated out to allow for overriding in the MegatronGPTEmbeddingModel + """ + loss_mean = self.fwd_bwd_step(dataloader_iter, forward_only) + return loss_mean + def training_step(self, dataloader_iter): """ We pass the dataloader iterator function to the micro-batch scheduler. @@ -676,7 +684,7 @@ def training_step(self, dataloader_iter): for param in module.embedding.parameters(): param.data_ptr() - loss_mean = self.fwd_bwd_step(dataloader_iter, False) + loss_mean = self.training_step_fwd_bwd_step_call(dataloader_iter, forward_only=False) if self.cfg.get('fp8', False): self.prev_step_training = self.training @@ -1012,7 +1020,7 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ 'input_ids': batch['tokens'], 'position_ids': batch['position_ids'], 'attention_mask': None if self.get_attention_mask_from_fusion else batch['attention_mask'], - 'labels': batch['labels'], + 'labels': batch['labels'] if 'labels' in batch else None, 'loss_mask': batch['loss_mask'], } @@ -1056,8 +1064,27 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ def loss_func(output_tensor): # Loss for a micro-batch (ub) loss_for_ub = self.loss_func(batch['loss_mask'], batch['num_valid_tokens_in_ub'], output_tensor) - cp_size = parallel_state.get_context_parallel_world_size() - if validation_step and not self.cfg.data.get('validation_drop_last', True): + cp_size = self.cfg.get('context_parallel_size', 1) + if self.cfg.data.get( + "return_output_tensors", False + ): # TODO: need a better way to check if loss_func is returning more stuff than just loss... 
(@adithyare) + loss_for_ub, q_hs, d_hs, pos_cs, neg_cs, diff_cs = loss_for_ub + reduced_loss = average_losses_across_data_parallel_group([loss_for_ub]) + pos_cs = average_losses_across_data_parallel_group([pos_cs]) + neg_cs = average_losses_across_data_parallel_group([neg_cs]) + diff_cs = average_losses_across_data_parallel_group([diff_cs]) + return ( + loss_for_ub * cp_size, + { + 'avg': reduced_loss, + 'query_hs': q_hs, + 'doc_hs': d_hs, + 'avg_pos_cs': pos_cs, + 'avg_neg_cs': neg_cs, + 'diff_cs': diff_cs, + }, + ) + elif validation_step and not self.cfg.data.get('validation_drop_last', True): num_valid_tokens_in_ub = batch['num_valid_tokens_in_ub'] if loss_for_ub.isnan(): assert batch['loss_mask'].count_nonzero() == 0, 'Got NaN loss with non-empty input' diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 7ab00f1af85a..0320fc6c0713 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -97,6 +97,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self._reset_activation_checkpointing_args() self.virtual_tokens = 0 + self.init_global_step = 0 def setup_metric(self, data_cfg): metric_name = "exact_string_match" @@ -160,6 +161,11 @@ def setup_metric(self, data_cfg): def _metrics_require_string2category_map(self): return set(["f1", "accuracy", "average_precision"]) + def maybe_setup_test(self): + if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: + self._test_dl = self.setup_eval_dataloader(self._test_ds, self.cfg.data.test_ds) + return + def setup(self, stage=None): # NOTE: super().__init__ will try and setup train/val/test datasets, but we sidestep this using a if self._train_ds is not None condition # We then set things up for real only once setup() of this class is called. 
@@ -182,8 +188,7 @@ def setup(self, stage=None): self.setup_training_dataloader() if hasattr(self, '_validation_ds'): self._validation_dl = self.setup_eval_dataloader(self._validation_ds, self.cfg.data.validation_ds) - if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: - self._test_dl = self.setup_eval_dataloader(self._test_ds, self.cfg.data.test_ds) + self.maybe_setup_test() # when using pipeline model parallel the final stage need to initialize word embeddings self.initialize_last_rank_embeddings() @@ -369,8 +374,15 @@ def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): first_val_step=first_val_step, ) + non_loss_tensors = {} # only the last stages of the pipeline return losses if losses_reduced_per_micro_batch: + for item in losses_reduced_per_micro_batch: + for k, v in item.items(): + if k != 'avg': + av = non_loss_tensors.get(k, []) + av.append(v) + non_loss_tensors[k] = av if (not forward_only) or self.cfg.data.get('validation_drop_last', True): # average loss across micro batches loss_tensors_list = [loss_reduced['avg'] for loss_reduced in losses_reduced_per_micro_batch] @@ -396,7 +408,12 @@ def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): else: loss_mean = torch.tensor(0.0).cuda() - return loss_mean + # if forward_only: + # return loss_mean + if non_loss_tensors: # TODO: need a nicer way to do this via inheritance (@adithyare) + return loss_mean, non_loss_tensors + else: + return loss_mean def validation_step(self, dataloader_iter): return self.inference_step(dataloader_iter, 'validation') @@ -409,6 +426,23 @@ def inference_step(self, dataloader_iter, mode): data_cfg = self.cfg.data.validation_ds if mode == 'validation' else self.cfg.data.test_ds self._reconfigure_and_process_inference_batch(batch, data_cfg) # Meta data from dataset + outputs = self.inference_step_validation_call(batch, batch_idx, data_cfg, dataloader_idx) + + if mode == 'validation': + if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: + # super().validation_step appends just loss to self.validation_step_outputs, replace the last appended loss with the outputs dict + self.validation_step_outputs[dataloader_idx][-1] = outputs + else: + # super().validation_step appends just loss to self.validation_step_outputs, replace the last appended loss with the outputs dict + self.validation_step_outputs[-1] = outputs + else: + if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: + self.test_step_outputs[dataloader_idx][-1] = outputs + else: + self.test_step_outputs[-1] = outputs + return outputs + + def inference_step_validation_call(self, batch, batch_idx, data_cfg, dataloader_idx=0): metadata = batch.get('metadata', [{}] * len(batch['tokens'])) # Pass dataloader_idx, as it's needed in val_step of GPTModel to append the loss correctly to self.val/test_step_outputs # in case of multi dataloaders @@ -442,22 +476,91 @@ def inference_step(self, dataloader_iter, mode): 'inputs': inputs_text, # [str] 'metadata': metadata, # [dict] } + return outputs - if mode == 'validation': - if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - # super().validation_step appends just loss to self.validation_step_outputs, replace the last appended loss with the outputs dict - self.validation_step_outputs[dataloader_idx][-1] = outputs + def gather_and_maybe_write_predictions(self, output, data_cfg, mode, dataloader_idx=0): + # Gather the 
outputs object from all data parallel ranks since we are using the DistributedSampler which splits data across DDP ranks. + gathered_outputs = [None for _ in range(parallel_state.get_data_parallel_world_size())] + torch.distributed.all_gather_object( + gathered_outputs, + [ + {'preds': x['preds'], 'labels': x['labels'], 'inputs': x['inputs'], 'metadata': x['metadata']} + for x in output + ], + group=parallel_state.get_data_parallel_group(), + ) + + # Remove duplicate examples due to distributed sampler. + deduplicated_outputs = { + 'preds': [], + 'labels': [], + 'inputs': [], + 'metadata': [], + } + total_size = 0 + for rank in range(0, parallel_state.get_data_parallel_world_size()): + for batch in gathered_outputs[rank]: + for pred, label, input, metadata in zip( + batch['preds'], batch['labels'], batch['inputs'], batch['metadata'] + ): + total_size += 1 + if not metadata.get("__AUTOGENERATED__", False): + deduplicated_outputs['preds'].append(pred) + deduplicated_outputs['labels'].append(label) + deduplicated_outputs['inputs'].append(input) + deduplicated_outputs['metadata'].append(metadata) + else: + logging.info(f"skipping autogenerated example example {input} prediction {pred} label {label}") + + # Compute metric score + metric_name = self.val_metric_name if mode == 'validation' else self.test_metric_name + metric_label_key = self.val_metric_label_key if mode == 'validation' else self.test_metric_label_key + if metric_name != 'loss': + metric_log_key = self._determine_log_key(data_cfg, dataloader_idx, metric_name, mode) + metric_fn = self.val_metric[dataloader_idx] if mode == 'validation' else self.test_metric[dataloader_idx] + if metric_label_key in deduplicated_outputs['metadata'][0]: + labels = [m[metric_label_key] for m in deduplicated_outputs['metadata']] else: - # super().validation_step appends just loss to self.validation_step_outputs, replace the last appended loss with the outputs dict - self.validation_step_outputs[-1] = outputs - else: - if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: - self.test_step_outputs[dataloader_idx][-1] = outputs + labels = deduplicated_outputs['labels'] + + for pred, label in zip(deduplicated_outputs['preds'], labels): + _ = metric_fn(pred, label) + + metric_result = metric_fn.compute() + + if metric_name == 'rouge': + for k, v in metric_result.items(): + if 'fmeasure' in k: + self.log(metric_log_key + f'_{k}', v.item(), sync_dist=True) + logging.info(f"{mode} {metric_name} {k}: {v.item()}") + metric_result = metric_result['rouge1_fmeasure'] else: - self.test_step_outputs[-1] = outputs - return outputs + self.log(metric_log_key, metric_result.item(), sync_dist=True) + logging.info(f"{mode} {metric_name}: {metric_result.item()}") + + metric_fn.reset() + averaged_metric.append(metric_result) + + # Write predictions to file + if self.global_rank == 0 and data_cfg.get("write_predictions_to_file", False): + logging.info( + f"Total deduplicated inference data size: {total_size} to {len(deduplicated_outputs['inputs'])}" + ) + + # Check if the user provided a prefix path to the file(s) they want to write. + if not hasattr(data_cfg, "output_file_path_prefix") or data_cfg.output_file_path_prefix is None: + raise ValueError( + f"Cannot write predictions to file when output_file_path_prefix is not set or present in the yaml config file." 
+ ) + filename_log_key = self._determine_log_key(data_cfg, dataloader_idx, None, mode) + self.write_predictions_to_file( + deduplicated_outputs, f"{data_cfg.output_file_path_prefix}_{filename_log_key}" + ) + + return deduplicated_outputs, total_size def inference_epoch_end(self, outputs, mode, data_cfg): + # TODO: this method should be modularized. It is too long and does too many things. (@adithyare) # Parent class will handle logging of the loss. if not outputs or not outputs[0]: return @@ -487,92 +590,13 @@ def inference_epoch_end(self, outputs, mode, data_cfg): # we can only log on one rank if it is rank zero so we broadcast from last rank torch.distributed.broadcast(loss, get_last_rank()) - if mode != 'test': - self.log('val_loss', loss, prog_bar=True, rank_zero_only=True, batch_size=1) + self.log('val_loss', loss, prog_bar=True, rank_zero_only=True, batch_size=1) + # Determine the key used to log the loss based on the user provided name of the dataset or the dataloader index. + loss_log_key = self._determine_log_key(data_cfg, dataloader_idx, "loss", mode) + self.log(loss_log_key, loss, batch_size=1) averaged_loss.append(loss) - - # Gather the outputs object from all data parallel ranks since we are using the DistributedSampler which splits data across DDP ranks. - gathered_outputs = [None for _ in range(parallel_state.get_data_parallel_world_size())] - torch.distributed.all_gather_object( - gathered_outputs, - [ - {'preds': x['preds'], 'labels': x['labels'], 'inputs': x['inputs'], 'metadata': x['metadata']} - for x in output - ], - group=parallel_state.get_data_parallel_group(), - ) - - # Remove duplicate examples due to distributed sampler. - deduplicated_outputs = { - 'preds': [], - 'labels': [], - 'inputs': [], - 'metadata': [], - } - total_size = 0 - for rank in range(0, parallel_state.get_data_parallel_world_size()): - for batch in gathered_outputs[rank]: - for pred, label, input, metadata in zip( - batch['preds'], batch['labels'], batch['inputs'], batch['metadata'] - ): - total_size += 1 - if not metadata.get("__AUTOGENERATED__", False): - deduplicated_outputs['preds'].append(pred) - deduplicated_outputs['labels'].append(label) - deduplicated_outputs['inputs'].append(input) - deduplicated_outputs['metadata'].append(metadata) - else: - logging.info( - f"skipping autogenerated example example {input} prediction {pred} label {label}" - ) - - # Compute metric score - metric_name = self.val_metric_name if mode == 'validation' else self.test_metric_name - metric_label_key = self.val_metric_label_key if mode == 'validation' else self.test_metric_label_key - if metric_name != 'loss': - metric_log_key = self._determine_log_key(data_cfg, dataloader_idx, metric_name, mode) - metric_fn = ( - self.val_metric[dataloader_idx] if mode == 'validation' else self.test_metric[dataloader_idx] - ) - if metric_label_key in deduplicated_outputs['metadata'][0]: - labels = [m[metric_label_key] for m in deduplicated_outputs['metadata']] - else: - labels = deduplicated_outputs['labels'] - - for pred, label in zip(deduplicated_outputs['preds'], labels): - _ = metric_fn(pred, label) - - metric_result = metric_fn.compute() - - if metric_name == 'rouge': - for k, v in metric_result.items(): - if 'fmeasure' in k: - self.log(metric_log_key + f'_{k}', v.item(), sync_dist=True) - logging.info(f"{mode} {metric_name} {k}: {v.item()}") - metric_result = metric_result['rouge1_fmeasure'] - else: - self.log(metric_log_key, metric_result.item(), sync_dist=True) - logging.info(f"{mode} {metric_name}: 
{metric_result.item()}") - - metric_fn.reset() - averaged_metric.append(metric_result) - - # Write predictions to file - if self.global_rank == 0 and data_cfg.get("write_predictions_to_file", False): - logging.info( - f"Total deduplicated inference data size: {total_size} to {len(deduplicated_outputs['inputs'])}" - ) - - # Check if the user provided a prefix path to the file(s) they want to write. - if not hasattr(data_cfg, "output_file_path_prefix") or data_cfg.output_file_path_prefix is None: - raise ValueError( - f"Cannot write predictions to file when output_file_path_prefix is not set or present in the yaml config file." - ) - filename_log_key = self._determine_log_key(data_cfg, dataloader_idx, None, mode) - self.write_predictions_to_file( - deduplicated_outputs, f"{data_cfg.output_file_path_prefix}_{filename_log_key}" - ) + self.gather_and_maybe_write_predictions(output, data_cfg, mode, dataloader_idx) torch.distributed.barrier(group=parallel_state.get_data_parallel_group()) outputs[dataloader_idx].clear() # free memory @@ -759,6 +783,14 @@ def _reconfigure_and_process_inference_batch(self, batch, data_cfg): data_parallel_size=parallel_state.get_data_parallel_world_size(), ) + def maybe_build_test(self): + if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: + logging.info('Building GPT SFT test datasets.') + # Wrap this in a list since the general finetuning parent class supports multi-validation. + self._test_ds = self._build_dataset(self.cfg.data.test_ds, is_train=False) + logging.info(f'Length of test dataset: {len(self._test_ds[0])}') + return + def build_train_valid_test_datasets(self, stage): if stage != 'test': logging.info('Building GPT SFT validation datasets.') @@ -767,11 +799,7 @@ def build_train_valid_test_datasets(self, stage): logging.info(f'Length of val dataset: {len(self._validation_ds[0])}') if stage != 'validate': - if hasattr(self.cfg.data, 'test_ds') and self.cfg.data.test_ds.get('file_names', None) is not None: - logging.info('Building GPT SFT test datasets.') - # Wrap this in a list since the general finetuning parent class supports multi-validation. 
- self._test_ds = self._build_dataset(self.cfg.data.test_ds, is_train=False) - logging.info(f'Length of test dataset: {len(self._test_ds[0])}') + self.maybe_build_test() if stage == 'validate' or stage == 'test': return diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 38c887304f7a..e016022a6c44 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -708,8 +708,6 @@ def id_func(output_tensor): return fwd_output_only_func - ########## - def _test_validation_step(self, dataloader_iter): """ Shared code for validation and test step diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 26555cc3341a..7e4df2f27c6d 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -503,6 +503,7 @@ def merge_inference_cfg(cls, path: str, cfg: DictConfig) -> DictConfig: with open_dict(cfg): cfg.inference.add_BOS = peft_cfg.data.test_ds.add_bos - cfg.inference.tokens_to_generate = peft_cfg.data.test_ds.tokens_to_generate + cfg.inference.tokens_to_generate = peft_cfg.data.test_ds.get("tokens_to_generate", 1) + peft_cfg.megatron_amp_O2 = False # always evaluate with O1 return peft_cfg diff --git a/examples/nlp/information_retrieval/construct_random_negatives.py b/scripts/construct_random_negatives.py similarity index 100% rename from examples/nlp/information_retrieval/construct_random_negatives.py rename to scripts/construct_random_negatives.py diff --git a/examples/nlp/information_retrieval/get_msmarco.sh b/scripts/information_retrieval/get_msmarco.sh similarity index 100% rename from examples/nlp/information_retrieval/get_msmarco.sh rename to scripts/information_retrieval/get_msmarco.sh From 87489c87380827f8f408455f04a16b8e4ae9feeb Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Fri, 15 Mar 2024 03:34:22 +0200 Subject: [PATCH 024/140] add mcore updates (#8643) * add mcore updaates Signed-off-by: dimapihtar * update mcore version Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> --- Jenkinsfile | 2 +- .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 4 ++-- .../nlp/data/language_modeling/megatron/gpt_fim_dataset.py | 6 +++--- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 + 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index b278a53d8213..45c766a966d6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -99,7 +99,7 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout 36e9b6bf3d8034b10c9bbd9fc357c2df2bd1515c && \ + git checkout a5415fcfacef2a37416259bd38b7c4b673583675 && \ pip install .' } } diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index c9f8b8952d5e..368efc7b3b77 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -235,6 +235,7 @@ model: data_prefix: ??? 
index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix data_impl: mmap + mmap_bin_files: True splits_string: 900,50,50 seq_length: ${model.encoder_seq_length} skip_warmup: True @@ -247,8 +248,7 @@ model: no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled - exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem - mock_dataset: False # Set to True and data_prefix to None to use artificially generated mock dataset + exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem # Nsys profiling options nsys_profile: diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py index 17576bea4c75..20ebf555f0b5 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py @@ -16,7 +16,7 @@ import numpy as np from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig -from megatron.core.datasets.indexed_dataset import MMapIndexedDataset +from megatron.core.datasets.indexed_dataset import IndexedDataset from megatron.core.datasets.utils import Split @@ -41,7 +41,7 @@ class GPTFIMDataset(GPTDataset): """The base GPT dataset Args: - indexed_dataset (MMapIndexedDataset): The MMapIndexedDataset around which to build the + indexed_dataset (IndexedDataset): The IndexedDataset around which to build the MegatronDataset indexed_indices (np.ndarray): The set of the documents indices to expose @@ -55,7 +55,7 @@ class GPTFIMDataset(GPTDataset): def __init__( self, - indexed_dataset: MMapIndexedDataset, + indexed_dataset: IndexedDataset, dataset_path: str, indexed_indices: np.ndarray, num_samples: int, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index c9aae27eb5ed..7cdb8b3abb37 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1302,6 +1302,7 @@ def build_train_valid_test_datasets(self): "reset_attention_mask": self.reset_attention_mask, "eod_mask_loss": self.eod_mask_loss, "mock": mock_dataset, + "mmap_bin_files": self.cfg.data.get("mmap_bin_files", True), } # support for dict data input type From 355e36c344be55b2bf7b1fd55f5554a831e6fcd3 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Fri, 15 Mar 2024 14:50:32 +0200 Subject: [PATCH 025/140] FSDP update to PTL 2.2 (#8658) * fsdp fix Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * return extra line Signed-off-by: dimapihtar * fix empty line Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 2 +- 
nemo/collections/nlp/parts/megatron_trainer_builder.py | 8 +++++++- nemo/collections/nlp/parts/nlp_overrides.py | 5 +++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 368efc7b3b77..79bd7c1473f5 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -145,7 +145,7 @@ model: # FSDP fsdp: False # Enable training with torch FSDP. fsdp_sharding_strategy: 'full' # Method to shard model states. Available options are 'full', 'hybrid', and 'grad'. - fsdp_grad_reduce_dtype: 'fp32' # Gradient reduction data type. + fsdp_grad_reduce_dtype: 32 # Gradient reduction data type. fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint. ## Activation Checkpointing diff --git a/nemo/collections/nlp/parts/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py index 055671219fb8..968674b0fb92 100644 --- a/nemo/collections/nlp/parts/megatron_trainer_builder.py +++ b/nemo/collections/nlp/parts/megatron_trainer_builder.py @@ -19,6 +19,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.plugins.environments import TorchElasticEnvironment +from pytorch_lightning.plugins.precision.fsdp import FSDPPrecision from nemo.collections.nlp.parts.nlp_overrides import ( CustomProgressBar, @@ -113,7 +114,12 @@ def _plugins(self) -> list: if megatron_amp_O2 and not with_distributed_adam: plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + if self.cfg.model.get('fsdp', False): + plugins.append(FSDPPrecision(precision=plugin_precision, scaler=scaler)) + else: + plugins.append( + PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler) + ) self.cfg.trainer.precision = None if self.cfg.get('cluster_type', None) == 'BCP': diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 66fa99ffefd1..0b117fd8d860 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -589,6 +589,7 @@ def __init__( self.nccl_communicator_config_path = nccl_communicator_config_path self.sharp = sharp + self.sharding_strategy = sharding_strategy super().__init__(**kwargs) def _set_mixed_precision_recipe( @@ -625,7 +626,7 @@ def setup_environment(self) -> None: if not parallel_state.model_parallel_is_initialized(): app_state = AppState() assert app_state.pipeline_model_parallel_size == 1, "FSDP does not support pipeline parallelism" - if self.kwargs['sharding_strategy'] == ShardingStrategy.HYBRID_SHARD: + if self.sharding_strategy == ShardingStrategy.HYBRID_SHARD: assert ( app_state.tensor_model_parallel_size == 1 ), "FSDP hybrid sharding cannot be used when tensor_model_parallel_size > 1." 
@@ -678,7 +679,7 @@ def optimizer_state(self, optimizer: torch.optim.Optimizer) -> Dict[str, torch.T optim_state_dict = FSDP.optim_state_dict(self.model, optimizer) return optim_state_dict - def load_model_state_dict(self, checkpoint: Mapping[str, Any]) -> None: + def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict=None) -> None: # Release strict state dict matching when using Megatron AMP-O2 to skip matching # half-precision module wrapper module. # TODO: Refactor this to be more generic. From e848378053958a98897cdeb58c5f3a20dfe08af8 Mon Sep 17 00:00:00 2001 From: Andrei Andrusenko <52885736+andrusenkoau@users.noreply.github.com> Date: Fri, 15 Mar 2024 18:52:57 +0400 Subject: [PATCH 026/140] Add CTC-WS documentation (#8470) * add ctcws doc Signed-off-by: andrusenkoau * add ctcws docs Signed-off-by: andrusenkoau * fixes Signed-off-by: andrusenkoau --------- Signed-off-by: andrusenkoau --- ...r_language_modeling_and_customization.rst} | 119 +++++++++++++++--- docs/source/asr/intro.rst | 2 +- 2 files changed, 100 insertions(+), 21 deletions(-) rename docs/source/asr/{asr_language_modeling.rst => asr_language_modeling_and_customization.rst} (88%) diff --git a/docs/source/asr/asr_language_modeling.rst b/docs/source/asr/asr_language_modeling_and_customization.rst similarity index 88% rename from docs/source/asr/asr_language_modeling.rst rename to docs/source/asr/asr_language_modeling_and_customization.rst index fc3b9c26effc..013b31dd28cd 100644 --- a/docs/source/asr/asr_language_modeling.rst +++ b/docs/source/asr/asr_language_modeling_and_customization.rst @@ -1,5 +1,5 @@ ##################### -ASR Language Modeling +ASR Language Modeling and Customization ##################### Language models have shown to help the accuracy of ASR models. NeMo supports the following two approaches to incorporate language models into the ASR models: @@ -472,6 +472,7 @@ You can then pass this file to your flashlight config object during decoding: decoding.beam.flashlight_cfg.beam_size_token = 32 \ decoding.beam.flashlight_cfg.beam_threshold = 25.0 + Combine N-gram Language Models ============================== @@ -526,22 +527,100 @@ The following is the list of the arguments for the opengrm script: | kenlm_bin_path | str | Required | The path to the bin folder of KenLM library. It is a folder named `bin` under where KenLM is installed. | +----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ | ngram_bin_path | str | Required | The path to the bin folder of OpenGrm Ngram. It is a folder named `bin` under where OpenGrm Ngram is installed. 
| -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| arpa_a | str | Required | Path to the ARPA N-gram model file A | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| alpha | float | Required | Weight of N-gram model A | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| arpa_b | int | Required | Path to the ARPA N-gram model file B | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| beta | float | Required | Weight of N-gram model B | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| out_path | str | Required | Path for writing temporary and resulting files. | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| test_file | str | None | Path to test file to count perplexity if provided. | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| symbols | str | None | Path to symbols (.syms) file. Could be calculated if it is not provided. | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| nemo_model_file | str | None | The path to '.nemo' file of the ASR model, or name of a pretrained NeMo model. | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ -| force | bool | ``False`` | Whether to recompile and rewrite all files | -+----------------------+--------+------------------+-----------------------------------------------------------------------------------------------------------------+ ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| arpa_a | str | Required | Path to the ARPA N-gram model file A | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| alpha | float | Required | Weight of N-gram model A | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| arpa_b | int | Required | Path to the ARPA N-gram model file B | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| beta | float | Required | Weight of N-gram model B | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| out_path | str | Required | Path for writing temporary and resulting files. 
| ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| test_file | str | None | Path to test file to count perplexity if provided. | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| symbols | str | None | Path to symbols (.syms) file. Could be calculated if it is not provided.| ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| nemo_model_file | str | None | The path to '.nemo' file of the ASR model, or name of a pretrained NeMo model. | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| force | bool | ``False`` | Whether to recompile and rewrite all files | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ + + +****************** +Context-biasing (word boosting) without external LM +****************** + +NeMo toolkit supports a fast context-biasing method for CTC and Transducer (RNN-T) ASR models with CTC-based Word Spotter. +The method involves decoding CTC log probabilities with a context graph built for words and phrases from the context-biasing list. +The spotted context-biasing candidates (with their scores and time intervals) are compared by scores with words from the greedy CTC decoding results to improve recognition accuracy and pretend false accepts of context-biasing. + +A Hybrid Transducer-CTC model (a shared encoder trained together with CTC and Transducer output heads) enables the use of the CTC-WS method for the Transducer model. +Context-biasing candidates obtained by CTC-WS are also filtered by the scores with greedy CTC predictions and then merged with greedy Transducer results. + +Scheme of the CTC-WS method: + +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.22.0/asset-post-v1.22.0-ctcws_scheme_1.png + :align: center + :alt: CTC-WS scheme + :scale: 40% + +High-level overview of the context-biasing words replacement with CTC-WS method: + +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.22.0/asset-post-v1.22.0-ctcws_scheme_2.png + :align: center + :alt: CTC-WS high level overview + :scale: 40% + +More details about CTC-WS context-biasing can be found in the `tutorial `__. + +To use CTC-WS context-biasing, you need to create a context-biasing text file that contains words/phrases to be boosted, with its transcriptions (spellings) separated by underscore. +Multiple transcriptions can be useful for abbreviations ("gpu" -> "g p u"), compound words ("nvlink" -> "nv link"), +or words with common mistakes in the case of our ASR model ("nvidia" -> "n video"). + +Example of the context-biasing file: + +.. code-block:: + + nvidia_nvidia + omniverse_omniverse + gpu_gpu_g p u + dgx_dgx_d g x_d gx + nvlink_nvlink_nv link + ray tracing_ray tracing + +The main script for CTC-WS context-biasing in NeMo is: + +.. code-block:: + + {NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py + +Context-biasing is managed by ``apply_context_biasing`` parameter [true or false]. 
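Assuming the underscore-separated format shown in the example file above, the context-biasing list can be read into a phrase-to-spellings mapping along these lines (a minimal stdlib sketch, not the parser used by the NeMo script):

```python
# Minimal sketch: parse "phrase_spelling1_spelling2" lines into {phrase: [spellings]}.
def load_context_biasing_list(path):
    boosted = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            phrase, *spellings = line.split("_")
            # Fall back to the phrase itself when no alternative spelling is given.
            boosted[phrase] = spellings or [phrase]
    return boosted

# e.g. "gpu_gpu_g p u" -> {"gpu": ["gpu", "g p u"]}
```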
+Other important context-biasing parameters are: + +* ``beam_threshold`` - threshold for CTC-WS beam pruning +* ``context_score`` - per token weight for context biasing +* ``ctc_ali_token_weight`` - per token weight for CTC alignment (prevents false acceptances of context-biasing words) + +All the context-biasing parameters are selected according to the default values in the script. +You can tune them according to your data and ASR model (list all the values in the [] separated by commas) +for example: ``beam_threshold=[7.0,8.0,9.0]``, ``context_score=[3.0,4.0,5.0]``, ``ctc_ali_token_weight=[0.5,0.6,0.7]``. +The script will run the recognition with all the combinations of the parameters and will select the best one based on WER value. + +.. code-block:: + + # Context-biasing with the CTC-WS method for CTC ASR model + python {NEMO_DIR_PATH}/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py \ + nemo_model_file={ctc_model_name} \ + input_manifest={test_nemo_manifest} \ + preds_output_folder={exp_dir} \ + decoder_type="ctc" \ + acoustic_batch_size=64 \ + apply_context_biasing=true \ + context_file={cb_list_file_modified} \ + beam_threshold=[7.0] \ + context_score=[3.0] \ + ctc_ali_token_weight=[0.5] + +To use Transducer head of the Hybrid Transducer-CTC model, you need to set ``decoder_type=rnnt``. diff --git a/docs/source/asr/intro.rst b/docs/source/asr/intro.rst index 540c26d71239..79d1f3e3e3f8 100644 --- a/docs/source/asr/intro.rst +++ b/docs/source/asr/intro.rst @@ -184,7 +184,7 @@ For more information, see additional sections in the ASR docs on the left-hand-s models datasets - asr_language_modeling + asr_language_modeling_and_customization results scores configs From 0fbfa211ee6681ff1937e0fe902d5460a0c5a342 Mon Sep 17 00:00:00 2001 From: Jagadeesh Balam <4916480+jbalam-nv@users.noreply.github.com> Date: Fri, 15 Mar 2024 10:38:11 -0700 Subject: [PATCH 027/140] Rearranged the order of asr models (#8653) * Changed the order of models in asr/models doc Signed-off-by: Jagadeesh Balam * Rearranged Conformer models Signed-off-by: Jagadeesh Balam --------- Signed-off-by: Jagadeesh Balam Signed-off-by: Jagadeesh Balam Co-authored-by: Jagadeesh Balam Co-authored-by: Jagadeesh Balam --- docs/source/asr/examples/kinyarwanda_asr.rst | 6 +- docs/source/asr/models.rst | 291 ++++++++++--------- 2 files changed, 154 insertions(+), 143 deletions(-) diff --git a/docs/source/asr/examples/kinyarwanda_asr.rst b/docs/source/asr/examples/kinyarwanda_asr.rst index bd1eac94e31f..f8057585b104 100644 --- a/docs/source/asr/examples/kinyarwanda_asr.rst +++ b/docs/source/asr/examples/kinyarwanda_asr.rst @@ -1,5 +1,9 @@ + +Example With MCV +================ + ######################################################################## -Example: Kinyarwanda ASR using Mozilla Common Voice Dataset +Kinyarwanda ASR using Mozilla Common Voice Dataset ######################################################################## In this example, we describe essential steps of training an ASR model for a new language (Kinyarwanda). Namely, diff --git a/docs/source/asr/models.rst b/docs/source/asr/models.rst index 6b0087fd5f3d..cb7457b2d5d8 100644 --- a/docs/source/asr/models.rst +++ b/docs/source/asr/models.rst @@ -13,74 +13,12 @@ Pretrained checkpoints for all of these models, as well as instructions on how t section. You can use the available checkpoints for immediate inference, or fine-tune them on your own datasets. The checkpoints section also contains benchmark results for the available ASR models. -.. 
_Jasper_model: - -Jasper ------- - -Jasper ("Just Another Speech Recognizer") :cite:`asr-models-li2019jasper` is a deep time delay neural network (TDNN) comprising of -blocks of 1D-convolutional layers. The Jasper family of models are denoted as ``Jasper_[BxR]`` where ``B`` is the number of blocks -and ``R`` is the number of convolutional sub-blocks within a block. Each sub-block contains a 1-D convolution, batch normalization, -ReLU, and dropout: - - .. image:: images/jasper_vertical.png - :align: center - :alt: jasper model - :scale: 50% - -Jasper models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModel` class. - -QuartzNet +.. _Conformer_model: +Conformer --------- - -QuartzNet :cite:`asr-models-kriman2019quartznet` is a version of Jasper :cite:`asr-models-li2019jasper` model with separable -convolutions and larger filters. It can achieve performance similar to Jasper but with an order of magnitude fewer parameters. -Similarly to Jasper, the QuartzNet family of models are denoted as ``QuartzNet_[BxR]`` where ``B`` is the number of blocks and ``R`` -is the number of convolutional sub-blocks within a block. Each sub-block contains a 1-D *separable* convolution, batch normalization, -ReLU, and dropout: - - .. image:: images/quartz_vertical.png - :align: center - :alt: quartznet model - :scale: 40% - -QuartzNet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModel` class. - -.. _Citrinet_model: - -Citrinet --------- - -Citrinet is a version of QuartzNet :cite:`asr-models-kriman2019quartznet` that extends ContextNet :cite:`asr-models-han2020contextnet`, -utilizing subword encoding (via Word Piece tokenization) and Squeeze-and-Excitation mechanism :cite:`asr-models-hu2018squeeze` to -obtain highly accurate audio transcripts while utilizing a non-autoregressive CTC based decoding scheme for efficient inference. - - .. image:: images/citrinet_vertical.png - :align: center - :alt: citrinet model - :scale: 50% - -Citrinet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModelBPE` class. - -.. _ContextNet_model: - -ContextNet ----------- - -ContextNet is a model uses Transducer/RNNT loss/decoder and is introduced in :cite:`asr-models-han2020contextnet`. -It uses Squeeze-and-Excitation mechanism :cite:`asr-models-hu2018squeeze` to model larger context. -Unlike Citrinet, it has an autoregressive decoding scheme. - -ContextNet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecRNNTBPEModel` class for a -model with sub-word encoding and :class:`~nemo.collections.asr.models.EncDecRNNTModel` for char-based encoding. - -You may find the example config files of ContextNet model with character-based encoding at -``/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml`` and -with sub-word encoding at ``/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml``. - .. _Conformer-CTC_model: - Conformer-CTC +~~~~~~~~~~~~~ ------------- Conformer-CTC is a CTC-based variant of the Conformer model introduced in :cite:`asr-models-gulati2020conformer`. Conformer-CTC has a @@ -109,7 +47,7 @@ with sub-word encoding at ``/examples/asr/conf/conformer/conforme .. _Conformer-Transducer_model: Conformer-Transducer --------------------- +~~~~~~~~~~~~~~~~~~~~ Conformer-Transducer is the Conformer model introduced in :cite:`asr-models-gulati2020conformer` and uses RNNT/Transducer loss/decoder. 
It has the same encoder as Conformer-CTC but utilizes RNNT/Transducer loss/decoder which makes it an autoregressive model. @@ -128,6 +66,32 @@ You may find the example config files of Conformer-Transducer model with charact ``/examples/asr/conf/conformer/conformer_transducer_char.yaml`` and with sub-word encoding at ``/examples/asr/conf/conformer/conformer_transducer_bpe.yaml``. +.. _Conformer-HAT_model: + +Conformer-HAT +~~~~~~~~~~~~~ + +Conformer HAT (Hybrid Autoregressive Transducer) model (do not confuse it with Hybrid-Transducer-CTC) is a modification of Conformer-Transducer model based on this previous `work `_. +The main idea is to separate labels and blank score predictions, which allows to estimate the internal LM probabilities during decoding. +When external LM is available for inference, the internal LM can be subtracted from HAT model prediction in beamsearch decoding to improve external LM efficiency. +It can be helpful in the case of text-only adaptation for new domains. + +The only difference from the standard Conformer-Transducer model (RNNT) is the use of `"HATJiont" `_ +class (instead of "RNNTJoint") for joint module. The all HAT logic is implemented in the "HATJiont" class. + + .. image:: images/hat.png + :align: center + :alt: HAT Model + :scale: 50% + +You may find the example config files of Conformer-HAT model with character-based encoding at +``/examples/asr/conf/conformer/hat/conformer_hat_char.yaml`` and +with sub-word encoding at ``/examples/asr/conf/conformer/hat/conformer_hat_bpe.yaml``. + +By default, the decoding for HAT model works in the same way as for Conformer-Transducer. +In the case of external ngram LM fusion you can use ``/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py``. +To enable HAT internal LM subtraction set ``hat_subtract_ilm=True`` and find more appropriate couple of ``beam_alpha`` and ``hat_ilm_weight`` values in terms of the best recognition accuracy. + Fast-Conformer -------------- @@ -242,56 +206,6 @@ To include caching support, `model.set_export_config({'cache_support' : 'True'}) Or, if ``/scripts/export.py`` is being used: `python export.py cache_aware_conformer.nemo cache_aware_conformer.onnx --export-config cache_support=True` -.. _LSTM-Transducer_model: - -LSTM-Transducer ---------------- - -LSTM-Transducer is a model which uses RNNs (eg. LSTM) in the encoder. The architecture of this model is followed from suggestions in :cite:`asr-models-he2019streaming`. -It uses RNNT/Transducer loss/decoder. The encoder consists of RNN layers (LSTM as default) with lower projection size to increase the efficiency. -Layer norm is added between the layers to stabilize the training. -It can be trained/used in unidirectional or bidirectional mode. The unidirectional mode is fully causal and can be used easily for simple and efficient streaming. However the accuracy of this model is generally lower than other models like Conformer and Citrinet. - -This model supports both the sub-word level and character level encodings. You may find the example config file of RNNT model with wordpiece encoding at ``/examples/asr/conf/lstm/lstm_transducer_bpe.yaml``. -You can find more details on the config files for the RNNT models at `LSTM-Transducer <./configs.html#lstm-transducer>`_. - -.. _LSTM-CTC_model: - -LSTM-CTC --------- - -LSTM-CTC model is a CTC-variant of the LSTM-Transducer model which uses CTC loss/decoding instead of Transducer. 
-You may find the example config file of LSTM-CTC model with wordpiece encoding at ``/examples/asr/conf/lstm/lstm_ctc_bpe.yaml``. - -.. _Squeezeformer-CTC_model: - -Squeezeformer-CTC ------------------ - -Squeezeformer-CTC is a CTC-based variant of the Squeezeformer model introduced in :cite:`asr-models-kim2022squeezeformer`. Squeezeformer-CTC has a -similar encoder as the original Squeezeformer but uses CTC loss and decoding instead of RNNT/Transducer loss, which makes it a non-autoregressive model. The vast majority of the architecture is similar to Conformer model, so please refer to `Conformer-CTC <./models.html#conformer-ctc>`_. - -The model primarily differs from Conformer in the following ways : - -* Temporal U-Net style time reduction, effectively reducing memory consumption and FLOPs for execution. -* Unified activations throughout the model. -* Simplification of module structure, removal of redundant layers. - -Here is the overall architecture of the encoder of Squeezeformer-CTC: - - .. image:: images/squeezeformer.png - :align: center - :alt: Squeezeformer-CTC Model - :scale: 50% - -This model supports both the sub-word level and character level encodings. You can find more details on the config files for the -Squeezeformer-CTC models at `Squeezeformer-CTC <./configs.html#squeezeformer-ctc>`_. The variant with sub-word encoding is a BPE-based model -which can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModelBPE` class, while the -character-based variant is based on :class:`~nemo.collections.asr.models.EncDecCTCModel`. - -You may find the example config files of Squeezeformer-CTC model with character-based encoding at -``/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml`` and -with sub-word encoding at ``/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml``. .. _Hybrid-Transducer_CTC_model: @@ -323,32 +237,6 @@ To export as CTC (single encoder+decoder graph), `model.set_export_config({'deco Or, if ``/scripts/export.py`` is being used: `python export.py hybrid_transducer.nemo hybrid_transducer.onnx --export-config decoder_type=ctc` -.. _Conformer-HAT_model: - -Conformer-HAT (Hybrid Autoregressive Transducer) ------------------------------------------------- -Conformer HAT model (do not confuse it with Hybrid-Transducer-CTC) is a modification of Conformer-Transducer model based on `Google paper `_. -The main idea is to separate labels and blank score predictions, which allows to estimate the internal LM probabilities during decoding. -When external LM is available for inference, the internal LM can be subtracted from HAT model prediction in beamsearch decoding to improve external LM efficiency. -It can be helpful in the case of text-only adaptation for new domains. - -The only difference from the standard Conformer-Transducer model (RNNT) is the use of `"HATJiont" `_ -class (instead of "RNNTJoint") for joint module. The all HAT logic is implemented in the "HATJiont" class. - - .. image:: images/hat.png - :align: center - :alt: HAT Model - :scale: 50% - -You may find the example config files of Conformer-HAT model with character-based encoding at -``/examples/asr/conf/conformer/hat/conformer_hat_char.yaml`` and -with sub-word encoding at ``/examples/asr/conf/conformer/hat/conformer_hat_bpe.yaml``. - -By default, the decoding for HAT model works in the same way as for Conformer-Transducer. -In the case of external ngram LM fusion you can use ``/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py``. 
-To enable HAT internal LM subtraction set ``hat_subtract_ilm=True`` and find more appropriate couple of ``beam_alpha`` and ``hat_ilm_weight`` values in terms of the best recognition accuracy. - - .. _Hybrid-ASR-TTS_model: Hybrid ASR-TTS Model @@ -406,6 +294,125 @@ A typical workflow to create and use the ensemble is like this Note that the ensemble cannot be modified after construction (e.g. it does not support finetuning) and only transcribe functionality is supported (e.g., ``.forward()`` is not properly defined). +.. _Jasper_model: + +Jasper +------ + +Jasper ("Just Another Speech Recognizer") :cite:`asr-models-li2019jasper` is a deep time delay neural network (TDNN) comprising of +blocks of 1D-convolutional layers. The Jasper family of models are denoted as ``Jasper_[BxR]`` where ``B`` is the number of blocks +and ``R`` is the number of convolutional sub-blocks within a block. Each sub-block contains a 1-D convolution, batch normalization, +ReLU, and dropout: + + .. image:: images/jasper_vertical.png + :align: center + :alt: jasper model + :scale: 50% + +Jasper models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModel` class. + +.. _Quartznet_model: + +QuartzNet +--------- + +QuartzNet :cite:`asr-models-kriman2019quartznet` is a version of Jasper :cite:`asr-models-li2019jasper` model with separable +convolutions and larger filters. It can achieve performance similar to Jasper but with an order of magnitude fewer parameters. +Similarly to Jasper, the QuartzNet family of models are denoted as ``QuartzNet_[BxR]`` where ``B`` is the number of blocks and ``R`` +is the number of convolutional sub-blocks within a block. Each sub-block contains a 1-D *separable* convolution, batch normalization, +ReLU, and dropout: + + .. image:: images/quartz_vertical.png + :align: center + :alt: quartznet model + :scale: 40% + +QuartzNet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModel` class. + + +.. _Citrinet_model: + +Citrinet +-------- + +Citrinet is a version of QuartzNet :cite:`asr-models-kriman2019quartznet` that extends ContextNet :cite:`asr-models-han2020contextnet`, +utilizing subword encoding (via Word Piece tokenization) and Squeeze-and-Excitation mechanism :cite:`asr-models-hu2018squeeze` to +obtain highly accurate audio transcripts while utilizing a non-autoregressive CTC based decoding scheme for efficient inference. + + .. image:: images/citrinet_vertical.png + :align: center + :alt: citrinet model + :scale: 50% + +Citrinet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModelBPE` class. + +.. _ContextNet_model: + +ContextNet +---------- + +ContextNet is a model uses Transducer/RNNT loss/decoder and is introduced in :cite:`asr-models-han2020contextnet`. +It uses Squeeze-and-Excitation mechanism :cite:`asr-models-hu2018squeeze` to model larger context. +Unlike Citrinet, it has an autoregressive decoding scheme. + +ContextNet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecRNNTBPEModel` class for a +model with sub-word encoding and :class:`~nemo.collections.asr.models.EncDecRNNTModel` for char-based encoding. + +You may find the example config files of ContextNet model with character-based encoding at +``/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml`` and +with sub-word encoding at ``/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml``. + +.. 
_Squeezeformer-CTC_model: + +Squeezeformer-CTC +----------------- + +Squeezeformer-CTC is a CTC-based variant of the Squeezeformer model introduced in :cite:`asr-models-kim2022squeezeformer`. Squeezeformer-CTC has a +similar encoder as the original Squeezeformer but uses CTC loss and decoding instead of RNNT/Transducer loss, which makes it a non-autoregressive model. The vast majority of the architecture is similar to Conformer model, so please refer to `Conformer-CTC <./models.html#conformer-ctc>`_. + +The model primarily differs from Conformer in the following ways : + +* Temporal U-Net style time reduction, effectively reducing memory consumption and FLOPs for execution. +* Unified activations throughout the model. +* Simplification of module structure, removal of redundant layers. + +Here is the overall architecture of the encoder of Squeezeformer-CTC: + + .. image:: images/squeezeformer.png + :align: center + :alt: Squeezeformer-CTC Model + :scale: 50% + +This model supports both the sub-word level and character level encodings. You can find more details on the config files for the +Squeezeformer-CTC models at `Squeezeformer-CTC <./configs.html#squeezeformer-ctc>`_. The variant with sub-word encoding is a BPE-based model +which can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModelBPE` class, while the +character-based variant is based on :class:`~nemo.collections.asr.models.EncDecCTCModel`. + +You may find the example config files of Squeezeformer-CTC model with character-based encoding at +``/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml`` and +with sub-word encoding at ``/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml``. + +.. _LSTM-Transducer_model: + +LSTM-Transducer +--------------- + +LSTM-Transducer is a model which uses RNNs (eg. LSTM) in the encoder. The architecture of this model is followed from suggestions in :cite:`asr-models-he2019streaming`. +It uses RNNT/Transducer loss/decoder. The encoder consists of RNN layers (LSTM as default) with lower projection size to increase the efficiency. +Layer norm is added between the layers to stabilize the training. +It can be trained/used in unidirectional or bidirectional mode. The unidirectional mode is fully causal and can be used easily for simple and efficient streaming. However the accuracy of this model is generally lower than other models like Conformer and Citrinet. + +This model supports both the sub-word level and character level encodings. You may find the example config file of RNNT model with wordpiece encoding at ``/examples/asr/conf/lstm/lstm_transducer_bpe.yaml``. +You can find more details on the config files for the RNNT models at `LSTM-Transducer <./configs.html#lstm-transducer>`_. + +.. _LSTM-CTC_model: + +LSTM-CTC +-------- + +LSTM-CTC model is a CTC-variant of the LSTM-Transducer model which uses CTC loss/decoding instead of Transducer. +You may find the example config file of LSTM-CTC model with wordpiece encoding at ``/examples/asr/conf/lstm/lstm_ctc_bpe.yaml``. 
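The model classes mentioned above (``EncDecCTCModel``, ``EncDecCTCModelBPE``, ``EncDecRNNTBPEModel``) are typically loaded from released checkpoints via ``from_pretrained``. A minimal usage sketch follows; the checkpoint name is only an example, and the exact ``transcribe`` signature varies between NeMo releases:

```python
# Minimal sketch: load a pretrained CTC-BPE checkpoint and transcribe one file.
import nemo.collections.asr as nemo_asr

asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="stt_en_citrinet_512")
transcriptions = asr_model.transcribe(["path/to/audio.wav"])
print(transcriptions[0])
```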
+ References ---------- From 0675b9dca908bfe3b7c4be838af5c7a0640b14c1 Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Fri, 15 Mar 2024 12:22:57 -0700 Subject: [PATCH 028/140] Add generic check for dataloader_iter with PTL 2.2 (#8647) Signed-off-by: Abhishree Co-authored-by: Pablo Garay --- .../language_modeling/megatron_gpt_model.py | 22 +++++++------------ .../megatron_lm_encoder_decoder_model.py | 11 ++++------ .../megatron_t5_sft_model.py | 11 ++++------ .../machine_translation/megatron_nmt_model.py | 11 ++++------ 4 files changed, 20 insertions(+), 35 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 7cdb8b3abb37..c44f95fccad4 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -936,13 +936,10 @@ def get_batch(self, data_iterator, tuning): # Broadcast data. if data_iterator is not None: - # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple - # from the dataloader_iter are already extracted in the child class validation steps. In that case extact only the batch - # from the data_iterator - if isinstance(data_iterator, _DataFetcherWrapper): - data, _, _ = next(data_iterator) - else: - data = next(data_iterator) + # If tuple, 1st element in it is the batch since dataloader_iter returns batch, batch_idx, dataloader_idx + data = next(data_iterator) + if isinstance(data, tuple): + data = data[0] else: data = None @@ -1113,13 +1110,10 @@ def loss_func(output_tensor): def get_forward_output_only_func(self): def fwd_output_only_func(dataloader_iter, model): - # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple - # from the dataloader_iter are already extracted in the child class validation steps. In that case extact only the batch - # from the data_iterator - if isinstance(dataloader_iter, _DataFetcherWrapper): - batch, _, _ = next(dataloader_iter) - else: - batch = next(dataloader_iter) + # If tuple, 1st element in it is the batch since dataloader_iter returns batch, batch_idx, dataloader_idx + batch = next(dataloader_iter) + if isinstance(batch, tuple): + batch = batch[0] extra_arg = {} if len(batch) == 3: batch = [x.cuda() for x in batch] diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index e016022a6c44..651034c91520 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -567,13 +567,10 @@ def _process_batch(self, global_batch: Dict[str, torch.Tensor]) -> List[torch.Te def get_forward_output_and_loss_func(self): def fwd_output_and_loss_func(dataloader_iter, model): - # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple - # from the dataloader_iter are already extracted in the child class or previous functions. 
In that case extact only the batch - # from the data_iterator - if isinstance(dataloader_iter, _DataFetcherWrapper): - batch, _, _ = next(dataloader_iter) - else: - batch = next(dataloader_iter) + # If tuple, 1st element in it is the batch since dataloader_iter returns batch, batch_idx, dataloader_idx + batch = next(dataloader_iter) + if isinstance(batch, tuple): + batch = batch[0] # convert to list if not already converted. if isinstance(batch, dict): # convert to list if not already converted. diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py index 0b32530668be..2344dac3a64a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_sft_model.py @@ -293,13 +293,10 @@ def fwd_bwd_step(self, dataloader_iter, forward_only): Dataloader produces a global batch which is turned into a list of microbatches. The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. """ - # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple - # from the dataloader_iter are already extracted in the child class. In that case extact only the batch - # from the data_iterator - if isinstance(dataloader_iter, _DataFetcherWrapper): - batch, _, _ = next(dataloader_iter) - else: - batch = next(dataloader_iter) + # If tuple, 1st element in it is the batch since dataloader_iter returns batch, batch_idx, dataloader_idx + batch = next(dataloader_iter) + if isinstance(batch, tuple): + batch = batch[0] if isinstance(batch, dict): # convert to list if not already converted. batch = self._process_batch(batch) diff --git a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py index 952c76ce929e..5a41682a4b5b 100644 --- a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py +++ b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py @@ -292,13 +292,10 @@ def fwd_bwd_step(self, dataloader_iter, forward_only): Dataloader produces a global batch which is turned into a list of microbatches. The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. """ - # Check if instance of PTL's _DataFetcherWrapper or not, since sometimes (batch, batch_idx, dataloader_idx) as a tuple - # from the dataloader_iter are already extracted in the child class or previous functions. In that case extact only the batch - # from the data_iterator - if isinstance(dataloader_iter, _DataFetcherWrapper): - batch, _, _ = next(dataloader_iter) - else: - batch = next(dataloader_iter) + # If tuple, 1st element in it is the batch since dataloader_iter returns batch, batch_idx, dataloader_idx + batch = next(dataloader_iter) + if isinstance(batch, tuple): + batch = batch[0] if isinstance(batch, dict): # convert to list if not already converted. 
batch = self._process_batch(batch) From 9f8137458be72a75f3ca262355ae75211f929d85 Mon Sep 17 00:00:00 2001 From: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Date: Fri, 15 Mar 2024 12:51:38 -0700 Subject: [PATCH 029/140] Docs updates: compress NeMo core, add ASR Model spotlight (#8671) * make core_index.rst file Signed-off-by: Elena Rastorgueva * rename title from Tasks to NLP Tasks Signed-off-by: Elena Rastorgueva * move neural modules section to own RST file Signed-off-by: Elena Rastorgueva * add asr Spotlight Models Signed-off-by: Elena Rastorgueva * replace embedded hf space with canary one Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva --- docs/source/asr/asr_all.bib | 8 +++ docs/source/asr/intro.rst | 8 ++- docs/source/asr/models.rst | 34 +++++++++++ docs/source/core/core.rst | 90 ----------------------------- docs/source/core/core_index.rst | 37 ++++++++++++ docs/source/core/neural_modules.rst | 88 ++++++++++++++++++++++++++++ docs/source/index.rst | 7 +-- docs/source/nlp/models.rst | 4 +- 8 files changed, 176 insertions(+), 100 deletions(-) create mode 100644 docs/source/core/core_index.rst create mode 100644 docs/source/core/neural_modules.rst diff --git a/docs/source/asr/asr_all.bib b/docs/source/asr/asr_all.bib index 17caa233013e..11998d30cd5e 100644 --- a/docs/source/asr/asr_all.bib +++ b/docs/source/asr/asr_all.bib @@ -1033,3 +1033,11 @@ @misc{park2022multi year = {2022}, copyright = {Creative Commons Attribution 4.0 International} } + +@inproceedings{vaswani2017attention, + title={Attention is all you need}, + author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia}, + booktitle={Advances in Neural Information Processing Systems}, + pages={6000--6010}, + year={2017} +} \ No newline at end of file diff --git a/docs/source/asr/intro.rst b/docs/source/asr/intro.rst index 79d1f3e3e3f8..8a244c3ea28d 100644 --- a/docs/source/asr/intro.rst +++ b/docs/source/asr/intro.rst @@ -148,7 +148,11 @@ There is also more information about the ASR model architectures available in Ne Try out NeMo ASR transcription in your browser ---------------------------------------------- -You can try out transcription with NeMo ASR models without leaving your browser, by using the HuggingFace Space embedded below. +You can try out transcription with a NeMo ASR model without leaving your browser, by using the HuggingFace Space embedded below. + +This HuggingFace Space uses `Canary-1B `__, the latest ASR model from NVIDIA NeMo. It sits at the top of the `HuggingFace OpenASR Leaderboard `__ at time of publishing. + +Canary-1B is a multi-lingual, multi-task model, supporting automatic speech-to-text recognition (ASR) in 4 languages (English, German, French, Spanish) as well as translation between English and the 3 other supported languages. .. raw:: html @@ -184,7 +188,7 @@ For more information, see additional sections in the ASR docs on the left-hand-s models datasets - asr_language_modeling_and_customization + asr_language_modeling results scores configs diff --git a/docs/source/asr/models.rst b/docs/source/asr/models.rst index cb7457b2d5d8..4f05cec410fa 100644 --- a/docs/source/asr/models.rst +++ b/docs/source/asr/models.rst @@ -13,6 +13,38 @@ Pretrained checkpoints for all of these models, as well as instructions on how t section. You can use the available checkpoints for immediate inference, or fine-tune them on your own datasets. 
The checkpoints section also contains benchmark results for the available ASR models. + +Spotlight Models +---------------- + +Canary +~~~~~~ + +Canary-1B is the latest ASR model from NVIDIA NeMo. It sits at the top of the `HuggingFace OpenASR Leaderboard `__ at time of publishing. + +You can `download the checkpoint `__ or try out Canary in action in this `HuggingFace Space `__. + +Canary-1B is an encoder-decoder model with a :ref:`FastConformer Encoder ` and Transformer Decoder :cite:`asr-models-vaswani2017attention`. + +It is a multi-lingual, multi-task model, supporting automatic speech-to-text recognition (ASR) in 4 languages (English, German, French, Spanish) as well as translation between English and the 3 other supported languages. + + +Parakeet +~~~~~~~~ + +Parakeet is the name of a family of ASR models with a :ref:`FastConformer Encoder ` and a CTC, RNN-T, or TDT decoder. + +Model checkpoints: + +* `Parakeet-CTC-0.6B `__ and `Parakeet-CTC-1.1B `__ model cards +* `Parakeet-RNNT-0.6B `__ and `Parakeet-RNNT-1.1B `__ model cards +* `Parakeet-TDT-1.1B `__ model card + +HuggingFace Spaces to try out Parakeet models in your browser: + +* `Parakeet-RNNT-1.1B `__ space +* `Parakeet-TDT-1.1B `__ space + .. _Conformer_model: Conformer --------- @@ -92,6 +124,8 @@ By default, the decoding for HAT model works in the same way as for Conformer-Tr In the case of external ngram LM fusion you can use ``/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py``. To enable HAT internal LM subtraction set ``hat_subtract_ilm=True`` and find more appropriate couple of ``beam_alpha`` and ``hat_ilm_weight`` values in terms of the best recognition accuracy. +.. _Fast-Conformer: + Fast-Conformer -------------- diff --git a/docs/source/core/core.rst b/docs/source/core/core.rst index a71495800216..7fe4a65cc32f 100644 --- a/docs/source/core/core.rst +++ b/docs/source/core/core.rst @@ -740,93 +740,3 @@ To register a child model, use the ``register_nemo_submodule`` method of the par else: self.child_model = None - -Neural Modules -============== - -NeMo is built around Neural Modules, conceptual blocks of neural networks that take typed inputs and produce typed outputs. Such -modules typically represent data layers, encoders, decoders, language models, loss functions, or methods of combining activations. -NeMo makes it easy to combine and re-use these building blocks while providing a level of semantic correctness checking via its neural -type system. - -.. note:: *All Neural Modules inherit from ``torch.nn.Module`` and are therefore compatible with the PyTorch ecosystem.* - -There are 3 types on Neural Modules: - - - Regular modules - - Dataset/IterableDataset - - Losses - -Every Neural Module in NeMo must inherit from `nemo.core.classes.module.NeuralModule` class. - -.. autoclass:: nemo.core.classes.module.NeuralModule - -Every Neural Modules inherits the ``nemo.core.classes.common.Typing`` interface and needs to define neural types for its inputs and outputs. -This is done by defining two properties: ``input_types`` and ``output_types``. Each property should return an ordered dictionary of -"port name"->"port neural type" pairs. Here is the example from :class:`~nemo.collections.asr.modules.ConvASREncoder` class: - -.. 
code-block:: python - - @property - def input_types(self): - return OrderedDict( - { - "audio_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - ) - - @property - def output_types(self): - return OrderedDict( - { - "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - } - ) - - @typecheck() - def forward(self, audio_signal, length=None): - ... - -The code snippet above means that ``nemo.collections.asr.modules.conv_asr.ConvASREncoder`` expects two arguments: - * First one, named ``audio_signal`` of shape ``[batch, dimension, time]`` with elements representing spectrogram values. - * Second one, named ``length`` of shape ``[batch]`` with elements representing lengths of corresponding signals. - -It also means that ``.forward(...)`` and ``__call__(...)`` methods each produce two outputs: - * First one, of shape ``[batch, dimension, time]`` but with elements representing encoded representation (``AcousticEncodedRepresentation`` class). - * Second one, of shape ``[batch]``, corresponding to their lengths. - -.. tip:: It is a good practice to define types and add ``@typecheck()`` decorator to your ``.forward()`` method after your module is ready for use by others. - -.. note:: The outputs of ``.forward(...)`` method will always be of type ``torch.Tensor`` or container of tensors and will work with any other Pytorch code. The type information is attached to every output tensor. If tensors without types is passed to your module, it will not fail, however the types will not be checked. Thus, it is recommended to define input/output types for all your modules, starting with data layers and add ``@typecheck()`` decorator to them. - -.. note:: To temporarily disable typechecking, you can enclose your code in ```with typecheck.disable_checks():``` statement. - - -Dynamic Layer Freezing ----------------------- - -You can selectively freeze any modules inside a Nemo model by specifying a freezing schedule in the config yaml. Freezing stops any gradient updates -to that module, so that its weights are not changed for that step. This can be useful for combatting catastrophic forgetting, for example -when finetuning a large pretrained model on a small dataset. - -The default approach is to freeze a module for the first N training steps, but you can also enable freezing for a specific range of steps, -for example, from step 20 - 100, or even activate freezing from some N until the end of training. You can also freeze a module for the entire training run. -Dynamic freezing is specified in training steps, not epochs. - -To enable freezing, add the following to your config: - -.. code-block:: yaml - - model: - ... 
- freeze_updates: - enabled: true # set to false if you want to disable freezing - - modules: # list all of the modules you want to have freezing logic for - encoder: 200 # module will be frozen for the first 200 training steps - decoder: [50, -1] # module will be frozen at step 50 and will remain frozen until training ends - joint: [10, 100] # module will be frozen between step 10 and step 100 (step >= 10 and step <= 100) - transcoder: -1 # module will be frozen for the entire training run - diff --git a/docs/source/core/core_index.rst b/docs/source/core/core_index.rst new file mode 100644 index 000000000000..28cd149bdcb5 --- /dev/null +++ b/docs/source/core/core_index.rst @@ -0,0 +1,37 @@ +========= +NeMo Core +========= + +You can learn more about the underlying principles of the NeMo codebase in this section. + +The `NeMo Framework codebase `__ is composed of a `core `__ section which contains the main building blocks of the framework, and various `collections `__ which help you +build specialized AI models. + +You can learn more about aspects of the NeMo "core" by following the links below: + +.. toctree:: + :maxdepth: 1 + :name: core + :titlesonly: + + core + neural_modules + exp_manager + neural_types + export + adapters/intro + api + + + +Alternatively, you can jump straight to the documentation for the individual collections: + +* :doc:`Large Language Models (LLMs) <../nlp/nemo_megatron/intro>` + +* :doc:`Automatic Speech Recognition (ASR) <../asr/intro>` + +* :doc:`Multimodal (MM) Models <../multimodal/mllm/intro>` + +* :doc:`Text-to-Speech (TTS) <../tts/intro>` + +* :doc:`Computer Vision (CV) <../vision/intro>` diff --git a/docs/source/core/neural_modules.rst b/docs/source/core/neural_modules.rst new file mode 100644 index 000000000000..fbeec5440d01 --- /dev/null +++ b/docs/source/core/neural_modules.rst @@ -0,0 +1,88 @@ +Neural Modules +============== + +NeMo is built around Neural Modules, conceptual blocks of neural networks that take typed inputs and produce typed outputs. Such +modules typically represent data layers, encoders, decoders, language models, loss functions, or methods of combining activations. +NeMo makes it easy to combine and re-use these building blocks while providing a level of semantic correctness checking via its neural +type system. + +.. note:: *All Neural Modules inherit from ``torch.nn.Module`` and are therefore compatible with the PyTorch ecosystem.* + +There are 3 types on Neural Modules: + + - Regular modules + - Dataset/IterableDataset + - Losses + +Every Neural Module in NeMo must inherit from `nemo.core.classes.module.NeuralModule` class. + +.. autoclass:: nemo.core.classes.module.NeuralModule + +Every Neural Modules inherits the ``nemo.core.classes.common.Typing`` interface and needs to define neural types for its inputs and outputs. +This is done by defining two properties: ``input_types`` and ``output_types``. Each property should return an ordered dictionary of +"port name"->"port neural type" pairs. Here is the example from :class:`~nemo.collections.asr.modules.ConvASREncoder` class: + +.. 
code-block:: python + + @property + def input_types(self): + return OrderedDict( + { + "audio_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()), + "length": NeuralType(tuple('B'), LengthsType()), + } + ) + + @property + def output_types(self): + return OrderedDict( + { + "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), + "encoded_lengths": NeuralType(tuple('B'), LengthsType()), + } + ) + + @typecheck() + def forward(self, audio_signal, length=None): + ... + +The code snippet above means that ``nemo.collections.asr.modules.conv_asr.ConvASREncoder`` expects two arguments: + * First one, named ``audio_signal`` of shape ``[batch, dimension, time]`` with elements representing spectrogram values. + * Second one, named ``length`` of shape ``[batch]`` with elements representing lengths of corresponding signals. + +It also means that ``.forward(...)`` and ``__call__(...)`` methods each produce two outputs: + * First one, of shape ``[batch, dimension, time]`` but with elements representing encoded representation (``AcousticEncodedRepresentation`` class). + * Second one, of shape ``[batch]``, corresponding to their lengths. + +.. tip:: It is a good practice to define types and add ``@typecheck()`` decorator to your ``.forward()`` method after your module is ready for use by others. + +.. note:: The outputs of ``.forward(...)`` method will always be of type ``torch.Tensor`` or container of tensors and will work with any other Pytorch code. The type information is attached to every output tensor. If tensors without types is passed to your module, it will not fail, however the types will not be checked. Thus, it is recommended to define input/output types for all your modules, starting with data layers and add ``@typecheck()`` decorator to them. + +.. note:: To temporarily disable typechecking, you can enclose your code in ```with typecheck.disable_checks():``` statement. + + +Dynamic Layer Freezing +---------------------- + +You can selectively freeze any modules inside a Nemo model by specifying a freezing schedule in the config yaml. Freezing stops any gradient updates +to that module, so that its weights are not changed for that step. This can be useful for combatting catastrophic forgetting, for example +when finetuning a large pretrained model on a small dataset. + +The default approach is to freeze a module for the first N training steps, but you can also enable freezing for a specific range of steps, +for example, from step 20 - 100, or even activate freezing from some N until the end of training. You can also freeze a module for the entire training run. +Dynamic freezing is specified in training steps, not epochs. + +To enable freezing, add the following to your config: + +.. code-block:: yaml + + model: + ... 
+ freeze_updates: + enabled: true # set to false if you want to disable freezing + + modules: # list all of the modules you want to have freezing logic for + encoder: 200 # module will be frozen for the first 200 training steps + decoder: [50, -1] # module will be frozen at step 50 and will remain frozen until training ends + joint: [10, 100] # module will be frozen between step 10 and step 100 (step >= 10 and step <= 100) + transcoder: -1 # module will be frozen for the entire training run diff --git a/docs/source/index.rst b/docs/source/index.rst index 9b62174ecbe2..7bf97cb779c3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -40,12 +40,7 @@ For more information, browse the developer docs for your area of interest in the :name: core :titlesonly: - core/core - core/exp_manager - core/neural_types - core/export - core/adapters/intro - core/api + core/core_index .. toctree:: diff --git a/docs/source/nlp/models.rst b/docs/source/nlp/models.rst index ad50d976db9f..2654cfca26d8 100755 --- a/docs/source/nlp/models.rst +++ b/docs/source/nlp/models.rst @@ -1,7 +1,7 @@ .. _nlp_models: -Tasks -===== +NLP Tasks +========= NeMo's NLP collection supports provides the following task-specific models: From bd958aab80f85896e0cb2234426f35c381230cd9 Mon Sep 17 00:00:00 2001 From: Valerie Sarge Date: Fri, 15 Mar 2024 13:34:47 -0700 Subject: [PATCH 030/140] Add option to write nemo checkpoint as loose files instead of compacted .nemo from megatron_ckpt_to_nemo.py (#8641) Signed-off-by: Valerie Sarge Co-authored-by: Eric Harper --- .../nlp/language_modeling/megatron_ckpt_to_nemo.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py index c58ae7f156eb..40ba35f819ef 100644 --- a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py @@ -78,6 +78,11 @@ def get_args(): help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", ) parser.add_argument("--nemo_file_path", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument( + "--no_pack_nemo_file", + action="store_true", + help="If passed, output will be written under nemo_file_path as a directory instead of packed as a tarred .nemo file.", + ) parser.add_argument("--gpus_per_node", type=int, required=True, default=None) parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None) parser.add_argument("--pipeline_model_parallel_size", type=int, required=True, default=None) @@ -215,11 +220,17 @@ def convert(local_rank, rank, world_size, args): checkpoint_path, hparams_file=args.hparams_file, trainer=trainer ) model._save_restore_connector = NLPSaveRestoreConnector() + save_file_path = args.nemo_file_path + if args.no_pack_nemo_file: + # With --no_pack_nemo_file, nemo_file_path is expected to be a directory. + # Adding a dummy model filename here conforms with SaveRestoreConnector's convention. 
+ model._save_restore_connector.pack_nemo_file = False + save_file_path = os.path.join(save_file_path, 'model.nemo') if torch.distributed.is_initialized(): torch.distributed.barrier() - model.save_to(args.nemo_file_path) + model.save_to(save_file_path) logging.info(f'NeMo model saved to: {args.nemo_file_path}') From 24f6f9606e8db238669209664bfaa601b590aa46 Mon Sep 17 00:00:00 2001 From: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Date: Fri, 15 Mar 2024 15:20:38 -0700 Subject: [PATCH 031/140] update hf space to be canary (#8675) Signed-off-by: Elena Rastorgueva --- docs/source/asr/intro.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/asr/intro.rst b/docs/source/asr/intro.rst index 8a244c3ea28d..d8fe1f105caf 100644 --- a/docs/source/asr/intro.rst +++ b/docs/source/asr/intro.rst @@ -156,7 +156,7 @@ Canary-1B is a multi-lingual, multi-task model, supporting automatic speech-to-t .. raw:: html -