From 327c6bb8be478dd621da7188b86c3774f43f013a Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 9 May 2024 08:28:10 -0700 Subject: [PATCH 1/8] add torch dist test to ci/cd Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 99 +++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index ef3206e48f69..1e28e83fa23d 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4290,6 +4290,104 @@ jobs: - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" + L2_Megatron_GPT_Pretraining_and_Resume_Training_Torch_Dist_TP2_PP2: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.dist_ckpt_format=torch_dist \ + model.tensor_model_parallel_size=2 \ + model.pipeline_model_parallel_size=1 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.dist_ckpt_format=torch_dist \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + 
model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] runs-on: self-hosted-azure @@ -6588,6 +6686,7 @@ jobs: - L2_Megatron_RETRO_Pretraining_and_Resume_Training - L2_BioMegatron_Bert_NER_Task - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2 + - L2_Megatron_GPT_Pretraining_and_Resume_Training_Torch_Dist_TP2_PP2 - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2 From c48a7f0e6ecc78388d6135b35fe3ff4dd494cd53 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 9 May 2024 14:38:06 -0700 Subject: [PATCH 2/8] set ckpt strategy to torch_dist Signed-off-by: dimapihtar --- .../nlp/language_modeling/conf/megatron_bert_config.yaml | 5 ++++- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 2 +- .../tuning/conf/megatron_gpt_finetuning_config.yaml | 3 +++ scripts/checkpoint_converters/convert_zarr_to_torch_dist.py | 2 +- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index bc66ae717ebb..23a003a1fc62 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -95,6 +95,9 @@ model: onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + # Distributed checkpoint format + dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. + ## Activation Checkpointing # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). 
@@ -158,4 +161,4 @@ model: name: CosineAnnealing warmup_steps: 500 constant_steps: 50000 - min_lr: 2e-5 \ No newline at end of file + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 57c82726ae11..8eb50775522c 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -151,7 +151,7 @@ model: fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint. # Distributed checkpoint format - dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. + dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. ## Activation Checkpointing # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml index 6517b62010b4..b6b933bf8f31 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml @@ -82,6 +82,9 @@ model: fsdp_grad_reduce_dtype: 'fp32' # Gradient reduction data type. fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint. fsdp_use_orig_params: False # Set to True to use FSDP for specific peft scheme. + + # Distributed checkpoint format + dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. peft: peft_scheme: "adapter" # can be either adapter,ia3, or ptuning diff --git a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py index 29b56aa706fa..ebd42f8f99d7 100644 --- a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py +++ b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py @@ -167,7 +167,7 @@ def convert(local_rank, rank, world_size, args): ) with open_dict(model.cfg): - model.cfg.torch_distributed_checkpoint = True + model.cfg.dist_ckpt_format='torch_dist' model._save_restore_connector = NLPSaveRestoreConnector() save_file_path = args.path_to_save From 7752b8d03325f5f51c54f4b492fae154c081ce2c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 May 2024 21:39:34 +0000 Subject: [PATCH 3/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/checkpoint_converters/convert_zarr_to_torch_dist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py index ebd42f8f99d7..eb16889c8def 100644 --- a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py +++ b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py @@ -167,7 +167,7 @@ def convert(local_rank, rank, world_size, args): ) with open_dict(model.cfg): - model.cfg.dist_ckpt_format='torch_dist' + model.cfg.dist_ckpt_format = 'torch_dist' model._save_restore_connector = NLPSaveRestoreConnector() save_file_path = args.path_to_save From ad0b6430cd1dfe5c09ee02cce8650e2e629f8bc8 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 9 May 2024 14:57:47 -0700 Subject: [PATCH 4/8] set torch_dist ckpt strategy as 
default strategy Signed-off-by: dimapihtar --- nemo/collections/nlp/parts/megatron_trainer_builder.py | 2 +- nemo/collections/nlp/parts/nlp_overrides.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo/collections/nlp/parts/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py index 367cf46c6fd0..957e89a088ac 100644 --- a/nemo/collections/nlp/parts/megatron_trainer_builder.py +++ b/nemo/collections/nlp/parts/megatron_trainer_builder.py @@ -138,7 +138,7 @@ def _plugins(self) -> list: self.cfg.model.get('mcore_gpt', False) or self.cfg.model.get('mcore_bert', False) ) if use_dist_ckpt: - plugins.append(DistributedCheckpointIO(self.cfg.model.get('dist_ckpt_format', 'zarr'))) + plugins.append(DistributedCheckpointIO(self.cfg.model.get('dist_ckpt_format', 'torch_dist'))) return plugins diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 97661c752c52..86a3fbe005be 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -474,7 +474,7 @@ def use_distributed_checkpointing(self): logging.warning( 'Distributed checkpoints requires DistributedCheckpointIO plugin to be used. Setting up a default now.' ) - self.checkpoint_io = DistributedCheckpointIO(self.lightning_module.cfg.get('dist_ckpt_format', 'zarr')) + self.checkpoint_io = DistributedCheckpointIO(self.lightning_module.cfg.get('dist_ckpt_format', 'torch_dist')) if not has_sharded_state_dict and has_dist_ckpt_io: logging.warning( 'DistributedCheckpointIO configured but should not be used. Reverting back to TorchCheckpointIO' @@ -866,7 +866,7 @@ def dummy(): if model.trainer.strategy.launcher is not None: model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) model.trainer.strategy.setup_environment() - checkpoint_io = DistributedCheckpointIO(model.cfg.get('dist_ckpt_format', 'zarr')) + checkpoint_io = DistributedCheckpointIO(model.cfg.get('dist_ckpt_format', 'torch_dist')) checkpoint_io.save_checkpoint(sharded_state_dict, dist_ckpt_dir) else: @@ -1150,7 +1150,7 @@ def dummy(): tmp_model_weights_ckpt = os.path.join(tmpdir, self.model_weights_ckpt) tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0] assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.' - checkpoint_io = DistributedCheckpointIO(conf.get('dist_ckpt_format', 'zarr')) + checkpoint_io = DistributedCheckpointIO(conf.get('dist_ckpt_format', 'torch_dist')) checkpoint = checkpoint_io.load_checkpoint(tmp_model_weights_dir, sharded_state_dict=checkpoint) instance.on_load_checkpoint(checkpoint) if hasattr(instance, 'setup_transformer_engine_tp_groups'): From 5eb382a6c10a1356384c0278bda76f7aaa784b11 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 May 2024 22:00:02 +0000 Subject: [PATCH 5/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/collections/nlp/parts/nlp_overrides.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 86a3fbe005be..73308c8bbeb7 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -474,7 +474,9 @@ def use_distributed_checkpointing(self): logging.warning( 'Distributed checkpoints requires DistributedCheckpointIO plugin to be used. 
Setting up a default now.' ) - self.checkpoint_io = DistributedCheckpointIO(self.lightning_module.cfg.get('dist_ckpt_format', 'torch_dist')) + self.checkpoint_io = DistributedCheckpointIO( + self.lightning_module.cfg.get('dist_ckpt_format', 'torch_dist') + ) if not has_sharded_state_dict and has_dist_ckpt_io: logging.warning( 'DistributedCheckpointIO configured but should not be used. Reverting back to TorchCheckpointIO' From 0ad0b3c99532541cbae1258b48820994d2d8b3a2 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 10 May 2024 12:11:23 +0000 Subject: [PATCH 6/8] Apply isort and black reformatting Signed-off-by: dimapihtar --- nemo/collections/nlp/parts/nlp_overrides.py | 98 +++++++++++-------- .../convert_zarr_to_torch_dist.py | 12 ++- 2 files changed, 65 insertions(+), 45 deletions(-) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index c7394dbbc314..bf27d8ef8038 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -120,7 +120,7 @@ def init_model_parallel( sharp: bool, nccl_communicator_config_path: str = None, distributed_timeout_minutes: int = 30 ) -> None: - """ Initializes Megatron-LM model parallel if using model parallelism. + """Initializes Megatron-LM model parallel if using model parallelism. Args: sharp: Apply SHARP to NCCL data-parallel communication. @@ -164,7 +164,7 @@ def init_model_parallel( class NLPDDPStrategy(DDPStrategy): - """ DDP plugin for Pytorch Lightning. Needed to customize DDP for model parallel models. + """DDP plugin for Pytorch Lightning. Needed to customize DDP for model parallel models. Args: no_ddp_communication_hook: Disable DDP communication hook when using AMP-O2 @@ -231,8 +231,8 @@ def setup_distributed(self, global_rank: int = None, world_size: int = None) -> ) def configure_ddp(self): - """ Override LightningModule ddp if using model parallel. - Sets find_unused_parameters to False to use activation-checkpoint-recomputation. + """Override LightningModule ddp if using model parallel. + Sets find_unused_parameters to False to use activation-checkpoint-recomputation. """ if (hasattr(self.model, 'megatron_amp_O2') and self.model.megatron_amp_O2) or ( @@ -406,7 +406,7 @@ def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = Tr self.lightning_module.load_state_dict(checkpoint["state_dict"], strict=strict) def _fix_tensors_device(self, ckpt: Dict) -> Dict: - """ Ensure checkpoint tensors are on the correct device.""" + """Ensure checkpoint tensors are on the correct device.""" assert torch.cuda.is_initialized(), (torch.cuda.is_available(), torch.cuda.is_initialized()) cur_dev = torch.device("cuda", index=torch.cuda.current_device()) @@ -418,10 +418,10 @@ def _fix_device(t): return dict_list_map_outplace(_fix_device, ckpt) def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: - """ PTL method which we override to integrate distributed checkpoints for model parallel models. - In order to load distributed checkpoints we need to provide the sharded_state_dict to - the distributed load function. We get the sharded_state_dict from self.lightning_module - which makes it convenient to have the loading logic happen at the strategy level. + """PTL method which we override to integrate distributed checkpoints for model parallel models. + In order to load distributed checkpoints we need to provide the sharded_state_dict to + the distributed load function. 
We get the sharded_state_dict from self.lightning_module + which makes it convenient to have the loading logic happen at the strategy level. """ fs = get_filesystem(checkpoint_path) @@ -500,15 +500,15 @@ def distributed_sampler_kwargs(self): @property def restore_checkpoint_after_setup(self) -> bool: - """ This needs to be True for distributed checkpointing because - we require the model to have configured the optimizer before - deserializing the checkpoint. + """This needs to be True for distributed checkpointing because + we require the model to have configured the optimizer before + deserializing the checkpoint. """ return True class NLPDDPStrategyNotebook(NLPDDPStrategy): - """ Version of NLPDDPStrategy to be used in a Jupyter Notebook + """Version of NLPDDPStrategy to be used in a Jupyter Notebook A large portion of Megatron code has DDP dependency, so it has been necessary to use NLPDDPStrategy even for single-GPU training (e.g. in a Jupyter notebook) A PTL 2.0 changes has prevented DDPStrategy to be used in a notebook. @@ -546,7 +546,7 @@ def _get_full_state_dict_context(module: torch.nn.Module, rank0_only: bool = Fal class NLPFSDPStrategy(FSDPStrategy): - """ FSDP plugin for Pytorch Lightning with the support for tensor-parallelism. + """FSDP plugin for Pytorch Lightning with the support for tensor-parallelism. Args: sharding_strategy: FSDP parameter sharding strategy. @@ -639,7 +639,11 @@ def _set_mixed_precision_recipe( reduce_dtype = utils_funcs.torch_dtype_from_precision(grad_reduce_dtype, None) if set_buffer_dtype is not None: buffer_dtype = utils_funcs.torch_dtype_from_precision(buffer_dtype, None) - return MixedPrecision(param_dtype=param_dtype, reduce_dtype=reduce_dtype, buffer_dtype=buffer_dtype,) + return MixedPrecision( + param_dtype=param_dtype, + reduce_dtype=reduce_dtype, + buffer_dtype=buffer_dtype, + ) def setup_environment(self) -> None: """ @@ -750,7 +754,9 @@ def _get_osd(opt_state): with FSDP.summon_full_params(self.model, writeback=True, rank0_only=False): # rekey the osd stored from non-FSDP model rekeyed_osd = FSDP.rekey_optim_state_dict( - temp_osd, OptimStateKeyType.PARAM_NAME, self.model, + temp_osd, + OptimStateKeyType.PARAM_NAME, + self.model, ) temp_osd = FSDP.shard_full_optim_state_dict(rekeyed_osd, self.model) except Exception as e: @@ -758,7 +764,9 @@ def _get_osd(opt_state): exit(1) # Shard optimizer state dict sharded_osd = FSDP.optim_state_dict_to_load( - optim_state_dict=temp_osd, model=self.model, optim=optimizer, + optim_state_dict=temp_osd, + model=self.model, + optim=optimizer, ) optimizer.load_state_dict(sharded_osd) @@ -767,9 +775,9 @@ def _get_osd(opt_state): def save_checkpoint( self, checkpoint: Dict[str, Any], filepath: Union[str, Path], storage_options: Optional[Any] = None ) -> None: - """ Store checkpoints - 1. In case of sharded checkpoint, all ranks store unique checkpoints. - 2. In case of non-sharded checkpoint, all data-parallel rank 0 store checkpoints. + """Store checkpoints + 1. In case of sharded checkpoint, all ranks store unique checkpoints. + 2. In case of non-sharded checkpoint, all data-parallel rank 0 store checkpoints. """ app_state = AppState() filepath = inject_model_parallel_rank(filepath, fsdp_sharded_ckpt=self.sharded_checkpoint) @@ -780,8 +788,7 @@ def save_checkpoint( self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options) def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: - """ Load checkpoints - """ + """Load checkpoints""" # 1. 
Load normal or FSDP-sharded checkpoints. fs = get_filesystem(checkpoint_path) @@ -798,8 +805,7 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: return checkpoint def remove_checkpoint(self, filepath: Union[str, Path]) -> None: - """ Remove checkpoints - """ + """Remove checkpoints""" # legacy checkpoint logic, does not use megatron core app_state = AppState() # PTL override to accomodate model parallel checkpoints @@ -814,9 +820,9 @@ def remove_checkpoint(self, filepath: Union[str, Path]) -> None: @property def restore_checkpoint_after_setup(self) -> bool: - """ When loading FSDP-sharded checkpoint, need to restore checkpoint after configuring - FSDP sharding to match FSDP-sharded format between the checkpoint and the current - model and optimizer. + """When loading FSDP-sharded checkpoint, need to restore checkpoint after configuring + FSDP sharding to match FSDP-sharded format between the checkpoint and the current + model and optimizer. """ return True @@ -915,7 +921,8 @@ def dummy(): else: # move weights to the tmpdir for tp_rank, pp_rank in itertools.product( - range(app_state.tensor_model_parallel_size), range(app_state.pipeline_model_parallel_size), + range(app_state.tensor_model_parallel_size), + range(app_state.pipeline_model_parallel_size), ): os.makedirs(os.path.join(tmpdir, f'tp_rank_{tp_rank:02d}_pp_rank_{pp_rank:03d}')) mp_model_weights = os.path.join( @@ -1000,6 +1007,7 @@ def modify_state_dict(self, conf, state_dict): loaded_keys = state_dict.keys() if 'model.model.diffusion_model.input_blocks.1.0.in_layers.2.weight' in loaded_keys: new_state_dict = {} + # GroupNormOpt fuses activation function to one layer, thus the indexing of weights are shifted for following def should_process(key): base_str = "model.model.diffusion_model." @@ -1110,7 +1118,13 @@ def restore_from( # Get path where the command is executed - the artifacts will be "retrieved" there # (original .nemo behavior) loaded_params = super().load_config_and_state_dict( - calling_cls, restore_path, override_config_path, map_location, strict, return_config, trainer, + calling_cls, + restore_path, + override_config_path, + map_location, + strict, + return_config, + trainer, ) if not isinstance(loaded_params, tuple) or return_config is True: return loaded_params @@ -1165,12 +1179,12 @@ def dummy(): class PipelineMixedPrecisionPlugin(MixedPrecisionPlugin): - """ Overrides PTL autocasting to not wrap training/val/test_step. - We do this because we have the megatron-core fwd/bwd functions in training_step. - This means .backward is being called in training_step so we do not want the whole - step wrapped in autocast. + """Overrides PTL autocasting to not wrap training/val/test_step. + We do this because we have the megatron-core fwd/bwd functions in training_step. + This means .backward is being called in training_step so we do not want the whole + step wrapped in autocast. - We instead wrap the fwd_output_and_loss_func that is passed to the megatron-core fwd/bwd functions. + We instead wrap the fwd_output_and_loss_func that is passed to the megatron-core fwd/bwd functions. """ def __init__( @@ -1206,12 +1220,12 @@ def forward_context(self) -> Generator[None, None, None]: class FSDPMixedPrecisionPlugin(FSDPPrecision): - """ Overrides PTL autocasting to not wrap training/val/test_step. - We do this because we have the megatron-core fwd/bwd functions in training_step. - This means .backward is being called in training_step so we do not want the whole - step wrapped in autocast. 
+ """Overrides PTL autocasting to not wrap training/val/test_step. + We do this because we have the megatron-core fwd/bwd functions in training_step. + This means .backward is being called in training_step so we do not want the whole + step wrapped in autocast. - We instead wrap the fwd_output_and_loss_func that is passed to the megatron-core fwd/bwd functions. + We instead wrap the fwd_output_and_loss_func that is passed to the megatron-core fwd/bwd functions. """ def __init__( @@ -1246,7 +1260,7 @@ class GradScaler(torch.cuda.amp.GradScaler): def __init__( self, - init_scale=2.0 ** 16, + init_scale=2.0**16, growth_factor=2.0, backoff_factor=0.5, growth_interval=2000, @@ -1500,7 +1514,7 @@ def optimizer_step( @contextmanager def forward_context(self) -> Generator[None, None, None]: - """ No explicit precision casting. Inputs are supposed to be manually casted """ + """No explicit precision casting. Inputs are supposed to be manually casted""" try: yield finally: @@ -1508,7 +1522,7 @@ def forward_context(self) -> Generator[None, None, None]: class GlobalBatchDataFetcher(_DataFetcher): - """ Overrides PTL DataFetcher. Used to fetch global batches.""" + """Overrides PTL DataFetcher. Used to fetch global batches.""" def __init__(self, prefetch_batches: int = 0, store_on_device: bool = False) -> None: diff --git a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py index eb16889c8def..6816001e6a9b 100644 --- a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py +++ b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py @@ -69,7 +69,9 @@ def get_args(): ) parser.add_argument("--path_to_save", type=str, default=None, required=True, help="Path to output ckpt files.") parser.add_argument( - "--save_to_nemo", action="store_true", help="If passed, output will be written as .nemo file.", + "--save_to_nemo", + action="store_true", + help="If passed, output will be written as .nemo file.", ) parser.add_argument("--gpus_per_node", type=int, required=True, default=None) parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None) @@ -93,7 +95,11 @@ def get_args(): ) parser.add_argument( - "--model_type", type=str, required=True, default="gpt", choices=["gpt", "sft", "bert"], + "--model_type", + type=str, + required=True, + default="gpt", + choices=["gpt", "sft", "bert"], ) args = parser.parse_args() @@ -114,7 +120,7 @@ def convert(local_rank, rank, world_size, args): 'precision': args.precision, }, 'model': { - 'native_amp_init_scale': 2 ** 32, + 'native_amp_init_scale': 2**32, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'gradient_as_bucket_view': True, From f4ba733bf8b923eceead9771080b94d34359fda5 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 10 May 2024 05:29:45 -0700 Subject: [PATCH 7/8] add load on device value Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_bert_config.yaml | 3 ++- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 4 ++-- .../tuning/conf/megatron_gpt_finetuning_config.yaml | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index 23a003a1fc62..3329d264cbec 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -95,8 +95,9 @@ model: onnx_safe: False # Use work-arounds for known problems 
with Torch ONNX exporter. gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - # Distributed checkpoint format + # Distributed checkpoint setup dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. + dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU. ## Activation Checkpointing # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 6ae516f7b46d..bd65f5ca9912 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -150,9 +150,9 @@ model: fsdp_grad_reduce_dtype: 32 # Gradient reduction data type. fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint. - # Distributed checkpoint format + # Distributed checkpoint setup dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. - dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU + dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU. ## Activation Checkpointing # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml index b6b933bf8f31..b067a5fdf21a 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml @@ -83,8 +83,9 @@ model: fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint. fsdp_use_orig_params: False # Set to True to use FSDP for specific peft scheme. - # Distributed checkpoint format + # Distributed checkpoint setup dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. + dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU. 
peft: peft_scheme: "adapter" # can be either adapter,ia3, or ptuning From 614ac783303335f482c661ac679ba0cc23ff99e9 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 10 May 2024 05:31:17 -0700 Subject: [PATCH 8/8] remove torch dist test Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 99 --------------------------------- 1 file changed, 99 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index e9bd0074070f..252843bcc0ce 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4290,104 +4290,6 @@ jobs: - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" - L2_Megatron_GPT_Pretraining_and_Resume_Training_Torch_Dist_TP2_PP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.dist_ckpt_format=torch_dist \ - model.tensor_model_parallel_size=2 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.dist_ckpt_format=torch_dist \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - 
model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] runs-on: self-hosted-azure @@ -6687,7 +6589,6 @@ jobs: - L2_Megatron_RETRO_Pretraining_and_Resume_Training - L2_BioMegatron_Bert_NER_Task - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_GPT_Pretraining_and_Resume_Training_Torch_Dist_TP2_PP2 - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2
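Net effect of this series: the default distributed-checkpoint backend moves from `zarr` to `torch_dist`, both in the example configs and in the `DistributedCheckpointIO(... .get('dist_ckpt_format', 'torch_dist'))` fallbacks, and the configs gain a `dist_ckpt_load_on_device` knob. As a hedged illustration only (key names are taken from the patched example configs above; the values shown are what a user would set explicitly, not anything this series mandates), a config fragment for someone who still needs to read existing `zarr` checkpoints might look like:

```yaml
# Sketch only: key names come from the patched megatron_gpt_config.yaml / megatron_bert_config.yaml.
model:
  dist_ckpt_format: zarr           # override the new torch_dist default, e.g. to keep loading older zarr checkpoints
  dist_ckpt_load_on_device: True   # patched default: load checkpoint weights directly on GPU rather than staging on CPU
```

The same settings can be passed as Hydra overrides on the command line (e.g. `model.dist_ckpt_format=torch_dist`), which is exactly how the CI job added in PATCH 1/8 and removed again in PATCH 8/8 pinned the format.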