Mcore dist opt ckpt fix #9156

Merged (12 commits) on May 22, 2024
@@ -154,6 +154,7 @@ model:
# Distributed checkpoint setup
dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format.
dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU
+dist_ckpt_parallel_save: False # if true, each worker will write its own part of the dist checkpoint

## Activation Checkpointing
# NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed.
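For reference, a minimal sketch (not part of the PR) of how a config key like the new 'dist_ckpt_parallel_save' is read from the model config via OmegaConf, mirroring the self.cfg.model.get(...) call in the trainer builder below; the config contents here are illustrative:

from omegaconf import OmegaConf

# Illustrative config; only the 'dist_ckpt_parallel_save' key comes from the diff.
cfg = OmegaConf.create({'model': {'dist_ckpt_parallel_save': True}})

# Missing keys fall back to False, matching the default added in this PR.
parallel_save = cfg.model.get('dist_ckpt_parallel_save', False)
print(parallel_save)  # True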
nemo/collections/nlp/parts/megatron_trainer_builder.py (1 addition, 0 deletions)
@@ -90,6 +90,7 @@ def _training_strategy(self) -> Union[NLPDDPStrategy, NLPFSDPStrategy]:
find_unused_parameters=False,
nccl_communicator_config_path=self.cfg.model.get('nccl_communicator_config_path', None),
sharp=self.cfg.model.get('sharp', False),
+dist_ckpt_parallel_save=self.cfg.model.get('dist_ckpt_parallel_save', False),
)

def _grad_scaler(self) -> GradScaler:
nemo/collections/nlp/parts/nlp_overrides.py (8 additions, 2 deletions)
@@ -78,6 +78,7 @@
from apex.transformer.pipeline_parallel.utils import get_num_microbatches

from nemo.core.optim.distributed_adam import MegatronDistributedFusedAdam
+from nemo.core.optim.mcore_optim import McoreDistributedOptimizer

HAVE_APEX = True

@@ -182,6 +183,7 @@ def __init__(
no_ddp_communication_hook: bool = False,
nccl_communicator_config_path: Optional[str] = None,
sharp: bool = False,
+dist_ckpt_parallel_save: bool = False,
**kwargs: Union[Any, Dict[str, Any]],
) -> None:
if not HAVE_APEX:
@@ -198,6 +200,7 @@
self.no_ddp_communication_hook = no_ddp_communication_hook
self.nccl_communicator_config_path = nccl_communicator_config_path
self.sharp = sharp
+self._dist_ckpt_parallel_save = dist_ckpt_parallel_save

def setup(self, trainer: "pl.Trainer") -> None:
"""
@@ -293,8 +296,11 @@ def optimizer_sharded_state_dict(self, unsharded_optim_state=None):
model_sharded_state_dict = {
key: value for key, value in model_sharded_state_dict.items() if not key.endswith('_extra_state')
}

-if isinstance(optimizer, MegatronDistributedFusedAdam):
+if isinstance(optimizer, McoreDistributedOptimizer):
+    return optimizer.sharded_state_dict(
+        model_sharded_state_dict, unsharded_optim_state, dist_ckpt_parallel_save=self._dist_ckpt_parallel_save
+    )
+elif isinstance(optimizer, MegatronDistributedFusedAdam):
return optimizer.sharded_state_dict(model_sharded_state_dict, unsharded_optim_state)
elif not isinstance(optimizer, MainParamsOptimizerWrapper):
# Regular optimizer, e.g. Adam or FusedAdam
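To make the new dispatch order concrete, here is a runnable toy sketch with stub classes standing in for the real optimizers (class and parameter names follow the diff; the returned dicts are invented for illustration):

# Stubs standing in for nemo.core.optim.mcore_optim.McoreDistributedOptimizer
# and nemo.core.optim.distributed_adam.MegatronDistributedFusedAdam.
class McoreDistributedOptimizer:
    def sharded_state_dict(self, model_sd, optim_sd=None, dist_ckpt_parallel_save=False):
        # Mirrors the flag-to-sharding-type mapping added in mcore_optim.py below.
        return {'sharding_type': 'fully_sharded_bucket_space' if dist_ckpt_parallel_save
                else 'dp_zero_gather_scatter'}

class MegatronDistributedFusedAdam:
    def sharded_state_dict(self, model_sd, optim_sd=None):
        return {'sharding_type': 'apex_path'}  # placeholder value

def optimizer_sharded_state_dict(optimizer, model_sd, optim_sd, parallel_save):
    # The Mcore optimizer is checked first and is the only branch that
    # receives the new dist_ckpt_parallel_save flag.
    if isinstance(optimizer, McoreDistributedOptimizer):
        return optimizer.sharded_state_dict(model_sd, optim_sd, dist_ckpt_parallel_save=parallel_save)
    elif isinstance(optimizer, MegatronDistributedFusedAdam):
        return optimizer.sharded_state_dict(model_sd, optim_sd)
    raise TypeError(f'unsupported optimizer: {type(optimizer).__name__}')

print(optimizer_sharded_state_dict(McoreDistributedOptimizer(), {}, None, True))
# -> {'sharding_type': 'fully_sharded_bucket_space'}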
nemo/core/optim/mcore_optim.py (7 additions, 2 deletions)
@@ -55,8 +55,13 @@ def state_dict(self):
def load_state_dict(self, state_dict):
self.mcore_optimizer.load_state_dict(state_dict)

-def sharded_state_dict(self, model_sharded_state_dict, is_loading: bool = False, **kwargs):
-    return self.mcore_optimizer.sharded_state_dict(model_sharded_state_dict, is_loading, **kwargs)
+def sharded_state_dict(
+    self, model_sharded_state_dict, optimizer_state_dict=None, is_loading=False, dist_ckpt_parallel_save=False
+):
+    sharding_type = 'fully_sharded_bucket_space' if dist_ckpt_parallel_save else 'dp_zero_gather_scatter'
+    return self.mcore_optimizer.sharded_state_dict(
+        model_sharded_state_dict, is_loading=is_loading, sharding_type=sharding_type
+    )

def step(self, closure):
"""Clip gradients (if needed) and step the base optimizer.
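The behavioral core of the PR is the one-line mapping from the boolean flag to Megatron-Core's sharding type. A self-contained restatement (names taken from the diff; the comments summarize the intended semantics per the config description above):

def select_sharding_type(dist_ckpt_parallel_save: bool) -> str:
    # True: each data-parallel worker writes its own part of the optimizer
    # state ('fully_sharded_bucket_space'), per the config comment above.
    # False: the default gather/scatter layout ('dp_zero_gather_scatter').
    return 'fully_sharded_bucket_space' if dist_ckpt_parallel_save else 'dp_zero_gather_scatter'

assert select_sharding_type(True) == 'fully_sharded_bucket_space'
assert select_sharding_type(False) == 'dp_zero_gather_scatter'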