From 327c6bb8be478dd621da7188b86c3774f43f013a Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 9 May 2024 08:28:10 -0700 Subject: [PATCH 1/8] add torch dist test to ci/cd Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 99 +++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index ef3206e48f69..1e28e83fa23d 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4290,6 +4290,104 @@ jobs: - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" + L2_Megatron_GPT_Pretraining_and_Resume_Training_Torch_Dist_TP2_PP2: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.dist_ckpt_format=torch_dist \ + model.tensor_model_parallel_size=2 \ + model.pipeline_model_parallel_size=1 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.dist_ckpt_format=torch_dist \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + 
model.normalization=rmsnorm \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] runs-on: self-hosted-azure @@ -6588,6 +6686,7 @@ jobs: - L2_Megatron_RETRO_Pretraining_and_Resume_Training - L2_BioMegatron_Bert_NER_Task - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2 + - L2_Megatron_GPT_Pretraining_and_Resume_Training_Torch_Dist_TP2_PP2 - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2 From c48a7f0e6ecc78388d6135b35fe3ff4dd494cd53 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 9 May 2024 14:38:06 -0700 Subject: [PATCH 2/8] set ckpt strategy to torch_dist Signed-off-by: dimapihtar --- .../nlp/language_modeling/conf/megatron_bert_config.yaml | 5 ++++- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 2 +- .../tuning/conf/megatron_gpt_finetuning_config.yaml | 3 +++ scripts/checkpoint_converters/convert_zarr_to_torch_dist.py | 2 +- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index bc66ae717ebb..23a003a1fc62 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -95,6 +95,9 @@ model: onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + # Distributed checkpoint format + dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. + ## Activation Checkpointing # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). 
@@ -158,4 +161,4 @@ model: name: CosineAnnealing warmup_steps: 500 constant_steps: 50000 - min_lr: 2e-5 \ No newline at end of file + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 57c82726ae11..8eb50775522c 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -151,7 +151,7 @@ model: fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint. # Distributed checkpoint format - dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. + dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. ## Activation Checkpointing # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml index 6517b62010b4..b6b933bf8f31 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml @@ -82,6 +82,9 @@ model: fsdp_grad_reduce_dtype: 'fp32' # Gradient reduction data type. fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint. fsdp_use_orig_params: False # Set to True to use FSDP for specific peft scheme. + + # Distributed checkpoint format + dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. peft: peft_scheme: "adapter" # can be either adapter,ia3, or ptuning diff --git a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py index 29b56aa706fa..ebd42f8f99d7 100644 --- a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py +++ b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py @@ -167,7 +167,7 @@ def convert(local_rank, rank, world_size, args): ) with open_dict(model.cfg): - model.cfg.torch_distributed_checkpoint = True + model.cfg.dist_ckpt_format='torch_dist' model._save_restore_connector = NLPSaveRestoreConnector() save_file_path = args.path_to_save From 7752b8d03325f5f51c54f4b492fae154c081ce2c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 May 2024 21:39:34 +0000 Subject: [PATCH 3/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/checkpoint_converters/convert_zarr_to_torch_dist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py index ebd42f8f99d7..eb16889c8def 100644 --- a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py +++ b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py @@ -167,7 +167,7 @@ def convert(local_rank, rank, world_size, args): ) with open_dict(model.cfg): - model.cfg.dist_ckpt_format='torch_dist' + model.cfg.dist_ckpt_format = 'torch_dist' model._save_restore_connector = NLPSaveRestoreConnector() save_file_path = args.path_to_save From ad0b6430cd1dfe5c09ee02cce8650e2e629f8bc8 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 9 May 2024 14:57:47 -0700 Subject: [PATCH 4/8] set torch_dist ckpt strategy as 
default strategy Signed-off-by: dimapihtar --- nemo/collections/nlp/parts/megatron_trainer_builder.py | 2 +- nemo/collections/nlp/parts/nlp_overrides.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo/collections/nlp/parts/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py index 367cf46c6fd0..957e89a088ac 100644 --- a/nemo/collections/nlp/parts/megatron_trainer_builder.py +++ b/nemo/collections/nlp/parts/megatron_trainer_builder.py @@ -138,7 +138,7 @@ def _plugins(self) -> list: self.cfg.model.get('mcore_gpt', False) or self.cfg.model.get('mcore_bert', False) ) if use_dist_ckpt: - plugins.append(DistributedCheckpointIO(self.cfg.model.get('dist_ckpt_format', 'zarr'))) + plugins.append(DistributedCheckpointIO(self.cfg.model.get('dist_ckpt_format', 'torch_dist'))) return plugins diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 97661c752c52..86a3fbe005be 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -474,7 +474,7 @@ def use_distributed_checkpointing(self): logging.warning( 'Distributed checkpoints requires DistributedCheckpointIO plugin to be used. Setting up a default now.' ) - self.checkpoint_io = DistributedCheckpointIO(self.lightning_module.cfg.get('dist_ckpt_format', 'zarr')) + self.checkpoint_io = DistributedCheckpointIO(self.lightning_module.cfg.get('dist_ckpt_format', 'torch_dist')) if not has_sharded_state_dict and has_dist_ckpt_io: logging.warning( 'DistributedCheckpointIO configured but should not be used. Reverting back to TorchCheckpointIO' @@ -866,7 +866,7 @@ def dummy(): if model.trainer.strategy.launcher is not None: model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) model.trainer.strategy.setup_environment() - checkpoint_io = DistributedCheckpointIO(model.cfg.get('dist_ckpt_format', 'zarr')) + checkpoint_io = DistributedCheckpointIO(model.cfg.get('dist_ckpt_format', 'torch_dist')) checkpoint_io.save_checkpoint(sharded_state_dict, dist_ckpt_dir) else: @@ -1150,7 +1150,7 @@ def dummy(): tmp_model_weights_ckpt = os.path.join(tmpdir, self.model_weights_ckpt) tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0] assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.' - checkpoint_io = DistributedCheckpointIO(conf.get('dist_ckpt_format', 'zarr')) + checkpoint_io = DistributedCheckpointIO(conf.get('dist_ckpt_format', 'torch_dist')) checkpoint = checkpoint_io.load_checkpoint(tmp_model_weights_dir, sharded_state_dict=checkpoint) instance.on_load_checkpoint(checkpoint) if hasattr(instance, 'setup_transformer_engine_tp_groups'): From 5eb382a6c10a1356384c0278bda76f7aaa784b11 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 May 2024 22:00:02 +0000 Subject: [PATCH 5/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/collections/nlp/parts/nlp_overrides.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 86a3fbe005be..73308c8bbeb7 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -474,7 +474,9 @@ def use_distributed_checkpointing(self): logging.warning( 'Distributed checkpoints requires DistributedCheckpointIO plugin to be used. 
Setting up a default now.' ) - self.checkpoint_io = DistributedCheckpointIO(self.lightning_module.cfg.get('dist_ckpt_format', 'torch_dist')) + self.checkpoint_io = DistributedCheckpointIO( + self.lightning_module.cfg.get('dist_ckpt_format', 'torch_dist') + ) if not has_sharded_state_dict and has_dist_ckpt_io: logging.warning( 'DistributedCheckpointIO configured but should not be used. Reverting back to TorchCheckpointIO' From 0ad0b3c99532541cbae1258b48820994d2d8b3a2 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 10 May 2024 12:11:23 +0000 Subject: [PATCH 6/8] Apply isort and black reformatting Signed-off-by: dimapihtar --- nemo/collections/nlp/parts/nlp_overrides.py | 98 +++++++++++-------- .../convert_zarr_to_torch_dist.py | 12 ++- 2 files changed, 65 insertions(+), 45 deletions(-) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index c7394dbbc314..bf27d8ef8038 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -120,7 +120,7 @@ def init_model_parallel( sharp: bool, nccl_communicator_config_path: str = None, distributed_timeout_minutes: int = 30 ) -> None: - """ Initializes Megatron-LM model parallel if using model parallelism. + """Initializes Megatron-LM model parallel if using model parallelism. Args: sharp: Apply SHARP to NCCL data-parallel communication. @@ -164,7 +164,7 @@ def init_model_parallel( class NLPDDPStrategy(DDPStrategy): - """ DDP plugin for Pytorch Lightning. Needed to customize DDP for model parallel models. + """DDP plugin for Pytorch Lightning. Needed to customize DDP for model parallel models. Args: no_ddp_communication_hook: Disable DDP communication hook when using AMP-O2 @@ -231,8 +231,8 @@ def setup_distributed(self, global_rank: int = None, world_size: int = None) -> ) def configure_ddp(self): - """ Override LightningModule ddp if using model parallel. - Sets find_unused_parameters to False to use activation-checkpoint-recomputation. + """Override LightningModule ddp if using model parallel. + Sets find_unused_parameters to False to use activation-checkpoint-recomputation. """ if (hasattr(self.model, 'megatron_amp_O2') and self.model.megatron_amp_O2) or ( @@ -406,7 +406,7 @@ def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = Tr self.lightning_module.load_state_dict(checkpoint["state_dict"], strict=strict) def _fix_tensors_device(self, ckpt: Dict) -> Dict: - """ Ensure checkpoint tensors are on the correct device.""" + """Ensure checkpoint tensors are on the correct device.""" assert torch.cuda.is_initialized(), (torch.cuda.is_available(), torch.cuda.is_initialized()) cur_dev = torch.device("cuda", index=torch.cuda.current_device()) @@ -418,10 +418,10 @@ def _fix_device(t): return dict_list_map_outplace(_fix_device, ckpt) def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: - """ PTL method which we override to integrate distributed checkpoints for model parallel models. - In order to load distributed checkpoints we need to provide the sharded_state_dict to - the distributed load function. We get the sharded_state_dict from self.lightning_module - which makes it convenient to have the loading logic happen at the strategy level. + """PTL method which we override to integrate distributed checkpoints for model parallel models. + In order to load distributed checkpoints we need to provide the sharded_state_dict to + the distributed load function. 
We get the sharded_state_dict from self.lightning_module + which makes it convenient to have the loading logic happen at the strategy level. """ fs = get_filesystem(checkpoint_path) @@ -500,15 +500,15 @@ def distributed_sampler_kwargs(self): @property def restore_checkpoint_after_setup(self) -> bool: - """ This needs to be True for distributed checkpointing because - we require the model to have configured the optimizer before - deserializing the checkpoint. + """This needs to be True for distributed checkpointing because + we require the model to have configured the optimizer before + deserializing the checkpoint. """ return True class NLPDDPStrategyNotebook(NLPDDPStrategy): - """ Version of NLPDDPStrategy to be used in a Jupyter Notebook + """Version of NLPDDPStrategy to be used in a Jupyter Notebook A large portion of Megatron code has DDP dependency, so it has been necessary to use NLPDDPStrategy even for single-GPU training (e.g. in a Jupyter notebook) A PTL 2.0 changes has prevented DDPStrategy to be used in a notebook. @@ -546,7 +546,7 @@ def _get_full_state_dict_context(module: torch.nn.Module, rank0_only: bool = Fal class NLPFSDPStrategy(FSDPStrategy): - """ FSDP plugin for Pytorch Lightning with the support for tensor-parallelism. + """FSDP plugin for Pytorch Lightning with the support for tensor-parallelism. Args: sharding_strategy: FSDP parameter sharding strategy. @@ -639,7 +639,11 @@ def _set_mixed_precision_recipe( reduce_dtype = utils_funcs.torch_dtype_from_precision(grad_reduce_dtype, None) if set_buffer_dtype is not None: buffer_dtype = utils_funcs.torch_dtype_from_precision(buffer_dtype, None) - return MixedPrecision(param_dtype=param_dtype, reduce_dtype=reduce_dtype, buffer_dtype=buffer_dtype,) + return MixedPrecision( + param_dtype=param_dtype, + reduce_dtype=reduce_dtype, + buffer_dtype=buffer_dtype, + ) def setup_environment(self) -> None: """ @@ -750,7 +754,9 @@ def _get_osd(opt_state): with FSDP.summon_full_params(self.model, writeback=True, rank0_only=False): # rekey the osd stored from non-FSDP model rekeyed_osd = FSDP.rekey_optim_state_dict( - temp_osd, OptimStateKeyType.PARAM_NAME, self.model, + temp_osd, + OptimStateKeyType.PARAM_NAME, + self.model, ) temp_osd = FSDP.shard_full_optim_state_dict(rekeyed_osd, self.model) except Exception as e: @@ -758,7 +764,9 @@ def _get_osd(opt_state): exit(1) # Shard optimizer state dict sharded_osd = FSDP.optim_state_dict_to_load( - optim_state_dict=temp_osd, model=self.model, optim=optimizer, + optim_state_dict=temp_osd, + model=self.model, + optim=optimizer, ) optimizer.load_state_dict(sharded_osd) @@ -767,9 +775,9 @@ def _get_osd(opt_state): def save_checkpoint( self, checkpoint: Dict[str, Any], filepath: Union[str, Path], storage_options: Optional[Any] = None ) -> None: - """ Store checkpoints - 1. In case of sharded checkpoint, all ranks store unique checkpoints. - 2. In case of non-sharded checkpoint, all data-parallel rank 0 store checkpoints. + """Store checkpoints + 1. In case of sharded checkpoint, all ranks store unique checkpoints. + 2. In case of non-sharded checkpoint, all data-parallel rank 0 store checkpoints. """ app_state = AppState() filepath = inject_model_parallel_rank(filepath, fsdp_sharded_ckpt=self.sharded_checkpoint) @@ -780,8 +788,7 @@ def save_checkpoint( self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options) def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: - """ Load checkpoints - """ + """Load checkpoints""" # 1. 
Load normal or FSDP-sharded checkpoints. fs = get_filesystem(checkpoint_path) @@ -798,8 +805,7 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: return checkpoint def remove_checkpoint(self, filepath: Union[str, Path]) -> None: - """ Remove checkpoints - """ + """Remove checkpoints""" # legacy checkpoint logic, does not use megatron core app_state = AppState() # PTL override to accomodate model parallel checkpoints @@ -814,9 +820,9 @@ def remove_checkpoint(self, filepath: Union[str, Path]) -> None: @property def restore_checkpoint_after_setup(self) -> bool: - """ When loading FSDP-sharded checkpoint, need to restore checkpoint after configuring - FSDP sharding to match FSDP-sharded format between the checkpoint and the current - model and optimizer. + """When loading FSDP-sharded checkpoint, need to restore checkpoint after configuring + FSDP sharding to match FSDP-sharded format between the checkpoint and the current + model and optimizer. """ return True @@ -915,7 +921,8 @@ def dummy(): else: # move weights to the tmpdir for tp_rank, pp_rank in itertools.product( - range(app_state.tensor_model_parallel_size), range(app_state.pipeline_model_parallel_size), + range(app_state.tensor_model_parallel_size), + range(app_state.pipeline_model_parallel_size), ): os.makedirs(os.path.join(tmpdir, f'tp_rank_{tp_rank:02d}_pp_rank_{pp_rank:03d}')) mp_model_weights = os.path.join( @@ -1000,6 +1007,7 @@ def modify_state_dict(self, conf, state_dict): loaded_keys = state_dict.keys() if 'model.model.diffusion_model.input_blocks.1.0.in_layers.2.weight' in loaded_keys: new_state_dict = {} + # GroupNormOpt fuses activation function to one layer, thus the indexing of weights are shifted for following def should_process(key): base_str = "model.model.diffusion_model." @@ -1110,7 +1118,13 @@ def restore_from( # Get path where the command is executed - the artifacts will be "retrieved" there # (original .nemo behavior) loaded_params = super().load_config_and_state_dict( - calling_cls, restore_path, override_config_path, map_location, strict, return_config, trainer, + calling_cls, + restore_path, + override_config_path, + map_location, + strict, + return_config, + trainer, ) if not isinstance(loaded_params, tuple) or return_config is True: return loaded_params @@ -1165,12 +1179,12 @@ def dummy(): class PipelineMixedPrecisionPlugin(MixedPrecisionPlugin): - """ Overrides PTL autocasting to not wrap training/val/test_step. - We do this because we have the megatron-core fwd/bwd functions in training_step. - This means .backward is being called in training_step so we do not want the whole - step wrapped in autocast. + """Overrides PTL autocasting to not wrap training/val/test_step. + We do this because we have the megatron-core fwd/bwd functions in training_step. + This means .backward is being called in training_step so we do not want the whole + step wrapped in autocast. - We instead wrap the fwd_output_and_loss_func that is passed to the megatron-core fwd/bwd functions. + We instead wrap the fwd_output_and_loss_func that is passed to the megatron-core fwd/bwd functions. """ def __init__( @@ -1206,12 +1220,12 @@ def forward_context(self) -> Generator[None, None, None]: class FSDPMixedPrecisionPlugin(FSDPPrecision): - """ Overrides PTL autocasting to not wrap training/val/test_step. - We do this because we have the megatron-core fwd/bwd functions in training_step. - This means .backward is being called in training_step so we do not want the whole - step wrapped in autocast. 
+ """Overrides PTL autocasting to not wrap training/val/test_step. + We do this because we have the megatron-core fwd/bwd functions in training_step. + This means .backward is being called in training_step so we do not want the whole + step wrapped in autocast. - We instead wrap the fwd_output_and_loss_func that is passed to the megatron-core fwd/bwd functions. + We instead wrap the fwd_output_and_loss_func that is passed to the megatron-core fwd/bwd functions. """ def __init__( @@ -1246,7 +1260,7 @@ class GradScaler(torch.cuda.amp.GradScaler): def __init__( self, - init_scale=2.0 ** 16, + init_scale=2.0**16, growth_factor=2.0, backoff_factor=0.5, growth_interval=2000, @@ -1500,7 +1514,7 @@ def optimizer_step( @contextmanager def forward_context(self) -> Generator[None, None, None]: - """ No explicit precision casting. Inputs are supposed to be manually casted """ + """No explicit precision casting. Inputs are supposed to be manually casted""" try: yield finally: @@ -1508,7 +1522,7 @@ def forward_context(self) -> Generator[None, None, None]: class GlobalBatchDataFetcher(_DataFetcher): - """ Overrides PTL DataFetcher. Used to fetch global batches.""" + """Overrides PTL DataFetcher. Used to fetch global batches.""" def __init__(self, prefetch_batches: int = 0, store_on_device: bool = False) -> None: diff --git a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py index eb16889c8def..6816001e6a9b 100644 --- a/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py +++ b/scripts/checkpoint_converters/convert_zarr_to_torch_dist.py @@ -69,7 +69,9 @@ def get_args(): ) parser.add_argument("--path_to_save", type=str, default=None, required=True, help="Path to output ckpt files.") parser.add_argument( - "--save_to_nemo", action="store_true", help="If passed, output will be written as .nemo file.", + "--save_to_nemo", + action="store_true", + help="If passed, output will be written as .nemo file.", ) parser.add_argument("--gpus_per_node", type=int, required=True, default=None) parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None) @@ -93,7 +95,11 @@ def get_args(): ) parser.add_argument( - "--model_type", type=str, required=True, default="gpt", choices=["gpt", "sft", "bert"], + "--model_type", + type=str, + required=True, + default="gpt", + choices=["gpt", "sft", "bert"], ) args = parser.parse_args() @@ -114,7 +120,7 @@ def convert(local_rank, rank, world_size, args): 'precision': args.precision, }, 'model': { - 'native_amp_init_scale': 2 ** 32, + 'native_amp_init_scale': 2**32, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'gradient_as_bucket_view': True, From f4ba733bf8b923eceead9771080b94d34359fda5 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 10 May 2024 05:29:45 -0700 Subject: [PATCH 7/8] add load on device value Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_bert_config.yaml | 3 ++- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 4 ++-- .../tuning/conf/megatron_gpt_finetuning_config.yaml | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index 23a003a1fc62..3329d264cbec 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -95,8 +95,9 @@ model: onnx_safe: False # Use work-arounds for known problems 
with Torch ONNX exporter. gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - # Distributed checkpoint format + # Distributed checkpoint setup dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. + dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU. ## Activation Checkpointing # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 6ae516f7b46d..bd65f5ca9912 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -150,9 +150,9 @@ model: fsdp_grad_reduce_dtype: 32 # Gradient reduction data type. fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint. - # Distributed checkpoint format + # Distributed checkpoint setup dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. - dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU + dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU. ## Activation Checkpointing # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml index b6b933bf8f31..b067a5fdf21a 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml @@ -83,8 +83,9 @@ model: fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint. fsdp_use_orig_params: False # Set to True to use FSDP for specific peft scheme. - # Distributed checkpoint format + # Distributed checkpoint setup dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. + dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU. 
peft: peft_scheme: "adapter" # can be either adapter,ia3, or ptuning From 614ac783303335f482c661ac679ba0cc23ff99e9 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 10 May 2024 05:31:17 -0700 Subject: [PATCH 8/8] remove torch dist test Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 99 --------------------------------- 1 file changed, 99 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index e9bd0074070f..252843bcc0ce 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4290,104 +4290,6 @@ jobs: - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" - L2_Megatron_GPT_Pretraining_and_Resume_Training_Torch_Dist_TP2_PP2: - needs: [cicd-test-container-setup] - runs-on: self-hosted-azure - timeout-minutes: 10 - container: - image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: - # --user 0:128 - --device=/dev/nvidia0 - --gpus all - --shm-size=8g - --env TRANSFORMERS_OFFLINE=0 - --env HYDRA_FULL_ERROR=1 - --volume /mnt/datadrive/TestData:/home/TestData - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - run: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.dist_ckpt_format=torch_dist \ - model.tensor_model_parallel_size=2 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.dist_ckpt_format=torch_dist \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - 
model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - if: "failure()" - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] runs-on: self-hosted-azure @@ -6687,7 +6589,6 @@ jobs: - L2_Megatron_RETRO_Pretraining_and_Resume_Training - L2_BioMegatron_Bert_NER_Task - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_GPT_Pretraining_and_Resume_Training_Torch_Dist_TP2_PP2 - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2
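Net effect of this series: the default distributed-checkpoint backend moves from `zarr` to `torch_dist`, both in the example configs and in the `DistributedCheckpointIO(... .get('dist_ckpt_format', 'torch_dist'))` fallbacks, and the configs gain a `dist_ckpt_load_on_device` knob. As a hedged illustration only (key names are taken from the patched example configs above; the values shown are what a user would set explicitly, not anything this series mandates), a config fragment for someone who still needs to read existing `zarr` checkpoints might look like:

```yaml
# Sketch only: key names come from the patched megatron_gpt_config.yaml / megatron_bert_config.yaml.
model:
  dist_ckpt_format: zarr           # override the new torch_dist default, e.g. to keep loading older zarr checkpoints
  dist_ckpt_load_on_device: True   # patched default: load checkpoint weights directly on GPU rather than staging on CPU
```

The same settings can be passed as Hydra overrides on the command line (e.g. `model.dist_ckpt_format=torch_dist`), which is exactly how the CI job added in PATCH 1/8 and removed again in PATCH 8/8 pinned the format.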