From 440647511b05819bf892d36226ebd7885ad715d5 Mon Sep 17 00:00:00 2001
From: ericharper
Date: Thu, 26 Aug 2021 09:42:35 -0600
Subject: [PATCH] add _del_model_without_trainer

Signed-off-by: ericharper
---
 .../conf/megatron_gpt_config.yaml |  4 ++--
 nemo/utils/exp_manager.py         | 18 ++++++++++++++++--
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
index 8476d0befa22..0d2b78898796 100644
--- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -33,7 +33,7 @@ model:
   # TODO: add validation generation config, size of context, max sequence length
   # needed to initialize megatron
   micro_batch_size: 8
-  tensor_model_parallel_size: 1
+  tensor_model_parallel_size: 2
   max_position_embeddings: 1024
   encoder_seq_length: 1024
   num_layers: 24
@@ -66,7 +66,7 @@ model:
   optim:
     name: adam
     lr: 0.0001
-    weight_decay: 0.01
+    #weight_decay: 0.01
     betas:
     - 0.9
     - 0.98
diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py
index a945a9b123ed..652f93bbe18f 100644
--- a/nemo/utils/exp_manager.py
+++ b/nemo/utils/exp_manager.py
@@ -702,7 +702,7 @@ def nemo_topk_check_previous_run(self):
         for _ in range(models_to_delete):
             model = best_k_models.pop(-1)
             self.best_k_models.pop(model)
-            self._del_model(model)
+            self._del_model_without_trainer(model)
             logging.debug(f"Removed checkpoint: {model}")
 
         self.kth_best_model_path = best_k_models[-1]
@@ -776,6 +776,20 @@ def _del_model(self, trainer: "pl.Trainer", filepath: str) -> None:
         else:
             return super()._del_model(trainer, filepath)
 
+    def _del_model_without_trainer(self, filepath: str) -> None:
+        app_state = AppState()
+        if app_state.model_parallel_size is not None:
+            # filepath needs to be updated to include mp_rank
+            dirname = os.path.dirname(filepath)
+            basename = os.path.basename(filepath)
+            filepath = f'{dirname}/mp_rank_{app_state.model_parallel_rank:02d}/{basename}'
+
+        # each model parallel rank needs to remove its model
+        if app_state.data_parallel_rank is None or app_state.data_parallel_rank == 0:
+            if self._fs.exists(filepath):
+                self._fs.rm(filepath)
+                logging.info(f"Removed checkpoint: {filepath}")
+
     def _save_last_checkpoint(self, trainer: 'pl.Trainer', monitor_candidates: Dict[str, _METRIC]) -> None:
         """ Overrides PTL method to account for model parallel checkpoints.
             Checks for data parallel rank 0 rather than global rank 0.
@@ -817,7 +831,7 @@ def _save_none_monitor_checkpoint(self, trainer: 'pl.Trainer', monitor_candidat
                 and self.best_model_path != filepath
                 and app_state.data_parallel_rank == 0
             ):
-                self._del_model(self.best_model_path)
+                self._del_model(trainer, self.best_model_path)
 
             self.best_model_path = filepath
         else:
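
Below is an illustrative sketch, separate from the patch itself, of the path handling that _del_model_without_trainer performs for model-parallel checkpoints: the mp_rank_XX subdirectory is injected into the checkpoint path before deletion, and only data-parallel rank 0 (or a run with no data parallelism) removes each shard. The function name delete_model_parallel_checkpoint, the plain os file operations, and the example path are assumptions made for this standalone sketch; the patched method resolves the ranks from AppState and deletes through the checkpoint callback's filesystem handle (self._fs).

import os

def delete_model_parallel_checkpoint(filepath, model_parallel_rank=None, data_parallel_rank=None):
    # Hypothetical standalone helper mirroring the logic of _del_model_without_trainer.
    if model_parallel_rank is not None:
        # Model-parallel checkpoints are stored per rank, e.g.
        # checkpoints/mp_rank_00/<name>.ckpt, so inject the mp_rank directory.
        dirname = os.path.dirname(filepath)
        basename = os.path.basename(filepath)
        filepath = os.path.join(dirname, f'mp_rank_{model_parallel_rank:02d}', basename)

    # Each model-parallel rank removes its own shard, but only from data-parallel
    # rank 0 (or when there is no data parallelism) to avoid duplicate deletions.
    if data_parallel_rank is None or data_parallel_rank == 0:
        if os.path.exists(filepath):
            os.remove(filepath)
            print(f"Removed checkpoint: {filepath}")

if __name__ == '__main__':
    # Example path is hypothetical; with tensor_model_parallel_size: 2, ranks 0 and 1
    # would each remove their own shard under mp_rank_00 and mp_rank_01 respectively.
    delete_model_parallel_checkpoint(
        'checkpoints/megatron_gpt--val_loss=1.23.ckpt', model_parallel_rank=0, data_parallel_rank=0
    )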