add torch dist ckpt test to ci/cd #9154

Closed
wants to merge 12 commits
@@ -95,6 +95,10 @@ model:
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

+# Distributed checkpoint setup
+dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format.
+dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU.

## Activation Checkpointing
# NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed.
# These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+).
@@ -158,4 +162,4 @@ model:
name: CosineAnnealing
warmup_steps: 500
constant_steps: 50000
-min_lr: 2e-5
+min_lr: 2e-5
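
The two dist_ckpt_* entries added above are ordinary keys under the model: section of the config. As an illustrative sketch only (not part of the PR), the same fragment can be built and inspected with OmegaConf, the config library NeMo configs are based on:

    from omegaconf import OmegaConf

    # Mirror the YAML fragment added in the diff above as a config object.
    fragment = OmegaConf.create(
        {
            "model": {
                # Distributed checkpoint setup
                "dist_ckpt_format": "torch_dist",      # 'torch_dist' or the older 'zarr'
                "dist_ckpt_load_on_device": True,      # load checkpoint weights directly onto GPU
            }
        }
    )

    assert fragment.model.dist_ckpt_format == "torch_dist"
    assert fragment.model.dist_ckpt_load_on_device is True
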
4 changes: 2 additions & 2 deletions examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -152,8 +152,8 @@ model:
fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint.

# Distributed checkpoint setup
-dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format.
-dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU
+dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format.
+dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU.

## Activation Checkpointing
# NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed.
@@ -82,6 +82,10 @@ model:
fsdp_grad_reduce_dtype: 'fp32' # Gradient reduction data type.
fsdp_sharded_checkpoint: False # Store and load FSDP shared checkpoint.
fsdp_use_orig_params: False # Set to True to use FSDP for specific peft scheme.

+# Distributed checkpoint setup
+dist_ckpt_format: 'torch_dist' # Set to 'torch_dist' to use PyTorch distributed checkpoint format.
+dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU.

peft:
peft_scheme: "adapter" # can be either adapter,ia3, or ptuning
2 changes: 1 addition & 1 deletion nemo/collections/nlp/parts/nlp_overrides.py
@@ -875,7 +875,7 @@ def dummy():
if model.trainer.strategy.launcher is not None:
model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer)
model.trainer.strategy.setup_environment()
-checkpoint_io = DistributedCheckpointIO(model.cfg.get('dist_ckpt_format', 'zarr'))
+checkpoint_io = DistributedCheckpointIO(model.cfg.get('dist_ckpt_format', 'torch_dist'))
checkpoint_io.save_checkpoint(sharded_state_dict, dist_ckpt_dir)

else:
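
The one-line change above only swaps the fallback used when a model config does not set dist_ckpt_format. A minimal, NeMo-free sketch of that selection logic follows; the helper name pick_dist_ckpt_format is hypothetical and only illustrates the cfg.get pattern, and DistributedCheckpointIO itself is deliberately not imported so the snippet runs standalone:

    from omegaconf import OmegaConf

    def pick_dist_ckpt_format(model_cfg, default="torch_dist"):
        # Mirrors model.cfg.get('dist_ckpt_format', 'torch_dist'): configs that do not
        # set the key now fall back to torch_dist instead of the previous zarr default.
        return model_cfg.get("dist_ckpt_format", default)

    legacy_cfg = OmegaConf.create({})                              # key absent, default applies
    explicit_cfg = OmegaConf.create({"dist_ckpt_format": "zarr"})  # explicit value still wins
    print(pick_dist_ckpt_format(legacy_cfg))    # torch_dist
    print(pick_dist_ckpt_format(explicit_cfg))  # zarr
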
14 changes: 10 additions & 4 deletions scripts/checkpoint_converters/convert_zarr_to_torch_dist.py
@@ -69,7 +69,9 @@ def get_args():
)
parser.add_argument("--path_to_save", type=str, default=None, required=True, help="Path to output ckpt files.")
parser.add_argument(
"--save_to_nemo", action="store_true", help="If passed, output will be written as .nemo file.",
"--save_to_nemo",
action="store_true",
help="If passed, output will be written as .nemo file.",
)
parser.add_argument("--gpus_per_node", type=int, required=True, default=None)
parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None)
@@ -93,7 +95,11 @@ def get_args():
)

parser.add_argument(
"--model_type", type=str, required=True, default="gpt", choices=["gpt", "sft", "bert"],
"--model_type",
type=str,
required=True,
default="gpt",
choices=["gpt", "sft", "bert"],
)

args = parser.parse_args()
@@ -114,7 +120,7 @@ def convert(local_rank, rank, world_size, args):
'precision': args.precision,
},
'model': {
-'native_amp_init_scale': 2 ** 32,
+'native_amp_init_scale': 2**32,
'native_amp_growth_interval': 1000,
'hysteresis': 2,
'gradient_as_bucket_view': True,
@@ -167,7 +173,7 @@
)

with open_dict(model.cfg):
-model.cfg.torch_distributed_checkpoint = True
+model.cfg.dist_ckpt_format = 'torch_dist'

model._save_restore_connector = NLPSaveRestoreConnector()
save_file_path = args.path_to_save
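
Most of the changes in this converter script are formatting-only; the substantive change is the last hunk, which replaces the old torch_distributed_checkpoint boolean with the dist_ckpt_format key, assigned inside open_dict so a new key can be set on an otherwise struct-restricted config. A minimal sketch of that pattern, with a made-up config standing in for the real model.cfg:

    from omegaconf import OmegaConf, open_dict

    cfg = OmegaConf.create({"precision": "bf16-mixed"})  # placeholder config, not the real model.cfg
    OmegaConf.set_struct(cfg, True)            # struct mode: unknown keys are normally rejected

    with open_dict(cfg):                       # temporarily allow adding new keys
        cfg.dist_ckpt_format = "torch_dist"    # same assignment the converter now makes

    assert cfg.dist_ckpt_format == "torch_dist"
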