From 1c0a425c04db143cb29e1459b8f2db21724836b5 Mon Sep 17 00:00:00 2001 From: "coderabbitai[bot]" <136622811+coderabbitai[bot]@users.noreply.github.com> Date: Thu, 23 Apr 2026 15:57:19 +0000 Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=93=9D=20Add=20docstrings=20to=20`gka?= =?UTF-8?q?rch/fix-incomplete-tensor-mapping`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Docstrings generation was requested by @grzegorz-k-karch. * https://github.com/NVIDIA/Model-Optimizer/pull/1330#issuecomment-4303244743 The following files were modified: * `modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py` * `modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py` --- .../puzzletron/tools/checkpoint_utils_hf.py | 37 ++++++++-- ...validate_puzzle_with_multi_replacements.py | 69 +++++-------------- 2 files changed, 50 insertions(+), 56 deletions(-) diff --git a/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py index adb639c2b9..7d5d9a8fa9 100644 --- a/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py +++ b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py @@ -200,18 +200,31 @@ def init_model_from_config( def save_checkpoint( model: PreTrainedModel, checkpoint_dir: Path | str, descriptor: "ModelDescriptor" ) -> None: + """ + Save a model's configuration and weights to a Hugging Face-compatible checkpoint using the subblocks safetensors layout. + + This writes the model's configuration and the model's state dictionary partitioned into subblocks suitable for safetensors indexing, placing the resulting files under the specified checkpoint directory. + + Parameters: + model (PreTrainedModel): The model whose config and weights will be saved. + checkpoint_dir (Path | str): Destination directory for the checkpoint files. + descriptor (ModelDescriptor): Descriptor that determines how model weights are grouped and named into subblocks for the safetensors index. + """ _save_checkpoint(model.config, model.state_dict(), checkpoint_dir, descriptor) def save_checkpoint_from_shards( model: PreTrainedModel, checkpoint_dir: Path | str, descriptor: "ModelDescriptor" ) -> None: - """Save a checkpoint whose weights are split across distributed ranks. - - Each rank holds only a subset of the model's layers (via ``load_and_shard_model``). - This function gathers every rank's partial state dict onto rank 0 so that - ``model.safetensors.index.json`` is built from the *complete* weight map. - Falls back to :func:`save_checkpoint` when running on a single process. + """ + Save a checkpoint when the model's weights are sharded across distributed ranks. + + Gathers each rank's partial state dictionary onto rank 0 and writes a complete checkpoint (including the safetensors index and subblocks) from the merged weights. On a single-process run, saves directly from the local state dict. Only rank 0 performs the filesystem write; non-master ranks only participate in the gather. + + Parameters: + model (PreTrainedModel): The model instance whose local state_dict contains this rank's shard of weights. + checkpoint_dir (Path | str): Destination directory for the checkpoint files. + descriptor (ModelDescriptor): Descriptor used to partition weights into subblocks and build the safetensors index. """ local_sd = {k: v.cpu() for k, v in model.state_dict().items()} @@ -236,6 +249,18 @@ def _save_checkpoint( descriptor: "ModelDescriptor", max_workers: int | None = None, # Now optional - will auto-calculate if None ) -> None: + """ + Save a model configuration and its state tensors into a subblocks safetensors checkpoint layout. + + Saves the provided model config to checkpoint_dir, partitions the given state_dict into subblock files according to the provided descriptor, writes a safetensors index (model.safetensors.index.json) that maps tensor names to subblock filenames, and writes the per-subblock safetensors files. If model_config.tie_word_embeddings is true and the output embedding weight is present, that tensor is removed from the state_dict and index before writing the index so the tied embedding is not duplicated. The checkpoint directory is created if it does not exist. + + Parameters: + model_config (PretrainedConfig): Model configuration to save (written via save_pretrained). + state_dict (dict[str, torch.Tensor]): Mapping of tensor names to CPU tensors to be saved; tensors may be filtered (e.g., tied embeddings removed) before writing. + checkpoint_dir (Path | str): Target directory where config, index, and subblocks directory will be written; the directory is created if necessary. + descriptor (ModelDescriptor): Descriptor used to compute subblock groups and to derive the output embedding tensor name. + max_workers (int | None): Maximum number of worker threads to use when writing subblock files. If None, the implementation will choose a sensible default based on CPU count and number of files. + """ if not isinstance(checkpoint_dir, Path): checkpoint_dir = Path(checkpoint_dir) diff --git a/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py b/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py index feda1f8aeb..cd465a30f5 100644 --- a/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py +++ b/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py @@ -63,57 +63,26 @@ @torch.no_grad() def validate_puzzle_solutions(args: DictConfig) -> None: - """Validate puzzle solutions by applying layer replacements and evaluating model performance. - - Args: - args: Configuration object containing the following attributes: - - Puzzle Configuration (Required): - - replacement_library_path (Path): Path to the replacement library JSON file. - - solutions_path (Path): Path to puzzle solutions JSON file or directory containing solution files. - - solutions_to_validate (list[int], optional): Indices of specific solutions to validate. Validates all solutions if None. - - sort_solutions_by (str, optional): JSON field path to sort solutions by before validation. - - bigger_is_better (bool): If True, sort solutions in descending order. Used with sort_solutions_by. - - skip_validation (bool): If True, skip model validation and only save models if requested. - - save_models (bool): If True, save realized model checkpoints for each solution. - - Teacher/Tokenizer Configuration: - - teacher_dir (Path, optional): Path to teacher model directory. Auto-inferred if not provided. - - tokenizer_name (str, optional): Tokenizer name/path. Uses teacher_dir if not specified. - - Model Configuration (Required if skip_validation=False): - - model_dtype (str or torch.dtype): Model data type (e.g., "torch.bfloat16", torch.float16). - - autocast_dtype (str or torch.dtype): Autocast data type for mixed precision. - - Dataset Configuration (Required if skip_validation=False): - - dataset_path (str): Path to the validation dataset. - - data_column (str): Column name in dataset containing text data. - - block_size (int): Maximum sequence length for tokenization. - - eval_samples (int, optional): Number of samples to evaluate. - - val_dataset_name (str): Name of validation dataset split. - - source_datasets_to_discard (list[str], optional): List of source datasets to exclude. - - load_dataset_fn (callable, optional): Custom function to load the dataset. - - Data Processing (Required if skip_validation=False): - - micro_batch_size (int): Batch size for evaluation. - - seed (int): Random seed for reproducibility. - - shuffle_seed (int, optional): Seed for shuffling data. - - varlen (bool): Enable variable-length sequences. - - bos_rate (float): Rate of adding BOS token. - - fim_rate (float): Fill-in-the-middle rate for code completion tasks. - - fim_spm_rate (float): SPM-based fill-in-the-middle rate. - - Output Configuration: - - output_dir (Path, optional): Directory to save validation results. Auto-generated from solutions_path if not provided. - - Execution Options (Optional if skip_validation=False): - - calc_losses_on_cpu (bool): Calculate losses on CPU to avoid OOM. - - write_results (bool): Write validation results to file. - - activations_log_dir (str, optional): Directory to log activation scores. - - activation_hooks_kwargs (str or dict, optional): Arguments for activation hooks. - + """ + Validate and (optionally) save realized models for a collection of puzzle solutions. + + Loads puzzle solutions and a replacement library, applies each solution's layer replacements to realize a model, optionally evaluates realized models (including optional teacher-based hidden-state similarity metrics), and optionally saves realized model checkpoints and tokenizer files to disk. + + Parameters: + args (DictConfig): Configuration with fields used by this routine. Key fields: + - replacement_library_path (Path): Path to the replacement library JSON. + - solutions_path (Path): File or directory containing puzzle solution JSON(s). + - solutions_to_validate (list[int], optional): Indices of solutions to process; all solutions if None. + - skip_validation (bool): If True, skip model validation steps. + - save_models (bool): If True, save realized model checkpoints and tokenizer files. + - teacher_dir (Path, optional): Path to a teacher model for hidden-state comparisons. + - tokenizer_name (str, optional): Tokenizer name or path; teacher_dir is used if unset. + - output_dir (Path, optional): Directory to write validation outputs; auto-derived from solutions_path if unset. + - model_dtype (str or torch.dtype, optional): Dtype to set on saved model configs. + - (Other dataset/validation options may be read from args when validation is enabled.) + Returns: - None. Saves validation results and optionally model checkpoints to disk. + None """ descriptor = ModelDescriptorFactory.get(args.descriptor) From e59c7158bd94e241ad03a38c6de4f6f351179551 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Thu, 23 Apr 2026 22:37:22 +0200 Subject: [PATCH 2/3] undid some docstrings Signed-off-by: Grzegorz Karch --- .../puzzletron/tools/checkpoint_utils_hf.py | 37 +++++-------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py index 7d5d9a8fa9..dfa6eb8233 100644 --- a/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py +++ b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py @@ -200,16 +200,6 @@ def init_model_from_config( def save_checkpoint( model: PreTrainedModel, checkpoint_dir: Path | str, descriptor: "ModelDescriptor" ) -> None: - """ - Save a model's configuration and weights to a Hugging Face-compatible checkpoint using the subblocks safetensors layout. - - This writes the model's configuration and the model's state dictionary partitioned into subblocks suitable for safetensors indexing, placing the resulting files under the specified checkpoint directory. - - Parameters: - model (PreTrainedModel): The model whose config and weights will be saved. - checkpoint_dir (Path | str): Destination directory for the checkpoint files. - descriptor (ModelDescriptor): Descriptor that determines how model weights are grouped and named into subblocks for the safetensors index. - """ _save_checkpoint(model.config, model.state_dict(), checkpoint_dir, descriptor) @@ -218,13 +208,18 @@ def save_checkpoint_from_shards( ) -> None: """ Save a checkpoint when the model's weights are sharded across distributed ranks. - - Gathers each rank's partial state dictionary onto rank 0 and writes a complete checkpoint (including the safetensors index and subblocks) from the merged weights. On a single-process run, saves directly from the local state dict. Only rank 0 performs the filesystem write; non-master ranks only participate in the gather. - + + Gathers each rank's partial state dictionary onto rank 0 and writes a complete checkpoint + (including the safetensors index and subblocks) from the merged weights. On a single-process + run, saves directly from the local state dict. Only rank 0 performs the filesystem write; + non-master ranks only participate in the gather. + Parameters: - model (PreTrainedModel): The model instance whose local state_dict contains this rank's shard of weights. + model (PreTrainedModel): The model instance whose local state_dict contains this rank's + shard of weights. checkpoint_dir (Path | str): Destination directory for the checkpoint files. - descriptor (ModelDescriptor): Descriptor used to partition weights into subblocks and build the safetensors index. + descriptor (ModelDescriptor): Descriptor used to partition weights into subblocks and build + the safetensors index. """ local_sd = {k: v.cpu() for k, v in model.state_dict().items()} @@ -249,18 +244,6 @@ def _save_checkpoint( descriptor: "ModelDescriptor", max_workers: int | None = None, # Now optional - will auto-calculate if None ) -> None: - """ - Save a model configuration and its state tensors into a subblocks safetensors checkpoint layout. - - Saves the provided model config to checkpoint_dir, partitions the given state_dict into subblock files according to the provided descriptor, writes a safetensors index (model.safetensors.index.json) that maps tensor names to subblock filenames, and writes the per-subblock safetensors files. If model_config.tie_word_embeddings is true and the output embedding weight is present, that tensor is removed from the state_dict and index before writing the index so the tied embedding is not duplicated. The checkpoint directory is created if it does not exist. - - Parameters: - model_config (PretrainedConfig): Model configuration to save (written via save_pretrained). - state_dict (dict[str, torch.Tensor]): Mapping of tensor names to CPU tensors to be saved; tensors may be filtered (e.g., tied embeddings removed) before writing. - checkpoint_dir (Path | str): Target directory where config, index, and subblocks directory will be written; the directory is created if necessary. - descriptor (ModelDescriptor): Descriptor used to compute subblock groups and to derive the output embedding tensor name. - max_workers (int | None): Maximum number of worker threads to use when writing subblock files. If None, the implementation will choose a sensible default based on CPU count and number of files. - """ if not isinstance(checkpoint_dir, Path): checkpoint_dir = Path(checkpoint_dir) From f733cfd6190d4e215ff2613702be08cc1646fc7e Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Thu, 23 Apr 2026 22:41:32 +0200 Subject: [PATCH 3/3] undid some docstrings Signed-off-by: Grzegorz Karch --- ...validate_puzzle_with_multi_replacements.py | 69 ++++++++++++++----- 1 file changed, 50 insertions(+), 19 deletions(-) diff --git a/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py b/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py index cd465a30f5..feda1f8aeb 100644 --- a/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py +++ b/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py @@ -63,26 +63,57 @@ @torch.no_grad() def validate_puzzle_solutions(args: DictConfig) -> None: - """ - Validate and (optionally) save realized models for a collection of puzzle solutions. - - Loads puzzle solutions and a replacement library, applies each solution's layer replacements to realize a model, optionally evaluates realized models (including optional teacher-based hidden-state similarity metrics), and optionally saves realized model checkpoints and tokenizer files to disk. - - Parameters: - args (DictConfig): Configuration with fields used by this routine. Key fields: - - replacement_library_path (Path): Path to the replacement library JSON. - - solutions_path (Path): File or directory containing puzzle solution JSON(s). - - solutions_to_validate (list[int], optional): Indices of solutions to process; all solutions if None. - - skip_validation (bool): If True, skip model validation steps. - - save_models (bool): If True, save realized model checkpoints and tokenizer files. - - teacher_dir (Path, optional): Path to a teacher model for hidden-state comparisons. - - tokenizer_name (str, optional): Tokenizer name or path; teacher_dir is used if unset. - - output_dir (Path, optional): Directory to write validation outputs; auto-derived from solutions_path if unset. - - model_dtype (str or torch.dtype, optional): Dtype to set on saved model configs. - - (Other dataset/validation options may be read from args when validation is enabled.) - + """Validate puzzle solutions by applying layer replacements and evaluating model performance. + + Args: + args: Configuration object containing the following attributes: + + Puzzle Configuration (Required): + - replacement_library_path (Path): Path to the replacement library JSON file. + - solutions_path (Path): Path to puzzle solutions JSON file or directory containing solution files. + - solutions_to_validate (list[int], optional): Indices of specific solutions to validate. Validates all solutions if None. + - sort_solutions_by (str, optional): JSON field path to sort solutions by before validation. + - bigger_is_better (bool): If True, sort solutions in descending order. Used with sort_solutions_by. + - skip_validation (bool): If True, skip model validation and only save models if requested. + - save_models (bool): If True, save realized model checkpoints for each solution. + + Teacher/Tokenizer Configuration: + - teacher_dir (Path, optional): Path to teacher model directory. Auto-inferred if not provided. + - tokenizer_name (str, optional): Tokenizer name/path. Uses teacher_dir if not specified. + + Model Configuration (Required if skip_validation=False): + - model_dtype (str or torch.dtype): Model data type (e.g., "torch.bfloat16", torch.float16). + - autocast_dtype (str or torch.dtype): Autocast data type for mixed precision. + + Dataset Configuration (Required if skip_validation=False): + - dataset_path (str): Path to the validation dataset. + - data_column (str): Column name in dataset containing text data. + - block_size (int): Maximum sequence length for tokenization. + - eval_samples (int, optional): Number of samples to evaluate. + - val_dataset_name (str): Name of validation dataset split. + - source_datasets_to_discard (list[str], optional): List of source datasets to exclude. + - load_dataset_fn (callable, optional): Custom function to load the dataset. + + Data Processing (Required if skip_validation=False): + - micro_batch_size (int): Batch size for evaluation. + - seed (int): Random seed for reproducibility. + - shuffle_seed (int, optional): Seed for shuffling data. + - varlen (bool): Enable variable-length sequences. + - bos_rate (float): Rate of adding BOS token. + - fim_rate (float): Fill-in-the-middle rate for code completion tasks. + - fim_spm_rate (float): SPM-based fill-in-the-middle rate. + + Output Configuration: + - output_dir (Path, optional): Directory to save validation results. Auto-generated from solutions_path if not provided. + + Execution Options (Optional if skip_validation=False): + - calc_losses_on_cpu (bool): Calculate losses on CPU to avoid OOM. + - write_results (bool): Write validation results to file. + - activations_log_dir (str, optional): Directory to log activation scores. + - activation_hooks_kwargs (str or dict, optional): Arguments for activation hooks. + Returns: - None + None. Saves validation results and optionally model checkpoints to disk. """ descriptor = ModelDescriptorFactory.get(args.descriptor)