From 1c0a425c04db143cb29e1459b8f2db21724836b5 Mon Sep 17 00:00:00 2001
From: "coderabbitai[bot]"
 <136622811+coderabbitai[bot]@users.noreply.github.com>
Date: Thu, 23 Apr 2026 15:57:19 +0000
Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=93=9D=20Add=20docstrings=20to=20`gka?=
 =?UTF-8?q?rch/fix-incomplete-tensor-mapping`?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Docstrings generation was requested by @grzegorz-k-karch.

* https://github.com/NVIDIA/Model-Optimizer/pull/1330#issuecomment-4303244743

The following files were modified:

* `modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py`
* `modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py`
---
 .../puzzletron/tools/checkpoint_utils_hf.py   | 37 ++++++++--
 ...validate_puzzle_with_multi_replacements.py | 69 +++++--------------
 2 files changed, 50 insertions(+), 56 deletions(-)

diff --git a/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py
index adb639c2b9..7d5d9a8fa9 100644
--- a/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py
+++ b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py
@@ -200,18 +200,31 @@ def init_model_from_config(
 def save_checkpoint(
     model: PreTrainedModel, checkpoint_dir: Path | str, descriptor: "ModelDescriptor"
 ) -> None:
+    """
+    Save a model's configuration and weights to a Hugging Face-compatible checkpoint using the subblocks safetensors layout.
+    
+    This writes the model's configuration and the model's state dictionary partitioned into subblocks suitable for safetensors indexing, placing the resulting files under the specified checkpoint directory.
+    
+    Parameters:
+        model (PreTrainedModel): The model whose config and weights will be saved.
+        checkpoint_dir (Path | str): Destination directory for the checkpoint files.
+        descriptor (ModelDescriptor): Descriptor that determines how model weights are grouped and named into subblocks for the safetensors index.
+    """
     _save_checkpoint(model.config, model.state_dict(), checkpoint_dir, descriptor)
 
 
 def save_checkpoint_from_shards(
     model: PreTrainedModel, checkpoint_dir: Path | str, descriptor: "ModelDescriptor"
 ) -> None:
-    """Save a checkpoint whose weights are split across distributed ranks.
-
-    Each rank holds only a subset of the model's layers (via ``load_and_shard_model``).
-    This function gathers every rank's partial state dict onto rank 0 so that
-    ``model.safetensors.index.json`` is built from the *complete* weight map.
-    Falls back to :func:`save_checkpoint` when running on a single process.
+    """
+    Save a checkpoint when the model's weights are sharded across distributed ranks.
+    
+    Gathers each rank's partial state dictionary onto rank 0 and writes a complete checkpoint (including the safetensors index and subblocks) from the merged weights. On a single-process run, saves directly from the local state dict. Only rank 0 performs the filesystem write; non-master ranks only participate in the gather.
+    
+    Parameters:
+        model (PreTrainedModel): The model instance whose local state_dict contains this rank's shard of weights.
+        checkpoint_dir (Path | str): Destination directory for the checkpoint files.
+        descriptor (ModelDescriptor): Descriptor used to partition weights into subblocks and build the safetensors index.
     """
 
     local_sd = {k: v.cpu() for k, v in model.state_dict().items()}
@@ -236,6 +249,18 @@ def _save_checkpoint(
     descriptor: "ModelDescriptor",
     max_workers: int | None = None,  # Now optional - will auto-calculate if None
 ) -> None:
+    """
+    Save a model configuration and its state tensors into a subblocks safetensors checkpoint layout.
+    
+    Saves the provided model config to checkpoint_dir, partitions the given state_dict into subblock files according to the provided descriptor, writes a safetensors index (model.safetensors.index.json) that maps tensor names to subblock filenames, and writes the per-subblock safetensors files. If model_config.tie_word_embeddings is true and the output embedding weight is present, that tensor is removed from the state_dict and index before writing the index so the tied embedding is not duplicated. The checkpoint directory is created if it does not exist.
+    
+    Parameters:
+        model_config (PretrainedConfig): Model configuration to save (written via save_pretrained).
+        state_dict (dict[str, torch.Tensor]): Mapping of tensor names to CPU tensors to be saved; tensors may be filtered (e.g., tied embeddings removed) before writing.
+        checkpoint_dir (Path | str): Target directory where config, index, and subblocks directory will be written; the directory is created if necessary.
+        descriptor (ModelDescriptor): Descriptor used to compute subblock groups and to derive the output embedding tensor name.
+        max_workers (int | None): Maximum number of worker threads to use when writing subblock files. If None, the implementation will choose a sensible default based on CPU count and number of files.
+    """
     if not isinstance(checkpoint_dir, Path):
         checkpoint_dir = Path(checkpoint_dir)
 
diff --git a/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py b/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py
index feda1f8aeb..cd465a30f5 100644
--- a/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py
+++ b/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py
@@ -63,57 +63,26 @@
 
 @torch.no_grad()
 def validate_puzzle_solutions(args: DictConfig) -> None:
-    """Validate puzzle solutions by applying layer replacements and evaluating model performance.
-
-    Args:
-        args: Configuration object containing the following attributes:
-
-            Puzzle Configuration (Required):
-            - replacement_library_path (Path): Path to the replacement library JSON file.
-            - solutions_path (Path): Path to puzzle solutions JSON file or directory containing solution files.
-            - solutions_to_validate (list[int], optional): Indices of specific solutions to validate. Validates all solutions if None.
-            - sort_solutions_by (str, optional): JSON field path to sort solutions by before validation.
-            - bigger_is_better (bool): If True, sort solutions in descending order. Used with sort_solutions_by.
-            - skip_validation (bool): If True, skip model validation and only save models if requested.
-            - save_models (bool): If True, save realized model checkpoints for each solution.
-
-            Teacher/Tokenizer Configuration:
-            - teacher_dir (Path, optional): Path to teacher model directory. Auto-inferred if not provided.
-            - tokenizer_name (str, optional): Tokenizer name/path. Uses teacher_dir if not specified.
-
-            Model Configuration (Required if skip_validation=False):
-            - model_dtype (str or torch.dtype): Model data type (e.g., "torch.bfloat16", torch.float16).
-            - autocast_dtype (str or torch.dtype): Autocast data type for mixed precision.
-
-            Dataset Configuration (Required if skip_validation=False):
-            - dataset_path (str): Path to the validation dataset.
-            - data_column (str): Column name in dataset containing text data.
-            - block_size (int): Maximum sequence length for tokenization.
-            - eval_samples (int, optional): Number of samples to evaluate.
-            - val_dataset_name (str): Name of validation dataset split.
-            - source_datasets_to_discard (list[str], optional): List of source datasets to exclude.
-            - load_dataset_fn (callable, optional): Custom function to load the dataset.
-
-            Data Processing (Required if skip_validation=False):
-            - micro_batch_size (int): Batch size for evaluation.
-            - seed (int): Random seed for reproducibility.
-            - shuffle_seed (int, optional): Seed for shuffling data.
-            - varlen (bool): Enable variable-length sequences.
-            - bos_rate (float): Rate of adding BOS token.
-            - fim_rate (float): Fill-in-the-middle rate for code completion tasks.
-            - fim_spm_rate (float): SPM-based fill-in-the-middle rate.
-
-            Output Configuration:
-            - output_dir (Path, optional): Directory to save validation results. Auto-generated from solutions_path if not provided.
-
-            Execution Options (Optional if skip_validation=False):
-            - calc_losses_on_cpu (bool): Calculate losses on CPU to avoid OOM.
-            - write_results (bool): Write validation results to file.
-            - activations_log_dir (str, optional): Directory to log activation scores.
-            - activation_hooks_kwargs (str or dict, optional): Arguments for activation hooks.
-
+    """
+    Validate and (optionally) save realized models for a collection of puzzle solutions.
+    
+    Loads puzzle solutions and a replacement library, applies each solution's layer replacements to realize a model, optionally evaluates realized models (including optional teacher-based hidden-state similarity metrics), and optionally saves realized model checkpoints and tokenizer files to disk.
+    
+    Parameters:
+        args (DictConfig): Configuration with fields used by this routine. Key fields:
+            - replacement_library_path (Path): Path to the replacement library JSON.
+            - solutions_path (Path): File or directory containing puzzle solution JSON(s).
+            - solutions_to_validate (list[int], optional): Indices of solutions to process; all solutions if None.
+            - skip_validation (bool): If True, skip model validation steps.
+            - save_models (bool): If True, save realized model checkpoints and tokenizer files.
+            - teacher_dir (Path, optional): Path to a teacher model for hidden-state comparisons.
+            - tokenizer_name (str, optional): Tokenizer name or path; teacher_dir is used if unset.
+            - output_dir (Path, optional): Directory to write validation outputs; auto-derived from solutions_path if unset.
+            - model_dtype (str or torch.dtype, optional): Dtype to set on saved model configs.
+            - (Other dataset/validation options may be read from args when validation is enabled.)
+    
     Returns:
-        None. Saves validation results and optionally model checkpoints to disk.
+        None
     """
     descriptor = ModelDescriptorFactory.get(args.descriptor)
 

From e59c7158bd94e241ad03a38c6de4f6f351179551 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Thu, 23 Apr 2026 22:37:22 +0200
Subject: [PATCH 2/3] Undo some generated docstrings in checkpoint_utils_hf.py

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../puzzletron/tools/checkpoint_utils_hf.py   | 37 +++++--------------
 1 file changed, 10 insertions(+), 27 deletions(-)

diff --git a/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py
index 7d5d9a8fa9..dfa6eb8233 100644
--- a/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py
+++ b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py
@@ -200,16 +200,6 @@ def init_model_from_config(
 def save_checkpoint(
     model: PreTrainedModel, checkpoint_dir: Path | str, descriptor: "ModelDescriptor"
 ) -> None:
-    """
-    Save a model's configuration and weights to a Hugging Face-compatible checkpoint using the subblocks safetensors layout.
-    
-    This writes the model's configuration and the model's state dictionary partitioned into subblocks suitable for safetensors indexing, placing the resulting files under the specified checkpoint directory.
-    
-    Parameters:
-        model (PreTrainedModel): The model whose config and weights will be saved.
-        checkpoint_dir (Path | str): Destination directory for the checkpoint files.
-        descriptor (ModelDescriptor): Descriptor that determines how model weights are grouped and named into subblocks for the safetensors index.
-    """
     _save_checkpoint(model.config, model.state_dict(), checkpoint_dir, descriptor)
 
 
@@ -218,13 +208,18 @@ def save_checkpoint_from_shards(
 ) -> None:
     """
     Save a checkpoint when the model's weights are sharded across distributed ranks.
-    
-    Gathers each rank's partial state dictionary onto rank 0 and writes a complete checkpoint (including the safetensors index and subblocks) from the merged weights. On a single-process run, saves directly from the local state dict. Only rank 0 performs the filesystem write; non-master ranks only participate in the gather.
-    
+
+    Gathers each rank's partial state dictionary onto rank 0 and writes a complete checkpoint
+    (including the safetensors index and subblocks) from the merged weights. On a single-process
+    run, saves directly from the local state dict. Only rank 0 performs the filesystem write;
+    non-master ranks only participate in the gather.
+
     Parameters:
-        model (PreTrainedModel): The model instance whose local state_dict contains this rank's shard of weights.
+        model (PreTrainedModel): The model instance whose local state_dict contains this rank's
+            shard of weights.
         checkpoint_dir (Path | str): Destination directory for the checkpoint files.
-        descriptor (ModelDescriptor): Descriptor used to partition weights into subblocks and build the safetensors index.
+        descriptor (ModelDescriptor): Descriptor used to partition weights into subblocks and build
+            the safetensors index.
     """
 
     local_sd = {k: v.cpu() for k, v in model.state_dict().items()}
@@ -249,18 +244,6 @@ def _save_checkpoint(
     descriptor: "ModelDescriptor",
     max_workers: int | None = None,  # Now optional - will auto-calculate if None
 ) -> None:
-    """
-    Save a model configuration and its state tensors into a subblocks safetensors checkpoint layout.
-    
-    Saves the provided model config to checkpoint_dir, partitions the given state_dict into subblock files according to the provided descriptor, writes a safetensors index (model.safetensors.index.json) that maps tensor names to subblock filenames, and writes the per-subblock safetensors files. If model_config.tie_word_embeddings is true and the output embedding weight is present, that tensor is removed from the state_dict and index before writing the index so the tied embedding is not duplicated. The checkpoint directory is created if it does not exist.
-    
-    Parameters:
-        model_config (PretrainedConfig): Model configuration to save (written via save_pretrained).
-        state_dict (dict[str, torch.Tensor]): Mapping of tensor names to CPU tensors to be saved; tensors may be filtered (e.g., tied embeddings removed) before writing.
-        checkpoint_dir (Path | str): Target directory where config, index, and subblocks directory will be written; the directory is created if necessary.
-        descriptor (ModelDescriptor): Descriptor used to compute subblock groups and to derive the output embedding tensor name.
-        max_workers (int | None): Maximum number of worker threads to use when writing subblock files. If None, the implementation will choose a sensible default based on CPU count and number of files.
-    """
     if not isinstance(checkpoint_dir, Path):
         checkpoint_dir = Path(checkpoint_dir)
 

From f733cfd6190d4e215ff2613702be08cc1646fc7e Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Thu, 23 Apr 2026 22:41:32 +0200
Subject: [PATCH 3/3] Restore original validate_puzzle_solutions docstring

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 ...validate_puzzle_with_multi_replacements.py | 69 ++++++++++++++-----
 1 file changed, 50 insertions(+), 19 deletions(-)

diff --git a/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py b/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py
index cd465a30f5..feda1f8aeb 100644
--- a/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py
+++ b/modelopt/torch/puzzletron/tools/validate_puzzle_with_multi_replacements.py
@@ -63,26 +63,57 @@
 
 @torch.no_grad()
 def validate_puzzle_solutions(args: DictConfig) -> None:
-    """
-    Validate and (optionally) save realized models for a collection of puzzle solutions.
-    
-    Loads puzzle solutions and a replacement library, applies each solution's layer replacements to realize a model, optionally evaluates realized models (including optional teacher-based hidden-state similarity metrics), and optionally saves realized model checkpoints and tokenizer files to disk.
-    
-    Parameters:
-        args (DictConfig): Configuration with fields used by this routine. Key fields:
-            - replacement_library_path (Path): Path to the replacement library JSON.
-            - solutions_path (Path): File or directory containing puzzle solution JSON(s).
-            - solutions_to_validate (list[int], optional): Indices of solutions to process; all solutions if None.
-            - skip_validation (bool): If True, skip model validation steps.
-            - save_models (bool): If True, save realized model checkpoints and tokenizer files.
-            - teacher_dir (Path, optional): Path to a teacher model for hidden-state comparisons.
-            - tokenizer_name (str, optional): Tokenizer name or path; teacher_dir is used if unset.
-            - output_dir (Path, optional): Directory to write validation outputs; auto-derived from solutions_path if unset.
-            - model_dtype (str or torch.dtype, optional): Dtype to set on saved model configs.
-            - (Other dataset/validation options may be read from args when validation is enabled.)
-    
+    """Validate puzzle solutions by applying layer replacements and evaluating model performance.
+
+    Args:
+        args: Configuration object containing the following attributes:
+
+            Puzzle Configuration (Required):
+            - replacement_library_path (Path): Path to the replacement library JSON file.
+            - solutions_path (Path): Path to puzzle solutions JSON file or directory containing solution files.
+            - solutions_to_validate (list[int], optional): Indices of specific solutions to validate. Validates all solutions if None.
+            - sort_solutions_by (str, optional): JSON field path to sort solutions by before validation.
+            - bigger_is_better (bool): If True, sort solutions in descending order. Used with sort_solutions_by.
+            - skip_validation (bool): If True, skip model validation and only save models if requested.
+            - save_models (bool): If True, save realized model checkpoints for each solution.
+
+            Teacher/Tokenizer Configuration:
+            - teacher_dir (Path, optional): Path to teacher model directory. Auto-inferred if not provided.
+            - tokenizer_name (str, optional): Tokenizer name/path. Uses teacher_dir if not specified.
+
+            Model Configuration (Required if skip_validation=False):
+            - model_dtype (str or torch.dtype): Model data type (e.g., "torch.bfloat16", torch.float16).
+            - autocast_dtype (str or torch.dtype): Autocast data type for mixed precision.
+
+            Dataset Configuration (Required if skip_validation=False):
+            - dataset_path (str): Path to the validation dataset.
+            - data_column (str): Column name in dataset containing text data.
+            - block_size (int): Maximum sequence length for tokenization.
+            - eval_samples (int, optional): Number of samples to evaluate.
+            - val_dataset_name (str): Name of validation dataset split.
+            - source_datasets_to_discard (list[str], optional): List of source datasets to exclude.
+            - load_dataset_fn (callable, optional): Custom function to load the dataset.
+
+            Data Processing (Required if skip_validation=False):
+            - micro_batch_size (int): Batch size for evaluation.
+            - seed (int): Random seed for reproducibility.
+            - shuffle_seed (int, optional): Seed for shuffling data.
+            - varlen (bool): Enable variable-length sequences.
+            - bos_rate (float): Rate of adding BOS token.
+            - fim_rate (float): Fill-in-the-middle rate for code completion tasks.
+            - fim_spm_rate (float): SPM-based fill-in-the-middle rate.
+
+            Output Configuration:
+            - output_dir (Path, optional): Directory to save validation results. Auto-generated from solutions_path if not provided.
+
+            Execution Options (Optional if skip_validation=False):
+            - calc_losses_on_cpu (bool): Calculate losses on CPU to avoid OOM.
+            - write_results (bool): Write validation results to file.
+            - activations_log_dir (str, optional): Directory to log activation scores.
+            - activation_hooks_kwargs (str or dict, optional): Arguments for activation hooks.
+
     Returns:
-        None
+        None. Saves validation results and optionally model checkpoints to disk.
     """
     descriptor = ModelDescriptorFactory.get(args.descriptor)