From 10fdc463f1f22cedfe0484e328a5fb211d4c91c4 Mon Sep 17 00:00:00 2001
From: Pawel Gadzinski
Date: Mon, 1 Sep 2025 10:17:09 +0000
Subject: [PATCH 01/20] initial code drop

Signed-off-by: Pawel Gadzinski
---
 transformer_engine/debug/features/api.py |  6 +++---
 .../debug/features/log_fp8_tensor_stats.py |  6 +++++-
 .../debug/features/log_tensor_stats.py |  6 +++++-
 .../debug/pytorch/debug_quantization.py | 17 +++++++++++++++++
 transformer_engine/pytorch/module/base.py |  6 ++++--
 5 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/transformer_engine/debug/features/api.py b/transformer_engine/debug/features/api.py
index 94fc6d129c..176d285cb6 100644
--- a/transformer_engine/debug/features/api.py
+++ b/transformer_engine/debug/features/api.py
@@ -244,7 +244,7 @@ def inspect_tensor(
         config: Dict,
         layer_name: str,
         tensor_name: str,
-        tensor: torch.Tensor,
+        tensor: Optional[torch.Tensor],
         rowwise_quantized_tensor: Optional[torch.Tensor],
         columnwise_quantized_tensor: Optional[torch.Tensor],
         quantizer: Optional[Quantizer],
@@ -262,8 +262,8 @@ def inspect_tensor(
         layer_name: str
         tensor_name: str
             one of [`activation`, `weight`, `gradient`, `output`, `wgrad`, `dgrad`],
-        tensor: torch.Tensor
-            tensor in high precision,
+        tensor: Optional[torch.Tensor]
+            tensor in high precision, can be None only if fp8 model parameters are used and tensor name is `weight`.
         rowwise_quantized_tensor: Optional[torch.Tensor]
             rowwise quantized tensor,
         columnwise_quantized_tensor: Optional[torch.Tensor]
diff --git a/transformer_engine/debug/features/log_fp8_tensor_stats.py b/transformer_engine/debug/features/log_fp8_tensor_stats.py
index 31620211d0..2d4a2c0e2f 100644
--- a/transformer_engine/debug/features/log_fp8_tensor_stats.py
+++ b/transformer_engine/debug/features/log_fp8_tensor_stats.py
@@ -268,7 +268,7 @@ def inspect_tensor(
         tensor_name: str,
         iteration: int,
         tp_group: torch.distributed.ProcessGroup,
-        tensor: torch.Tensor,
+        tensor: Optional[torch.Tensor],
         rowwise_quantized_tensor: Optional[torch.Tensor | QuantizedTensor] = None,
         columnwise_quantized_tensor: Optional[torch.Tensor | QuantizedTensor] = None,
         quantizer: Optional[Quantizer] = None,
@@ -281,6 +281,10 @@ def inspect_tensor(
             quantizer is not None
         ), "[NVTORCH INSPECT ERROR] LogFp8TensorStats cannot be run without low-precision recipe."

+        assert tensor is not None, \
+            f"[NVTORCH INSPECT ERROR] LogFp8TensorStats needs tensor in high precision for tensor: {tensor_name}. \
+            This feature cannot be used for weight tensor with fp8 model parameters."
+
         quantized_tensor = rowwise_quantized_tensor
         assert isinstance(
             quantized_tensor, QuantizedTensor
diff --git a/transformer_engine/debug/features/log_tensor_stats.py b/transformer_engine/debug/features/log_tensor_stats.py
index 7ba2f9f771..beeecf19bf 100644
--- a/transformer_engine/debug/features/log_tensor_stats.py
+++ b/transformer_engine/debug/features/log_tensor_stats.py
@@ -115,13 +115,17 @@ def inspect_tensor(
         tensor_name: str,
         iteration: int,
         tp_group: torch.distributed.ProcessGroup,
-        tensor: torch.Tensor,
+        tensor: Optional[torch.Tensor],
         rowwise_quantized_tensor: Optional[torch.Tensor | QuantizedTensor] = None,
         columnwise_quantized_tensor: Optional[torch.Tensor | QuantizedTensor] = None,
         quantizer: Optional[Quantizer] = None,
     ):  # pylint: disable=unused-argument
         """API call used to collect the data about the tensor before process_tensor()/quantization."""

+        if tensor is None:
+            assert isinstance(rowwise_quantized_tensor, QuantizedTensor)
+            tensor = rowwise_quantized_tensor.dequantize()
+
         assert (
             type(tensor) not in [Float8Tensor, Float8TensorBase, MXFP8Tensor, MXFP8TensorBase]
             and tensor.dtype != torch.uint8
diff --git a/transformer_engine/debug/pytorch/debug_quantization.py b/transformer_engine/debug/pytorch/debug_quantization.py
index d564ca8e92..4312989731 100644
--- a/transformer_engine/debug/pytorch/debug_quantization.py
+++ b/transformer_engine/debug/pytorch/debug_quantization.py
@@ -555,6 +555,23 @@ def set_usage(self, rowwise: bool = None, columnwise: bool = None):
         super().set_usage(rowwise=rowwise, columnwise=columnwise)
         if not self.output_tensor:
             self._update_parent_quantizer_usage()
+
+    def wrap_quantized_tensor(self, tensor: QuantizedTensor):
+        """Wraps the quantized tensor with the debug quantizer."""
+
+        assert API_CALL_MODIFY not in [self.rowwise_tensor_plan, self.columnwise_tensor_plan], \
+            "Modify tensor is not allowed in debug mode for weight if fp8 model parameters are used."
+
+
+        self._call_inspect_tensor_api(None, tensor, tensor)
+
+        return DebugQuantizedTensor(
+            rowwise_gemm_tensor=tensor,
+            columnwise_gemm_tensor=tensor,
+            quantizer=self,
+            layer_name=self.layer_name,
+            tensor_name=self.tensor_name,
+        )


 class DebugQuantizedTensor(QuantizedTensorBase):
diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py
index 3bbfaacdf5..78e649c4c4 100644
--- a/transformer_engine/pytorch/module/base.py
+++ b/transformer_engine/pytorch/module/base.py
@@ -1367,6 +1367,10 @@ def get_weight_workspace(
                 rowwise_usage=update_rowwise_usage,
                 columnwise_usage=update_columnwise_usage,
             )
+
+            if isinstance(quantizer, DebugQuantizer):
+                tensor = quantizer.wrap_quantized_tensor(tensor)
+
             return tensor

         # Try getting workspace from cache
@@ -1530,8 +1534,6 @@ def no_debug_features_active(self, quantizers):
         if not run_current:
             return True

-        if self.primary_weights_in_fp8:
-            raise RuntimeError("FP8 weights are not supported in debug mode.")
         return False

     def _validate_name(self):

From 2e5debbb3dda1873de7fed2b6825c2384de4978d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 1 Sep 2025 10:33:26 +0000
Subject: [PATCH 02/20] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../debug/features/log_fp8_tensor_stats.py |  8 +++++---
 .../debug/pytorch/debug_quantization.py | 11 ++++++-----
 transformer_engine/pytorch/module/base.py |  2 +-
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/transformer_engine/debug/features/log_fp8_tensor_stats.py b/transformer_engine/debug/features/log_fp8_tensor_stats.py
index 2d4a2c0e2f..cc2829d996 100644
--- a/transformer_engine/debug/features/log_fp8_tensor_stats.py
+++ b/transformer_engine/debug/features/log_fp8_tensor_stats.py
@@ -281,9 +281,11 @@ def inspect_tensor(
             quantizer is not None
         ), "[NVTORCH INSPECT ERROR] LogFp8TensorStats cannot be run without low-precision recipe."

-        assert tensor is not None, \
-            f"[NVTORCH INSPECT ERROR] LogFp8TensorStats needs tensor in high precision for tensor: {tensor_name}. \
-            This feature cannot be used for weight tensor with fp8 model parameters."
+        assert tensor is not None, (
+            "[NVTORCH INSPECT ERROR] LogFp8TensorStats needs tensor in high precision for tensor:"
+            f" {tensor_name}. This feature cannot be used for weight tensor with fp8"
+            " model parameters."
+        )

         quantized_tensor = rowwise_quantized_tensor
         assert isinstance(
             quantized_tensor, QuantizedTensor
diff --git a/transformer_engine/debug/pytorch/debug_quantization.py b/transformer_engine/debug/pytorch/debug_quantization.py
index 4312989731..3fa548e75f 100644
--- a/transformer_engine/debug/pytorch/debug_quantization.py
+++ b/transformer_engine/debug/pytorch/debug_quantization.py
@@ -555,16 +555,17 @@ def set_usage(self, rowwise: bool = None, columnwise: bool = None):
         super().set_usage(rowwise=rowwise, columnwise=columnwise)
         if not self.output_tensor:
             self._update_parent_quantizer_usage()
-
+
     def wrap_quantized_tensor(self, tensor: QuantizedTensor):
         """Wraps the quantized tensor with the debug quantizer."""

-        assert API_CALL_MODIFY not in [self.rowwise_tensor_plan, self.columnwise_tensor_plan], \
-            "Modify tensor is not allowed in debug mode for weight if fp8 model parameters are used."
-
+        assert API_CALL_MODIFY not in [
+            self.rowwise_tensor_plan,
+            self.columnwise_tensor_plan,
+        ], "Modify tensor is not allowed in debug mode for weight if fp8 model parameters are used."
self._call_inspect_tensor_api(None, tensor, tensor) - + return DebugQuantizedTensor( rowwise_gemm_tensor=tensor, columnwise_gemm_tensor=tensor, diff --git a/transformer_engine/pytorch/module/base.py b/transformer_engine/pytorch/module/base.py index 78e649c4c4..3cd39673cb 100644 --- a/transformer_engine/pytorch/module/base.py +++ b/transformer_engine/pytorch/module/base.py @@ -1370,7 +1370,7 @@ def get_weight_workspace( if isinstance(quantizer, DebugQuantizer): tensor = quantizer.wrap_quantized_tensor(tensor) - + return tensor # Try getting workspace from cache From 11b4c263df9e2789b10ac661ff1f2d1e554bc727 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Mon, 1 Sep 2025 10:45:18 +0000 Subject: [PATCH 03/20] fixes Signed-off-by: Pawel Gadzinski --- tests/pytorch/test_sanity.py | 6 ------ transformer_engine/debug/features/api.py | 2 +- .../debug/features/log_fp8_tensor_stats.py | 20 ++++++++++++------- .../debug/features/log_tensor_stats.py | 2 ++ 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/tests/pytorch/test_sanity.py b/tests/pytorch/test_sanity.py index 5151aa96e7..17de4b3096 100644 --- a/tests/pytorch/test_sanity.py +++ b/tests/pytorch/test_sanity.py @@ -428,8 +428,6 @@ def test_sanity_linear(dtype, fp8_recipe, model, skip_wgrad, skip_dgrad, microba @pytest.mark.parametrize("fp8_model_params", all_boolean) @pytest.mark.parametrize("use_bias", all_boolean) def test_sanity_linear_with_zero_tokens(dtype, bs, model, fp8_recipe, fp8_model_params, use_bias): - if NVTE_TEST_NVINSPECT_ENABLED and fp8_model_params: - pytest.skip("Quantized model parameters are not supported in debug mode.") config = model_configs[model] ffn_hidden_size = 4 * config.hidden_size num_tokens = bs * config.max_seqlen_q @@ -465,8 +463,6 @@ def test_sanity_linear_with_zero_tokens(dtype, bs, model, fp8_recipe, fp8_model_ def test_sanity_grouped_linear( dtype, bs, model, fp8_recipe, fp8_model_params, use_bias, num_gemms, empty_split ): - if NVTE_TEST_NVINSPECT_ENABLED and fp8_model_params: - pytest.skip("FP8 model parameters are not supported in debug mode.") config = model_configs[model] ffn_hidden_size = 4 * config.hidden_size # Small batch size used to catch bug from https://github.com/NVIDIA/TransformerEngine/pull/1527. @@ -1045,8 +1041,6 @@ def test_inference_mode( quantization: Optional[str], ) -> None: """Test heuristics for initializing quantized weights""" - if NVTE_TEST_NVINSPECT_ENABLED and quantization is not None: - pytest.skip("Quantized model parameters are not supported in debug mode.") # Tensor dimensions sequence_length = 32 diff --git a/transformer_engine/debug/features/api.py b/transformer_engine/debug/features/api.py index 176d285cb6..2b0b780e51 100644 --- a/transformer_engine/debug/features/api.py +++ b/transformer_engine/debug/features/api.py @@ -263,7 +263,7 @@ def inspect_tensor( tensor_name: str one of [`activation`, `weight`, `gradient`, `output`, `wgrad`, `dgrad`], tensor: Optional[torch.Tensor] - tensor in high precision, can be None only if fp8 model parameters are used and tensor name is `weight`. + tensor in high precision. It can be None only if fp8 model parameters are used and tensor name is `weight`. 
         rowwise_quantized_tensor: Optional[torch.Tensor]
             rowwise quantized tensor,
         columnwise_quantized_tensor: Optional[torch.Tensor]
diff --git a/transformer_engine/debug/features/log_fp8_tensor_stats.py b/transformer_engine/debug/features/log_fp8_tensor_stats.py
index 2d4a2c0e2f..9af89748e7 100644
--- a/transformer_engine/debug/features/log_fp8_tensor_stats.py
+++ b/transformer_engine/debug/features/log_fp8_tensor_stats.py
@@ -28,6 +28,7 @@
 ALL_RECIPE_NAMES = ["fp8_delayed_scaling", "fp8_current_scaling", "mxfp8", "fp8_block_scaling"]

+
 def _get_recipe_name(quantizer: Optional[Quantizer]):
     if quantizer is None:
         return ""
@@ -110,6 +111,9 @@ class LogFp8TensorStats(BaseLogTensorStats):
     - scale_inv_min - minimum of the inverse of the scaling factors,
     - scale_inv_max - maximum of the inverse of the scaling factors,
     - mse - mean squared error of the quantized tensor and the original tensor = sum((quantized_tensor - original_tensor)**2) / num_elements,
+
+    If stats are collected for weight tensors and fp8 model parameters are used, only "scale_inv_min", "scale_inv_max" are supported.
+    The other stats need high precision tensor to be computed.

     tensors/tensors_struct: List[str]
         list of tensors to log
@@ -148,7 +152,7 @@ class LogFp8TensorStats(BaseLogTensorStats):
             end_step: 80
     """

-    def check_if_stat_is_supported(self, stat: str, current_recipe: str):
+    def check_if_stat_is_supported(self, stat: str, current_recipe: str, high_precision_tensor_provided: bool):
         """Returns True if stat is supported, raises ValueError otherwise."""
         columnwise = stat.endswith("_columnwise")
         if columnwise:
@@ -156,6 +160,13 @@ def check_if_stat_is_supported(self, stat: str, current_recipe: str):
         recipe_from_stat, _ = self.get_recipe_from_stat(stat, default_recipe=current_recipe)
         stat_without_recipe = stat.replace(recipe_from_stat + "_", "")

+        need_high_precision_tensor_stats = ["underflows%", "overflows%", "mse"]
+        if stat_without_recipe in need_high_precision_tensor_stats and not high_precision_tensor_provided:
+            raise ValueError(
+                f"Stat {stat} needs high precision tensor to be provided. \
+                This feature cannot be used for weight tensor with fp8 model parameters."
+            )
+
         if current_recipe == "" and recipe_from_stat == "":
             raise ValueError(
                 f"Stat {stat} does not contain a recipe name and the current recipe is not set."
@@ -281,11 +292,6 @@ def inspect_tensor(
             quantizer is not None
         ), "[NVTORCH INSPECT ERROR] LogFp8TensorStats cannot be run without low-precision recipe."

-        assert tensor is not None, (
-            "[NVTORCH INSPECT ERROR] LogFp8TensorStats needs tensor in high precision for tensor:"
-            f" {tensor_name}. This feature cannot be used for weight tensor with fp8"
-            " model parameters."
-        )

         quantized_tensor = rowwise_quantized_tensor
         assert isinstance(
             quantized_tensor, QuantizedTensor
@@ -294,7 +300,7 @@ def inspect_tensor(
         recipe_name = _get_recipe_name(quantizer)

         for stat in config["stats"]:
-            self.check_if_stat_is_supported(stat, recipe_name)
+            self.check_if_stat_is_supported(stat, recipe_name, high_precision_tensor_provided=tensor is not None)

         options = (
             config.get("start_step", None),
diff --git a/transformer_engine/debug/features/log_tensor_stats.py b/transformer_engine/debug/features/log_tensor_stats.py
index beeecf19bf..c1f1f1080e 100644
--- a/transformer_engine/debug/features/log_tensor_stats.py
+++ b/transformer_engine/debug/features/log_tensor_stats.py
@@ -122,6 +122,8 @@ def inspect_tensor(
     ):  # pylint: disable=unused-argument
         """API call used to collect the data about the tensor before process_tensor()/quantization."""

+        # Tensor is None only if fp8 model parameters are used and tensor name is `weight`.
+        # If one wants to collect stats for this tensor, we need to dequantize it.
         if tensor is None:
             assert isinstance(rowwise_quantized_tensor, QuantizedTensor)
             tensor = rowwise_quantized_tensor.dequantize()

From 8a8eed3212bd3c195dac8dc9c14fd59a856d7d1c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 1 Sep 2025 10:47:41 +0000
Subject: [PATCH 04/20] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../debug/features/log_fp8_tensor_stats.py | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/transformer_engine/debug/features/log_fp8_tensor_stats.py b/transformer_engine/debug/features/log_fp8_tensor_stats.py
index 9af89748e7..4a9416568e 100644
--- a/transformer_engine/debug/features/log_fp8_tensor_stats.py
+++ b/transformer_engine/debug/features/log_fp8_tensor_stats.py
@@ -28,7 +28,6 @@
 ALL_RECIPE_NAMES = ["fp8_delayed_scaling", "fp8_current_scaling", "mxfp8", "fp8_block_scaling"]

-
 def _get_recipe_name(quantizer: Optional[Quantizer]):
     if quantizer is None:
         return ""
@@ -110,7 +110,7 @@ class LogFp8TensorStats(BaseLogTensorStats):
     - scale_inv_max - maximum of the inverse of the scaling factors,
     - mse - mean squared error of the quantized tensor and the original tensor = sum((quantized_tensor - original_tensor)**2) / num_elements,
-
+
     If stats are collected for weight tensors and fp8 model parameters are used, only "scale_inv_min", "scale_inv_max" are supported.
     The other stats need high precision tensor to be computed.

     tensors/tensors_struct: List[str]
@@ -152,7 +151,9 @@ class LogFp8TensorStats(BaseLogTensorStats):
             end_step: 80
     """

-    def check_if_stat_is_supported(self, stat: str, current_recipe: str, high_precision_tensor_provided: bool):
+    def check_if_stat_is_supported(
+        self, stat: str, current_recipe: str, high_precision_tensor_provided: bool
+    ):
         """Returns True if stat is supported, raises ValueError otherwise."""
         columnwise = stat.endswith("_columnwise")
         if columnwise:
@@ -161,10 +162,13 @@ def check_if_stat_is_supported(
         stat_without_recipe = stat.replace(recipe_from_stat + "_", "")

         need_high_precision_tensor_stats = ["underflows%", "overflows%", "mse"]
-        if stat_without_recipe in need_high_precision_tensor_stats and not high_precision_tensor_provided:
+        if (
+            stat_without_recipe in need_high_precision_tensor_stats
+            and not high_precision_tensor_provided
+        ):
             raise ValueError(
-                f"Stat {stat} needs high precision tensor to be provided. \
-                This feature cannot be used for weight tensor with fp8 model parameters."
+                f"Stat {stat} needs high precision tensor to be provided. This"
+                " feature cannot be used for weight tensor with fp8 model parameters."
             )

         if current_recipe == "" and recipe_from_stat == "":
             raise ValueError(
                 f"Stat {stat} does not contain a recipe name and the current recipe is not set."
@@ -292,7 +296,6 @@ def inspect_tensor(
             quantizer is not None
         ), "[NVTORCH INSPECT ERROR] LogFp8TensorStats cannot be run without low-precision recipe."

-
         quantized_tensor = rowwise_quantized_tensor
         assert isinstance(
             quantized_tensor, QuantizedTensor
@@ -300,7 +303,9 @@ def inspect_tensor(
         recipe_name = _get_recipe_name(quantizer)

         for stat in config["stats"]:
-            self.check_if_stat_is_supported(stat, recipe_name, high_precision_tensor_provided=tensor is not None)
+            self.check_if_stat_is_supported(
+                stat, recipe_name, high_precision_tensor_provided=tensor is not None
+            )

         options = (
             config.get("start_step", None),

From ecdc7273f2133f8d16983063a85db85e7363bc8b Mon Sep 17 00:00:00 2001
From: Pawel Gadzinski
Date: Mon, 1 Sep 2025 11:02:15 +0000
Subject: [PATCH 05/20] fix

Signed-off-by: Pawel Gadzinski
---
 .../debug/features/log_fp8_tensor_stats.py | 9 ++++-----
 transformer_engine/debug/pytorch/debug_quantization.py | 7 +++----
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/transformer_engine/debug/features/log_fp8_tensor_stats.py b/transformer_engine/debug/features/log_fp8_tensor_stats.py
index 4a9416568e..04020476fe 100644
--- a/transformer_engine/debug/features/log_fp8_tensor_stats.py
+++ b/transformer_engine/debug/features/log_fp8_tensor_stats.py
@@ -111,8 +111,7 @@ class LogFp8TensorStats(BaseLogTensorStats):
     - scale_inv_max - maximum of the inverse of the scaling factors,
     - mse - mean squared error of the quantized tensor and the original tensor = sum((quantized_tensor - original_tensor)**2) / num_elements,

-    If stats are collected for weight tensors and fp8 model parameters are used, only "scale_inv_min", "scale_inv_max" are supported.
-    The other stats need high precision tensor to be computed.
+    When collecting stats for the weight tensor with FP8 model parameters enabled, only "scale_inv_min" and "scale_inv_max" are available. All other statistics require access to the high precision tensor.

     tensors/tensors_struct: List[str]
         list of tensors to log
@@ -167,10 +166,10 @@ def check_if_stat_is_supported(
             and not high_precision_tensor_provided
         ):
             raise ValueError(
-                f"Stat {stat} needs high precision tensor to be provided. This"
-                " feature cannot be used for weight tensor with fp8 model parameters."
+                f"Stat {stat} requires a high precision tensor to be provided. \
+                This feature is not supported for weight tensors when using fp8 model parameters."
             )
-
+
         if current_recipe == "" and recipe_from_stat == "":
             raise ValueError(
                 f"Stat {stat} does not contain a recipe name and the current recipe is not set."
diff --git a/transformer_engine/debug/pytorch/debug_quantization.py b/transformer_engine/debug/pytorch/debug_quantization.py
index 3fa548e75f..5dd2d42565 100644
--- a/transformer_engine/debug/pytorch/debug_quantization.py
+++ b/transformer_engine/debug/pytorch/debug_quantization.py
@@ -559,10 +559,9 @@ def set_usage(self, rowwise: bool = None, columnwise: bool = None):
     def wrap_quantized_tensor(self, tensor: QuantizedTensor):
         """Wraps the quantized tensor with the debug quantizer."""

-        assert API_CALL_MODIFY not in [
-            self.rowwise_tensor_plan,
-            self.columnwise_tensor_plan,
-        ], "Modify tensor is not allowed in debug mode for weight if fp8 model parameters are used."
+        assert self.rowwise_tensor_plan == HIGH_PRECISION and self.columnwise_tensor_plan == HIGH_PRECISION, \
+            "[NVTORCH INSPECT ERROR] Weight tensor with fp8 model paramters enabled cannot be modified by any feature."
+

         self._call_inspect_tensor_api(None, tensor, tensor)

From b0162af18e08a845c7e079da29c0c21d048ee7b4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 1 Sep 2025 11:02:46 +0000
Subject: [PATCH 06/20] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../debug/features/log_fp8_tensor_stats.py |  7 ++++---
 transformer_engine/debug/pytorch/debug_quantization.py | 10 +++++++---
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/transformer_engine/debug/features/log_fp8_tensor_stats.py b/transformer_engine/debug/features/log_fp8_tensor_stats.py
index 04020476fe..a6e30fc07f 100644
--- a/transformer_engine/debug/features/log_fp8_tensor_stats.py
+++ b/transformer_engine/debug/features/log_fp8_tensor_stats.py
@@ -166,10 +166,11 @@ def check_if_stat_is_supported(
             and not high_precision_tensor_provided
         ):
             raise ValueError(
-                f"Stat {stat} requires a high precision tensor to be provided. \
-                This feature is not supported for weight tensors when using fp8 model parameters."
+                f"Stat {stat} requires a high precision tensor to be provided. "
+                " This feature is not supported for weight tensors when using fp8 model"
+                " parameters."
             )
-
+
         if current_recipe == "" and recipe_from_stat == "":
             raise ValueError(
                 f"Stat {stat} does not contain a recipe name and the current recipe is not set."
diff --git a/transformer_engine/debug/pytorch/debug_quantization.py b/transformer_engine/debug/pytorch/debug_quantization.py
index 5dd2d42565..cf7eb812b1 100644
--- a/transformer_engine/debug/pytorch/debug_quantization.py
+++ b/transformer_engine/debug/pytorch/debug_quantization.py
@@ -559,9 +559,13 @@ def set_usage(self, rowwise: bool = None, columnwise: bool = None):
     def wrap_quantized_tensor(self, tensor: QuantizedTensor):
         """Wraps the quantized tensor with the debug quantizer."""

-        assert self.rowwise_tensor_plan == HIGH_PRECISION and self.columnwise_tensor_plan == HIGH_PRECISION, \
-            "[NVTORCH INSPECT ERROR] Weight tensor with fp8 model paramters enabled cannot be modified by any feature."
-
+        assert (
+            self.rowwise_tensor_plan == HIGH_PRECISION
+            and self.columnwise_tensor_plan == HIGH_PRECISION
+        ), (
+            "[NVTORCH INSPECT ERROR] Weight tensor with fp8 model paramters enabled cannot be"
+            " modified by any feature."
+        )

         self._call_inspect_tensor_api(None, tensor, tensor)

From 20515c7d82886d9993a623f4d6033b573aaad426 Mon Sep 17 00:00:00 2001
From: Pawel Gadzinski
Date: Mon, 1 Sep 2025 12:54:16 +0000
Subject: [PATCH 07/20] fixes

Signed-off-by: Pawel Gadzinski
---
 tests/pytorch/debug/test_log.py | 44 ++++++++++++++++++-
 .../debug/features/utils/stats_buffer.py | 19 +++++---
 .../debug/pytorch/debug_quantization.py | 11 +++--
 3 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/tests/pytorch/debug/test_log.py b/tests/pytorch/debug/test_log.py
index ca8e10ad69..e32f5eeee6 100644
--- a/tests/pytorch/debug/test_log.py
+++ b/tests/pytorch/debug/test_log.py
@@ -78,6 +78,8 @@
 all_stats.append("fp8_delayed_scaling_overflows%")  # only delayed-scaling supports overflows%

+# remove all contai
+
 @contextlib.contextmanager
 def debug_session(config_str: str, feature_dirs):
@@ -117,7 +119,6 @@ def read_log(log_dir: str) -> str:
     with open(stat_path, "r") as f:
         return f.read()
-
 def test_sanity(feature_dirs):
     if not fp8_available:
         pytest.skip(reason_for_no_fp8)
@@ -140,6 +141,47 @@
     for stat in all_stats:
         assert stat in output, f"Stat {stat} not found in output"

+LOG_FP8_MODEL_PARAMETERS_CONFIG_BASE = """
+log:
+  layers:
+    layer_name_regex_pattern: .*
+  enabled:
+    True
+  transformer_engine:
+    LogTensorStats:
+      enabled:
+        True
+      stats: [min]
+      tensors: [weight, activation]
+      freq: 1
+    LogFp8TensorStats:
+      enabled:
+        True
+      tensors_struct:
+        - tensor: activation
+          stats: [scale_inv_min, scale_inv_max, underflows%]
+        - tensor: weight
+          stats: [scale_inv_min, scale_inv_max]
+      freq: 1
+"""
+
+def test_sanity_log_fp8_model_parameters(feature_dirs):
+    if not fp8_available:
+        pytest.skip(reason_for_no_fp8)
+
+    with debug_session(LOG_FP8_MODEL_PARAMETERS_CONFIG_BASE, feature_dirs) as log_dir:
+        with te.fp8_model_init():
+            model = te.Linear(128, 128, params_dtype=torch.bfloat16)
+        inp = torch.zeros(128, 128, dtype=torch.bfloat16).cuda()
+        for _ in range(10):
+            with te.fp8_autocast(fp8_recipe=recipe.DelayedScaling()):
+                output = model(inp)
+            loss = output.sum()
+            loss.backward()
+            debug_api.step()
+    output = read_log(log_dir)
+    assert output, "Output is empty"
+
+    TEDebugState._reset()

 fp8_recipes = [
     recipe.MXFP8BlockScaling(),
     recipe.DelayedScaling(),
diff --git a/transformer_engine/debug/features/utils/stats_buffer.py b/transformer_engine/debug/features/utils/stats_buffer.py
index f07602d234..556eb53643 100644
--- a/transformer_engine/debug/features/utils/stats_buffer.py
+++ b/transformer_engine/debug/features/utils/stats_buffer.py
@@ -90,12 +90,19 @@ def feed(self, tensor, iteration, aux_dict=None):
         if self.modified[0] and not self.reduce_within_microbatch:
             return

-        if (
-            tensor.numel() == 0
-            if hasattr(tensor, "numel")
-            else all((t is None or t.numel() == 0) for t in tensor.get_data_tensors())
-        ):
-            return
+        if tensor is not None:
+            # tensor can be None if we compute fp8 stats for weight and fp8 model parameters are used;
+            # then high precision is not provided and the quantized tensor from aux_dict is used.
+
+            # This condition prevents computation of stats for an empty tensor.
+            # This will not happen for weight - since weight is the only case when tensor can be None,
+            # we do not need to check a similar condition for weight.
+            if (
+                tensor.numel() == 0
+                if hasattr(tensor, "numel")
+                else all((t is None or t.numel() == 0) for t in tensor.get_data_tensors())
+            ):
+                return

         # save stats for tensor to tmp buffer
         for stat_name in self.stats_to_compute:
diff --git a/transformer_engine/debug/pytorch/debug_quantization.py b/transformer_engine/debug/pytorch/debug_quantization.py
index cf7eb812b1..3540785fc3 100644
--- a/transformer_engine/debug/pytorch/debug_quantization.py
+++ b/transformer_engine/debug/pytorch/debug_quantization.py
@@ -262,7 +262,7 @@ def _call_inspect_tensor_api(
             "rowwise_quantized_tensor": rowwise_gemm_tensor,
             "quantizer": self.parent_quantizer,
         }
-        if tensor is not None and self.inspect_tensor_enabled:
+        if self.inspect_tensor_enabled:
             debug_api.transformer_engine.inspect_tensor(**args)

         if self.output_tensor:
@@ -557,11 +557,14 @@ def set_usage(self, rowwise: bool = None, columnwise: bool = None):
             self._update_parent_quantizer_usage()

     def wrap_quantized_tensor(self, tensor: QuantizedTensor):
-        """Wraps the quantized tensor with the debug quantizer."""
+        """
+        Wraps the quantized tensor with the debug quantizer.
+        It is used for weight tensors when fp8 model parameters are enabled.
+        """

         assert (
-            self.rowwise_tensor_plan == HIGH_PRECISION
-            and self.columnwise_tensor_plan == HIGH_PRECISION
+            self.rowwise_tensor_plan == STANDARD_FP8_QUANTIZE
+            and self.columnwise_tensor_plan == STANDARD_FP8_QUANTIZE
         ), (
             "[NVTORCH INSPECT ERROR] Weight tensor with fp8 model paramters enabled cannot be"
             " modified by any feature."

From 26926ccbfcd8483d3a5fe298018e13e047e9e5d7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 1 Sep 2025 12:54:59 +0000
Subject: [PATCH 08/20] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/pytorch/debug/test_log.py | 12 ++++++++----
 .../debug/pytorch/debug_quantization.py |  4 ++--
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/tests/pytorch/debug/test_log.py b/tests/pytorch/debug/test_log.py
index e32f5eeee6..e9fe2e9fab 100644
--- a/tests/pytorch/debug/test_log.py
+++ b/tests/pytorch/debug/test_log.py
@@ -119,6 +119,7 @@ def read_log(log_dir: str) -> str:
     with open(stat_path, "r") as f:
         return f.read()

+
 def test_sanity(feature_dirs):
     if not fp8_available:
         pytest.skip(reason_for_no_fp8)
@@ -141,21 +142,22 @@
     for stat in all_stats:
         assert stat in output, f"Stat {stat} not found in output"

+
 LOG_FP8_MODEL_PARAMETERS_CONFIG_BASE = """
 log:
   layers:
     layer_name_regex_pattern: .*
-  enabled:
+  enabled:
     True
   transformer_engine:
     LogTensorStats:
-      enabled:
+      enabled:
         True
       stats: [min]
       tensors: [weight, activation]
       freq: 1
     LogFp8TensorStats:
-      enabled:
+      enabled:
         True
       tensors_struct:
         - tensor: activation
@@ -165,10 +167,11 @@
           stats: [scale_inv_min, scale_inv_max]
       freq: 1
 """

+
 def test_sanity_log_fp8_model_parameters(feature_dirs):
     if not fp8_available:
         pytest.skip(reason_for_no_fp8)
-
+
     with debug_session(LOG_FP8_MODEL_PARAMETERS_CONFIG_BASE, feature_dirs) as log_dir:
@@ -183,6 +186,7 @@
     assert output, "Output is empty"
     TEDebugState._reset()

+
 fp8_recipes = [
     recipe.MXFP8BlockScaling(),
     recipe.DelayedScaling(),
diff --git a/transformer_engine/debug/pytorch/debug_quantization.py b/transformer_engine/debug/pytorch/debug_quantization.py
index 3540785fc3..a0f5cc2a27 100644
--- a/transformer_engine/debug/pytorch/debug_quantization.py
+++ b/transformer_engine/debug/pytorch/debug_quantization.py
@@ -558,8 +558,8 @@ def set_usage(self, rowwise: bool = None, columnwise: bool = None):

     def wrap_quantized_tensor(self, tensor: QuantizedTensor):
         """
-        Wraps the quantized tensor with the debug quantizer.
-        It is used for weight tensors when fp8 model parameters are enabled.
+        Wraps the quantized tensor with the debug quantizer.
+        It is used for weight tensors when fp8 model parameters are enabled.
         """

From 39d500a017291f56d26947a4ec7aecf61229bf9d Mon Sep 17 00:00:00 2001
From: Pawel Gadzinski
Date: Mon, 1 Sep 2025 13:06:11 +0000
Subject: [PATCH 09/20] fix

Signed-off-by: Pawel Gadzinski
---
 tests/pytorch/debug/test_log.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/pytorch/debug/test_log.py b/tests/pytorch/debug/test_log.py
index e9fe2e9fab..c0e916350c 100644
--- a/tests/pytorch/debug/test_log.py
+++ b/tests/pytorch/debug/test_log.py
@@ -78,8 +78,6 @@
 all_stats.append("fp8_delayed_scaling_overflows%")  # only delayed-scaling supports overflows%

-# remove all contai
-
 @contextlib.contextmanager
 def debug_session(config_str: str, feature_dirs):
@@ -154,7 +152,7 @@
       enabled:
         True
       stats: [min]
-      tensors: [weight, activation]
+      tensors: [weight, activation, gradient]
       freq: 1
     LogFp8TensorStats:
       enabled:
@@ -169,6 +167,14 @@

 def test_sanity_log_fp8_model_parameters(feature_dirs):
+    """
+    Tests logging stats when model parameters are in fp8.
+    It tests 3 things:
+    - LogTensorStats for weight tensor should work without change,
+    - LogTensorStats and LogFp8TensorStats for non-weight tensors should work without change,
+    - LogFp8TensorStats should support scale_inv_min, scale_inv_max for weight tensor.
""" if not fp8_available: From 82e3f2cc0e53e8154f71537ebc6b3901ff0aee2a Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Mon, 15 Sep 2025 10:15:06 +0200 Subject: [PATCH 11/20] fix Signed-off-by: Pawel Gadzinski --- tests/pytorch/debug/test_log.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pytorch/debug/test_log.py b/tests/pytorch/debug/test_log.py index df9aac2194..4434157d02 100644 --- a/tests/pytorch/debug/test_log.py +++ b/tests/pytorch/debug/test_log.py @@ -179,7 +179,7 @@ def test_sanity_log_fp8_model_parameters(feature_dirs): pytest.skip(reason_for_no_fp8) with debug_session(LOG_FP8_MODEL_PARAMETERS_CONFIG_BASE, feature_dirs) as log_dir: - with te.fp8_model_init(): + with te.fp8_model_init(recipe=recipe.DelayedScaling()): model = te.Linear(128, 128, params_dtype=torch.bfloat16) inp = torch.zeros(128, 128, dtype=torch.bfloat16).cuda() for _ in range(10): From c9867087effe22c59b4bfdc650fb10e52f9dff14 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Fri, 24 Oct 2025 13:50:30 +0000 Subject: [PATCH 12/20] fix Signed-off-by: Pawel Gadzinski --- transformer_engine/debug/features/log_fp8_tensor_stats.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/transformer_engine/debug/features/log_fp8_tensor_stats.py b/transformer_engine/debug/features/log_fp8_tensor_stats.py index a6e30fc07f..f0a158b2ef 100644 --- a/transformer_engine/debug/features/log_fp8_tensor_stats.py +++ b/transformer_engine/debug/features/log_fp8_tensor_stats.py @@ -111,7 +111,9 @@ class LogFp8TensorStats(BaseLogTensorStats): - scale_inv_max - maximum of the inverse of the scaling factors, - mse - mean squared error of the quantized tensor and the original tensor = sum((quantized_tensor - original_tensor)**2) / num_elements, - When collecting stats for the weight tensor with FP8 model parameters enabled, only "scale_inv_min" and "scale_inv_max" are available. All other statistics require access to the high precision tensor. + When collecting stats for the weight tensor with FP8 model parameters enabled, + only "scale_inv_min" and "scale_inv_max" are available. + All other statistics require access to the high precision tensor. tensors/tensors_struct: List[str] list of tensors to log From 3740ff6fe7e429d3597fc60aaa844b8e2244f929 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Oct 2025 13:52:16 +0000 Subject: [PATCH 13/20] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/debug/features/log_fp8_tensor_stats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformer_engine/debug/features/log_fp8_tensor_stats.py b/transformer_engine/debug/features/log_fp8_tensor_stats.py index ceb0a8a579..8c6b17d57d 100644 --- a/transformer_engine/debug/features/log_fp8_tensor_stats.py +++ b/transformer_engine/debug/features/log_fp8_tensor_stats.py @@ -111,8 +111,8 @@ class LogFp8TensorStats(BaseLogTensorStats): - scale_inv_max - maximum of the inverse of the scaling factors, - mse - mean squared error of the quantized tensor and the original tensor = sum((quantized_tensor - original_tensor)**2) / num_elements, - When collecting stats for the weight tensor with FP8 model parameters enabled, - only "scale_inv_min" and "scale_inv_max" are available. + When collecting stats for the weight tensor with FP8 model parameters enabled, + only "scale_inv_min" and "scale_inv_max" are available. 
     All other statistics require access to the high precision tensor.

     tensors/tensors_struct: List[str]

From 1aa421ce0e1d9c18f8b3ce84907c17ec15b5b5a0 Mon Sep 17 00:00:00 2001
From: Pawel Gadzinski
Date: Fri, 31 Oct 2025 10:08:52 +0000
Subject: [PATCH 14/20] fix

Signed-off-by: Pawel Gadzinski
---
 transformer_engine/debug/features/log_fp8_tensor_stats.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/transformer_engine/debug/features/log_fp8_tensor_stats.py b/transformer_engine/debug/features/log_fp8_tensor_stats.py
index 8c6b17d57d..c285d45ee1 100644
--- a/transformer_engine/debug/features/log_fp8_tensor_stats.py
+++ b/transformer_engine/debug/features/log_fp8_tensor_stats.py
@@ -168,9 +168,9 @@ def check_if_stat_is_supported(
             and not high_precision_tensor_provided
         ):
             raise ValueError(
-                f"Stat {stat} requires a high precision tensor to be provided. "
-                " This feature is not supported for weight tensors when using fp8 model"
-                " parameters."
+                f"Stat {stat} requires a high precision tensor to be provided."
+                "This feature is not supported for weight tensors when using fp8 model"
+                "parameters."
             )

From e0ba7e7eb0e3c780068769fe4f8dd9dc9a4ec4cf Mon Sep 17 00:00:00 2001
From: Pawel Gadzinski
Date: Tue, 4 Nov 2025 15:13:47 +0000
Subject: [PATCH 15/20] fix

Signed-off-by: Pawel Gadzinski
---
 transformer_engine/debug/features/log_fp8_tensor_stats.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/transformer_engine/debug/features/log_fp8_tensor_stats.py b/transformer_engine/debug/features/log_fp8_tensor_stats.py
index c285d45ee1..4541ab0018 100644
--- a/transformer_engine/debug/features/log_fp8_tensor_stats.py
+++ b/transformer_engine/debug/features/log_fp8_tensor_stats.py
@@ -168,8 +168,8 @@ def check_if_stat_is_supported(
             and not high_precision_tensor_provided
         ):
             raise ValueError(
-                f"Stat {stat} requires a high precision tensor to be provided."
-                "This feature is not supported for weight tensors when using fp8 model"
+                f"Stat {stat} requires a high precision tensor to be provided. "
+                "This feature is not supported for weight tensors when using fp8 model "
                 "parameters."
             )

From a3ec90d9cc86c94f973999a4401aebb79190ee58 Mon Sep 17 00:00:00 2001
From: Pawel Gadzinski
Date: Tue, 18 Nov 2025 16:51:04 +0000
Subject: [PATCH 16/20] fix

Signed-off-by: Pawel Gadzinski
---
 transformer_engine/pytorch/module/linear.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py
index b3f8165a77..f7307b8d3f 100644
--- a/transformer_engine/pytorch/module/linear.py
+++ b/transformer_engine/pytorch/module/linear.py
@@ -253,7 +253,7 @@ def forward(
         if fp8 or debug:
             # Configure quantizer
             # No need to set the quantizer states if weight is already quantized
-            if weight_quantizer is not None and not isinstance(weight, QuantizedTensor):
+            if weight_quantizer is not None and not isinstance(weight, QuantizedTensor) or debug:
                 columnwise_usage = is_grad_enabled and inp.requires_grad
                 if not columnwise_usage:
                     columnwise_usage = (

From 91e0332ebd0aa2f3af21b4fd16105fdc1a16c451 Mon Sep 17 00:00:00 2001
From: Pawel Gadzinski
Date: Tue, 18 Nov 2025 16:59:57 +0000
Subject: [PATCH 17/20] fix

Signed-off-by: Pawel Gadzinski
---
 transformer_engine/pytorch/module/linear.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py
index f7307b8d3f..d60b32fe27 100644
--- a/transformer_engine/pytorch/module/linear.py
+++ b/transformer_engine/pytorch/module/linear.py
@@ -253,6 +253,7 @@ def forward(
         if fp8 or debug:
             # Configure quantizer
             # No need to set the quantizer states if weight is already quantized
+            # in debug mode we create the quantizer every iteration, thus we need to set the quantizer states
            if weight_quantizer is not None and not isinstance(weight, QuantizedTensor) or debug:

From 863de121b97528032b337a611cb6a512e65d9809 Mon Sep 17 00:00:00 2001
From: Pawel Gadzinski
Date: Thu, 11 Dec 2025 12:53:14 +0000
Subject: [PATCH 18/20] Fix weight quantizer logic in debug mode

Add 'or debug' / 'and not debug' conditions to weight quantizer
configuration in linear.py, grouped_linear.py, layernorm_linear.py,
and layernorm_mlp.py. In debug mode, quantizers are recreated every
iteration, so we need to set quantizer states even when weights are
already quantized.

Signed-off-by: Pawel Gadzinski
---
 transformer_engine/pytorch/module/grouped_linear.py | 8 +++-----
 transformer_engine/pytorch/module/layernorm_linear.py | 3 ++-
 transformer_engine/pytorch/module/layernorm_mlp.py | 5 +++--
 transformer_engine/pytorch/module/linear.py | 2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py
index 004c95c372..42edce2cc2 100644
--- a/transformer_engine/pytorch/module/grouped_linear.py
+++ b/transformer_engine/pytorch/module/grouped_linear.py
@@ -121,8 +121,9 @@ def forward(
             and not in_fp8_activation_recompute_phase()
         )
         # No need to set the quantizer states if weight is already quantized
-        if weight_quantizers[0] is not None and not isinstance(
-            weights[0], QuantizedTensorStorage
+        # in debug mode we create the quantizer every iteration, thus we need to set the quantizer states
+        if weight_quantizers[0] is not None and (
+            not isinstance(weights[0], QuantizedTensorStorage) or debug
         ):
             for weight_quantizer in weight_quantizers:
                 weight_quantizer.set_usage(rowwise=True, columnwise=columnwise_usage)
@@ -798,9 +799,6 @@ def forward(
             debug = False

         quantizers = self._get_quantizers()
-        if isinstance(weight_tensors, QuantizedTensorStorage):
-            raise RuntimeError("FP8 weights are not supported in debug mode.")
-
         (
             input_quantizers,
             weight_quantizers,
diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py
index 667c199c49..c5c622f00a 100644
--- a/transformer_engine/pytorch/module/layernorm_linear.py
+++ b/transformer_engine/pytorch/module/layernorm_linear.py
@@ -296,7 +296,8 @@ def forward(

         # Configure quantizer
         # If weight is already quantized, no need to set quantizer states
-        if is_weight_param_quantized:
+        # in debug mode we create the quantizer every iteration, thus we need to set the quantizer states
+        if is_weight_param_quantized and not debug:
             weight_quantizer = weight._quantizer
         elif weight_quantizer is not None:
             weight_quantizer.set_usage(rowwise=True, columnwise=is_grad_enabled)
diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py
index 56e050fe88..1fd8334e95 100644
--- a/transformer_engine/pytorch/module/layernorm_mlp.py
+++ b/transformer_engine/pytorch/module/layernorm_mlp.py
@@ -472,12 +472,13 @@ def _forward(
         # FP8 cast to workspace buffer
         update_workspace = is_first_microbatch is None or is_first_microbatch
         # No need to set the quantizer states if weights are already quantized
-        if isinstance(fc1_weight, QuantizedTensorStorage):
+        # in debug mode we create the quantizer every iteration, thus we need to set the quantizer states
+        if isinstance(fc1_weight, QuantizedTensorStorage) and not debug:
             fc1_weight_quantizer = fc1_weight._quantizer
         elif fc1_weight_quantizer is not None:
             fc1_weight_quantizer.set_usage(rowwise=True, columnwise=is_grad_enabled)
-        if isinstance(fc2_weight, QuantizedTensorStorage):
+        if isinstance(fc2_weight, QuantizedTensorStorage) and not debug:
             fc2_weight_quantizer = fc2_weight._quantizer
         elif fc2_weight_quantizer is not None:
             fc2_weight_quantizer.set_usage(rowwise=True, columnwise=is_grad_enabled)
diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py
index f4a8ec2da0..341278ad1c 100644
--- a/transformer_engine/pytorch/module/linear.py
+++ b/transformer_engine/pytorch/module/linear.py
@@ -254,7 +254,7 @@ def forward(
             # Configure quantizer
             # No need to set the quantizer states if weight is already quantized
             # in debug mode we create the quantizer every iteration, thus we need to set the quantizer states
-            if weight_quantizer is not None and not isinstance(weight, QuantizedTensor) or debug:
+            if weight_quantizer is not None and (not isinstance(weight, QuantizedTensor) or debug):
                 columnwise_usage = is_grad_enabled and inp.requires_grad
                 if not columnwise_usage:
                     columnwise_usage = (

From ae85aefee674279ea8065b618814d90cea38a82e Mon Sep 17 00:00:00 2001
From: Pawel Gadzinski
Date: Mon, 16 Feb 2026 11:11:48 +0000
Subject: [PATCH 19/20] fix

Signed-off-by: Pawel Gadzinski
---
 transformer_engine/debug/pytorch/debug_quantization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/transformer_engine/debug/pytorch/debug_quantization.py b/transformer_engine/debug/pytorch/debug_quantization.py
index 7590679d80..565e1f9541 100644
--- a/transformer_engine/debug/pytorch/debug_quantization.py
+++ b/transformer_engine/debug/pytorch/debug_quantization.py
@@ -566,8 +566,8 @@ def wrap_quantized_tensor(self, tensor: QuantizedTensor):
         """

         assert (
-            self.rowwise_tensor_plan == STANDARD_FP8_QUANTIZE
-            and self.columnwise_tensor_plan == STANDARD_FP8_QUANTIZE
+            self.rowwise_tensor_plan == STANDARD_QUANTIZE
+            and self.columnwise_tensor_plan == STANDARD_QUANTIZE
         ), (
             "[NVTORCH INSPECT ERROR] Weight tensor with fp8 model paramters enabled cannot be"
             " modified by any feature."

From fca78e9d52d7c5049961d4300c4286664f314a25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Gadzi=C5=84ski?= <62263673+pggPL@users.noreply.github.com>
Date: Mon, 16 Feb 2026 12:32:20 +0100
Subject: [PATCH 20/20] Update transformer_engine/debug/pytorch/debug_quantization.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
Signed-off-by: Paweł Gadziński <62263673+pggPL@users.noreply.github.com>
---
 transformer_engine/debug/pytorch/debug_quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformer_engine/debug/pytorch/debug_quantization.py b/transformer_engine/debug/pytorch/debug_quantization.py
index 565e1f9541..5624970547 100644
--- a/transformer_engine/debug/pytorch/debug_quantization.py
+++ b/transformer_engine/debug/pytorch/debug_quantization.py
@@ -569,7 +569,7 @@ def wrap_quantized_tensor(self, tensor: QuantizedTensor):
             self.rowwise_tensor_plan == STANDARD_QUANTIZE
             and self.columnwise_tensor_plan == STANDARD_QUANTIZE
         ), (
-            "[NVTORCH INSPECT ERROR] Weight tensor with fp8 model paramters enabled cannot be"
+            "[NVTORCH INSPECT ERROR] Weight tensor with fp8 model parameters enabled cannot be"
             " modified by any feature."
         )
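
Taken together, the series makes the debug tooling usable with weights kept in FP8. Below is a minimal usage sketch distilled from the test added in PATCH 07 and fixed in PATCH 11; the config file path is an illustrative assumption (it would contain the LogTensorStats/LogFp8TensorStats sections shown in LOG_FP8_MODEL_PARAMETERS_CONFIG_BASE), and the initialize() call follows the nvdlfw_inspect API used by the tests.

    import torch
    import transformer_engine.pytorch as te
    from transformer_engine.common import recipe
    import nvdlfw_inspect.api as debug_api

    # Hypothetical config path; the file would carry the logging feature config
    # shown in LOG_FP8_MODEL_PARAMETERS_CONFIG_BASE above.
    debug_api.initialize(config_file="debug_config.yaml")

    # Weights are created directly in FP8; the debug quantizer now wraps them
    # via wrap_quantized_tensor() instead of raising
    # "FP8 weights are not supported in debug mode."
    with te.fp8_model_init(recipe=recipe.DelayedScaling()):
        model = te.Linear(128, 128, params_dtype=torch.bfloat16)

    inp = torch.zeros(128, 128, dtype=torch.bfloat16, device="cuda")
    for _ in range(10):
        with te.fp8_autocast(fp8_recipe=recipe.DelayedScaling()):
            out = model(inp)
        out.sum().backward()
        debug_api.step()  # advance the logging iteration so freq-based stats are flushed

For the weight tensor in this setup, LogFp8TensorStats can only report scale_inv_min and scale_inv_max; stats such as underflows% or mse require the high precision tensor, which does not exist when parameters live in FP8.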