From c84dd92a81113ca02f6e6c9e20946b3b5819e9ac Mon Sep 17 00:00:00 2001
From: Tim Moon <tmoon@nvidia.com>
Date: Wed, 28 May 2025 21:56:26 +0000
Subject: [PATCH 1/3] Use FP16 tols for tests with TF32

Signed-off-by: Tim Moon <tmoon@nvidia.com>
---
 tests/pytorch/distributed/run_numerics.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py
index c1edb74b17..3aa3b7be88 100644
--- a/tests/pytorch/distributed/run_numerics.py
+++ b/tests/pytorch/distributed/run_numerics.py
@@ -185,7 +185,8 @@ def _get_tolerances(dtype):
     if dtype == torch.bfloat16:
         return {"rtol": 1.6e-2, "atol": 1e-5}
     if dtype == torch.float32:
-        return {"rtol": 1e-4, "atol": 1e-4}
+        # TF32 has the same number of mantissa bits (10) as FP16, so FP32 tests that may run with TF32 use the FP16 tolerances
+        return {"rtol": 1e-3, "atol": 1e-5}
     raise ValueError(f"Unsupported dtype ({dtype})")
 
 

From 99d4cbfa7378961db1cc21e203981ac2b641e3ac Mon Sep 17 00:00:00 2001
From: Tim Moon <tmoon@nvidia.com>
Date: Wed, 4 Jun 2025 20:41:31 +0000
Subject: [PATCH 2/3] Use uniform init instead of constant init

Signed-off-by: Tim Moon <tmoon@nvidia.com>
---
 tests/pytorch/distributed/run_numerics.py  | 16 ++++++++--------
 tests/pytorch/distributed/test_numerics.py |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py
index 3aa3b7be88..682684d855 100644
--- a/tests/pytorch/distributed/run_numerics.py
+++ b/tests/pytorch/distributed/run_numerics.py
@@ -161,8 +161,8 @@ def backward(ctx, grad_output):
     return torch.cat(gathered, dim=dim)
 
 
-def _constant(tensor):
-    return nn.init.constant_(tensor, 0.5)
+def _init_uniform(tensor):
+    return nn.init.uniform_(tensor, a=0.0, b=0.05)
 
 
 def dist_print(msg, src=None, end="\n", error=False):
@@ -512,7 +512,7 @@ def test_linear():
     kwargs_list = [
         {},
         {"bias": False},
-        {"init_method": _constant},
+        {"init_method": _init_uniform},
         {"fuse_wgrad_accumulation": True},
         {"return_bias": True},
         {"params_dtype": torch.float16},
@@ -688,7 +688,7 @@ def test_layernorm_linear():
     kwargs_list = [
         {},
         {"bias": False},
-        {"init_method": _constant},
+        {"init_method": _init_uniform},
         {"fuse_wgrad_accumulation": True},
         {"return_bias": True},
         {"params_dtype": torch.float16},
@@ -792,8 +792,8 @@ def _test_layernorm_mlp(set_parallel_mode=None, sequence_parallel=False, **kwarg
 def test_layernorm_mlp():
     kwargs_list = [
         {},
-        {"init_method": _constant},
-        {"output_layer_init_method": _constant},
+        {"init_method": _init_uniform},
+        {"output_layer_init_method": _init_uniform},
         {"normalization": "RMSNorm"},
         {"zero_centered_gamma": True},
         {"bias": False},
@@ -882,8 +882,8 @@ def test_transformer_layer():
     kwargs_list = [
         {},
         {"num_gqa_groups": 4},
-        {"init_method": _constant},
-        {"output_layer_init_method": _constant},
+        {"init_method": _init_uniform},
+        {"output_layer_init_method": _init_uniform},
         {"apply_residual_connection_post_layernorm": True},
         {"output_layernorm": True},
         {"parallel_attention_mlp": True},
diff --git a/tests/pytorch/distributed/test_numerics.py b/tests/pytorch/distributed/test_numerics.py
index 632f50e90a..1ff5aff997 100644
--- a/tests/pytorch/distributed/test_numerics.py
+++ b/tests/pytorch/distributed/test_numerics.py
@@ -56,7 +56,7 @@ def test_distributed(quantization):
     if quantization == "fp8" and not fp8_available:
         pytest.skip(reason_for_no_fp8)
     if quantization == "fp8_cs" and not fp8_available:
-        pytest.skip(fp8_available)
+        pytest.skip(reason_for_no_fp8)
     if quantization == "mxfp8" and not mxfp8_available:
         pytest.skip(reason_for_no_mxfp8)
     if quantization == "fp8_block_scaling" and not fp8_block_scaling_available:

From ed3bb12afed5a7642a0a824ddfe2b4024b04c2b0 Mon Sep 17 00:00:00 2001
From: Tim Moon <tmoon@nvidia.com>
Date: Wed, 4 Jun 2025 23:56:13 +0000
Subject: [PATCH 3/3] Revert constant init test, but reduce value

Signed-off-by: Tim Moon <tmoon@nvidia.com>
---
 tests/pytorch/distributed/run_numerics.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py
index 682684d855..c01642bdfe 100644
--- a/tests/pytorch/distributed/run_numerics.py
+++ b/tests/pytorch/distributed/run_numerics.py
@@ -47,11 +47,6 @@
     )
 
 
-# Disable TF32
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.backends.cudnn.allow_tf32 = False
-
-
 # Quantization recipe setup
 def quantization_recipe() -> Recipe:
     if QUANTIZATION == "fp8":
@@ -161,8 +156,8 @@ def backward(ctx, grad_output):
     return torch.cat(gathered, dim=dim)
 
 
-def _init_uniform(tensor):
-    return nn.init.uniform_(tensor, a=0.0, b=0.05)
+def _constant(tensor):
+    return nn.init.constant_(tensor, 0.05)
 
 
 def dist_print(msg, src=None, end="\n", error=False):
@@ -512,7 +507,7 @@ def test_linear():
     kwargs_list = [
         {},
         {"bias": False},
-        {"init_method": _init_uniform},
+        {"init_method": _constant},
         {"fuse_wgrad_accumulation": True},
         {"return_bias": True},
         {"params_dtype": torch.float16},
@@ -688,7 +683,7 @@ def test_layernorm_linear():
     kwargs_list = [
         {},
         {"bias": False},
-        {"init_method": _init_uniform},
+        {"init_method": _constant},
         {"fuse_wgrad_accumulation": True},
         {"return_bias": True},
         {"params_dtype": torch.float16},
@@ -792,8 +787,8 @@ def _test_layernorm_mlp(set_parallel_mode=None, sequence_parallel=False, **kwarg
 def test_layernorm_mlp():
     kwargs_list = [
         {},
-        {"init_method": _init_uniform},
-        {"output_layer_init_method": _init_uniform},
+        {"init_method": _constant},
+        {"output_layer_init_method": _constant},
         {"normalization": "RMSNorm"},
         {"zero_centered_gamma": True},
         {"bias": False},
@@ -882,8 +877,8 @@ def test_transformer_layer():
     kwargs_list = [
         {},
         {"num_gqa_groups": 4},
-        {"init_method": _init_uniform},
-        {"output_layer_init_method": _init_uniform},
+        {"init_method": _constant},
+        {"output_layer_init_method": _constant},
         {"apply_residual_connection_post_layernorm": True},
         {"output_layernorm": True},
         {"parallel_attention_mlp": True},