From c84dd92a81113ca02f6e6c9e20946b3b5819e9ac Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 28 May 2025 21:56:26 +0000 Subject: [PATCH 1/3] Use FP16 tols for tests with TF32 Signed-off-by: Tim Moon --- tests/pytorch/distributed/run_numerics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py index c1edb74b17..3aa3b7be88 100644 --- a/tests/pytorch/distributed/run_numerics.py +++ b/tests/pytorch/distributed/run_numerics.py @@ -185,7 +185,8 @@ def _get_tolerances(dtype): if dtype == torch.bfloat16: return {"rtol": 1.6e-2, "atol": 1e-5} if dtype == torch.float32: - return {"rtol": 1e-4, "atol": 1e-4} + # TF32 has same mantissa bits as FP16 + return {"rtol": 1e-3, "atol": 1e-5} raise ValueError(f"Unsupported dtype ({dtype})") From 99d4cbfa7378961db1cc21e203981ac2b641e3ac Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 4 Jun 2025 20:41:31 +0000 Subject: [PATCH 2/3] Use uniform init instead of constant init Signed-off-by: Tim Moon --- tests/pytorch/distributed/run_numerics.py | 16 ++++++++-------- tests/pytorch/distributed/test_numerics.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py index 3aa3b7be88..682684d855 100644 --- a/tests/pytorch/distributed/run_numerics.py +++ b/tests/pytorch/distributed/run_numerics.py @@ -161,8 +161,8 @@ def backward(ctx, grad_output): return torch.cat(gathered, dim=dim) -def _constant(tensor): - return nn.init.constant_(tensor, 0.5) +def _init_uniform(tensor): + return nn.init.uniform_(tensor, a=0.0, b=0.05) def dist_print(msg, src=None, end="\n", error=False): @@ -512,7 +512,7 @@ def test_linear(): kwargs_list = [ {}, {"bias": False}, - {"init_method": _constant}, + {"init_method": _init_uniform}, {"fuse_wgrad_accumulation": True}, {"return_bias": True}, {"params_dtype": torch.float16}, @@ -688,7 +688,7 @@ def test_layernorm_linear(): kwargs_list = [ {}, {"bias": False}, - {"init_method": _constant}, + {"init_method": _init_uniform}, {"fuse_wgrad_accumulation": True}, {"return_bias": True}, {"params_dtype": torch.float16}, @@ -792,8 +792,8 @@ def _test_layernorm_mlp(set_parallel_mode=None, sequence_parallel=False, **kwarg def test_layernorm_mlp(): kwargs_list = [ {}, - {"init_method": _constant}, - {"output_layer_init_method": _constant}, + {"init_method": _init_uniform}, + {"output_layer_init_method": _init_uniform}, {"normalization": "RMSNorm"}, {"zero_centered_gamma": True}, {"bias": False}, @@ -882,8 +882,8 @@ def test_transformer_layer(): kwargs_list = [ {}, {"num_gqa_groups": 4}, - {"init_method": _constant}, - {"output_layer_init_method": _constant}, + {"init_method": _init_uniform}, + {"output_layer_init_method": _init_uniform}, {"apply_residual_connection_post_layernorm": True}, {"output_layernorm": True}, {"parallel_attention_mlp": True}, diff --git a/tests/pytorch/distributed/test_numerics.py b/tests/pytorch/distributed/test_numerics.py index 632f50e90a..1ff5aff997 100644 --- a/tests/pytorch/distributed/test_numerics.py +++ b/tests/pytorch/distributed/test_numerics.py @@ -56,7 +56,7 @@ def test_distributed(quantization): if quantization == "fp8" and not fp8_available: pytest.skip(reason_for_no_fp8) if quantization == "fp8_cs" and not fp8_available: - pytest.skip(fp8_available) + pytest.skip(reason_for_no_fp8) if quantization == "mxfp8" and not mxfp8_available: pytest.skip(reason_for_no_mxfp8) if quantization == "fp8_block_scaling" and not fp8_block_scaling_available: From ed3bb12afed5a7642a0a824ddfe2b4024b04c2b0 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 4 Jun 2025 23:56:13 +0000 Subject: [PATCH 3/3] Revert constant init test, but reduce value Signed-off-by: Tim Moon --- tests/pytorch/distributed/run_numerics.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py index 682684d855..c01642bdfe 100644 --- a/tests/pytorch/distributed/run_numerics.py +++ b/tests/pytorch/distributed/run_numerics.py @@ -47,11 +47,6 @@ ) -# Disable TF32 -torch.backends.cuda.matmul.allow_tf32 = False -torch.backends.cudnn.allow_tf32 = False - - # Quantization recipe setup def quantization_recipe() -> Recipe: if QUANTIZATION == "fp8": @@ -161,8 +156,8 @@ def backward(ctx, grad_output): return torch.cat(gathered, dim=dim) -def _init_uniform(tensor): - return nn.init.uniform_(tensor, a=0.0, b=0.05) +def _constant(tensor): + return nn.init.constant_(tensor, 0.05) def dist_print(msg, src=None, end="\n", error=False): @@ -512,7 +507,7 @@ def test_linear(): kwargs_list = [ {}, {"bias": False}, - {"init_method": _init_uniform}, + {"init_method": _constant}, {"fuse_wgrad_accumulation": True}, {"return_bias": True}, {"params_dtype": torch.float16}, @@ -688,7 +683,7 @@ def test_layernorm_linear(): kwargs_list = [ {}, {"bias": False}, - {"init_method": _init_uniform}, + {"init_method": _constant}, {"fuse_wgrad_accumulation": True}, {"return_bias": True}, {"params_dtype": torch.float16}, @@ -792,8 +787,8 @@ def _test_layernorm_mlp(set_parallel_mode=None, sequence_parallel=False, **kwarg def test_layernorm_mlp(): kwargs_list = [ {}, - {"init_method": _init_uniform}, - {"output_layer_init_method": _init_uniform}, + {"init_method": _constant}, + {"output_layer_init_method": _constant}, {"normalization": "RMSNorm"}, {"zero_centered_gamma": True}, {"bias": False}, @@ -882,8 +877,8 @@ def test_transformer_layer(): kwargs_list = [ {}, {"num_gqa_groups": 4}, - {"init_method": _init_uniform}, - {"output_layer_init_method": _init_uniform}, + {"init_method": _constant}, + {"output_layer_init_method": _constant}, {"apply_residual_connection_post_layernorm": True}, {"output_layernorm": True}, {"parallel_attention_mlp": True},