Expose use_fast_acc for fp8 mma

xiaoqiqi177 · xiaoqiqi177 · commit 0d172bfc3c46 · 2026-04-29T10:45:34.000-07:00
Signed-off-by: Qiqi Xiao <qiqix@nvidia.com>
diff --git a/changelog.d/mma-fast-acc.md b/changelog.d/mma-fast-acc.md
@@ -0,0 +1,5 @@
+<!--- SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
+<!--- SPDX-License-Identifier: Apache-2.0 -->
+
+- Add `use_fast_acc` keyword argument to `ct.mma()` to enable fast accumulation
+  mode for fp8 inputs (`float8_e4m3fn`, `float8_e5m2`) on Hopper GPUs.
diff --git a/src/cuda/tile/_ir/ops.py b/src/cuda/tile/_ir/ops.py
@@ -18,7 +18,7 @@
 from cuda.tile import RoundingMode, MemoryOrder, MemoryScope
 from cuda.tile._mutex import tile_mutex
 from cuda.tile._exception import TileTypeError, TileSyntaxError, TileError, \
-    TileStaticAssertionError, TileStaticEvalError, TileValueError
+    TileStaticAssertionError, TileStaticEvalError, TileValueError, TileUnsupportedFeatureError
 from cuda.tile._ir.ir import (
     Operation, Var, Loc, Block,
     add_operation, Builder,
@@ -3224,6 +3224,7 @@ def _matmul_broadcast_shape(x_shape: _TileShape, y_shape: _TileShape) -> \
 
 @dataclass(eq=False)
 class TileMma(Operation, opcode="tile_mma"):
+    use_fast_acc: bool = attribute(default=False)
     x: Var = operand()
     y: Var = operand()
     acc: Var = operand()
@@ -3243,13 +3244,13 @@ def generate_bytecode(self, ctx: BytecodeContext) -> bc.Value:
             return bc.encode_MmaIOp(ctx.builder, res_typeid, x_value, y_value,
                                     acc_value, signedness_lhs, signedness_rhs)
         else:
-            # TODO: consider expose fast_acc
             return bc.encode_MmaFOp(ctx.builder, res_typeid, x_value, y_value,
-                                    acc_value, fast_acc=False)
+                                    acc_value, fast_acc=self.use_fast_acc)
 
 
 @impl(ct.mma)
-def mma_impl(x: Var, y: Var, acc: Var) -> Var:
+def mma_impl(x: Var, y: Var, acc: Var, use_fast_acc: Var) -> Var:
+    use_fast_acc = require_constant_bool(use_fast_acc)
     x_tile_type = require_tile_type(x)
     y_tile_type = require_tile_type(y)
     acc_tile_type = require_tile_type(acc)
@@ -3263,10 +3264,21 @@ def mma_impl(x: Var, y: Var, acc: Var) -> Var:
     x_shape, y_shape, _, output_shape = _matmul_broadcast_shape(x_shape_orig, y_shape_orig)
     if acc_shape_orig != output_shape:
         raise TileTypeError(f'Expect acc shape to be {output_shape}, got {acc_shape_orig}')
+    if use_fast_acc:
+        if x_tile_type.dtype not in (datatype.float8_e4m3fn, datatype.float8_e5m2):
+            raise TileTypeError(
+                f'use_fast_acc is only supported for fp8 input dtypes '
+                f'(float8_e4m3fn, float8_e5m2), got {x_tile_type.dtype}')
+        cur_version = Builder.get_current().ir_ctx.tileiras_version
+        if cur_version < BytecodeVersion.V_13_3:
+            raise TileUnsupportedFeatureError(
+                f'use_fast_acc requires tileiras '
+                f'{BytecodeVersion.V_13_3.as_string()} or later. '
+                f'Current version is {cur_version.as_string()}.')
     datatype._resolve_mma_supported_dtype(x_tile_type.dtype, y_tile_type.dtype, acc_tile_type.dtype)
     x = _promote_and_broadcast_to(x, TileTy(x_tile_type.dtype, x_shape))
     y = _promote_and_broadcast_to(y, TileTy(y_tile_type.dtype, y_shape))
-    return add_operation(TileMma, acc_tile_type, x=x, y=y, acc=acc)
+    return add_operation(TileMma, acc_tile_type, use_fast_acc=use_fast_acc, x=x, y=y, acc=acc)
 
 
 @impl(ct.matmul)
diff --git a/src/cuda/tile/_stub.py b/src/cuda/tile/_stub.py
@@ -1701,7 +1701,7 @@ def zeros(shape, dtype) -> Tile:
 
 
 @stub
-def mma(x, y, /, acc) -> Tile:
+def mma(x, y, /, acc, *, use_fast_acc: bool = False) -> Tile:
     """Matrix multiply-accumulate.
 
     Computes ``(x @ y) + acc`` as a single operation
@@ -1712,6 +1712,11 @@ def mma(x, y, /, acc) -> Tile:
         x (Tile): LHS of the mma, 2D or 3D.
         y (Tile): RHS of the mma, 2D or 3D.
         acc (Tile): Accumulator of mma.
+        use_fast_acc (bool): Enable fast accumulation mode, which trades accumulator
+            precision for throughput. Requires fp8 input dtypes
+            (``float8_e4m3fn`` or ``float8_e5m2``). Currently only has an effect on
+            Hopper GPUs; it is silently ignored on other architectures.
+            Default: ``False``. Available since CTK 13.3.
 
     Supported datatypes:
 
diff --git a/test/test_mma.py b/test/test_mma.py
@@ -11,7 +11,8 @@
 from util import (
     assert_close, assert_equal, require_hopper_or_newer, torch_to_tf32, is_ampere_or_ada
 )
-from conftest import dtype_id
+from conftest import dtype_id, get_tileiras_version
+from cuda.tile._bytecode.version import BytecodeVersion
 from cuda.tile._exception import TileTypeError, TileUnsupportedFeatureError
 
 
@@ -21,11 +22,12 @@
 def mma_kernel(A, B, C,
                tm: ct.Constant[int],
                tn: ct.Constant[int],
-               tk: ct.Constant[int]):
+               tk: ct.Constant[int],
+               use_fast_acc: ct.Constant[bool]):
     tx = ct.load(A, index=(0, 0), shape=(tm, tk))
     ty = ct.load(B, index=(0, 0), shape=(tk, tn))
     acc = ct.load(C, index=(0, 0), shape=(tm, tn))
-    acc = ct.mma(tx, ty, acc)
+    acc = ct.mma(tx, ty, acc, use_fast_acc=use_fast_acc)
     ct.store(C, index=(0, 0), tile=acc)
 
 
@@ -110,32 +112,56 @@ def test_mma_regular_float(tile_size, case):
     C = torch.ones((m, n), dtype=case.acc_dtype, device="cuda")
     ref = torch.mm(A, B, out_dtype=C.dtype) + C
     ct.launch(torch.cuda.current_stream(), (1,), mma_kernel,
-              (A, B, C, m, n, k))
+              (A, B, C, m, n, k, False))
     atol, rtol = get_tolerance(A.dtype)
     assert_close(C, ref, atol=atol, rtol=rtol)
 
 
+@ct.kernel
+def mma_fast_acc_kernel(A, B, C,
+                        tm: ct.Constant[int],
+                        tn: ct.Constant[int],
+                        tk: ct.Constant[int]):
+    tx = ct.load(A, index=(0, 0), shape=(tm, tk))
+    ty = ct.load(B, index=(0, 0), shape=(tk, tn))
+    acc = ct.load(C, index=(0, 0), shape=(tm, tn))
+    acc = ct.mma(tx, ty, acc, use_fast_acc=True)
+    ct.store(C, index=(0, 0), tile=acc)
+
+
 @require_hopper_or_newer()
 @pytest.mark.parametrize("tile_size", [(16, 16, 16)])
 @pytest.mark.parametrize("case", fp8_cases, ids=str)
-def test_mma_fp8(tile_size, case):
+@pytest.mark.parametrize("use_fast_acc", [True, False])
+def test_mma_fp8(tile_size, case, use_fast_acc):
+    if use_fast_acc and get_tileiras_version() < BytecodeVersion.V_13_3:
+        pytest.skip("use_fast_acc requires tileiras 13.3")
     m, n, k = tile_size
     A = torch.randn((m, k), dtype=torch.float32, device="cuda").to(case.dtype)
     B = torch.randn((n, k), dtype=torch.float32, device="cuda").to(case.dtype)
     C = torch.ones((m, n), dtype=case.acc_dtype, device="cuda")
     scale = torch.tensor([1.0], dtype=torch.float32, device="cuda")
     try:
-        ref = torch._scaled_mm(A, B.T, scale, scale, out_dtype=C.dtype) + C
+        ref = torch._scaled_mm(A, B.T, scale, scale, out_dtype=C.dtype,
+                               use_fast_accum=use_fast_acc) + C
     except (RuntimeError, ValueError) as e:
         assert 'Multiplication of two Float8_e5m2 matrices is not supported' in str(e)
         ref = None
     ct.launch(torch.cuda.current_stream(), (1,), mma_kernel,
-              (A, B.T, C, m, n, k))
+              (A, B.T, C, m, n, k, use_fast_acc))
     if ref is not None:
         atol, rtol = get_tolerance(A.dtype)
         assert_close(C, ref, atol=atol, rtol=rtol)
 
 
+def test_mma_fast_acc_non_fp8_error():
+    A = torch.randn((2, 4), dtype=torch.float16, device="cuda")
+    B = torch.randn((4, 2), dtype=torch.float16, device="cuda")
+    C = torch.zeros((2, 2), dtype=torch.float16, device="cuda")
+    with pytest.raises(TileTypeError, match="use_fast_acc is only supported for fp8"):
+        ct.launch(torch.cuda.current_stream(), (1,), mma_fast_acc_kernel, (A, B, C, 2, 2, 4))
+
+
 @pytest.mark.parametrize("tile_size", [(8, 2, 4)])
 def test_mma_tf32(tile_size):
     m, n, k = tile_size
@@ -163,7 +189,7 @@ def test_mma_int(tile_size, case):
     C = torch.ones((m, n), dtype=case.acc_dtype, device="cuda")
     ref = C + (A.to(torch.float32) @ B.to(torch.float32)).to(C.dtype)
     ct.launch(torch.cuda.current_stream(), (1,), mma_kernel,
-              (A, B, C, m, n, k))
+              (A, B, C, m, n, k, False))
     assert_equal(C, ref)
 
 
@@ -175,7 +201,7 @@ def test_mma_mixed_int_uint(tile_size):
     C = torch.ones((m, n), dtype=torch.int32, device="cuda")
     ref = C + (A.to(torch.float32) @ B.to(torch.float32)).to(C.dtype)
     ct.launch(torch.cuda.current_stream(), (1,), mma_kernel,
-              (A, B, C, m, n, k))
+              (A, B, C, m, n, k, False))
     assert_equal(C, ref)
 
 
@@ -229,7 +255,7 @@ def test_mma_dtype_error(case):
     with pytest.raises(TileTypeError, match=case.message):
         ct.launch(torch.cuda.current_stream(),
                   (1,), mma_kernel,
-                  (A, B, C, 2, 2, 2))
+                  (A, B, C, 2, 2, 2, False))
 
 # ================ ct.matmul =================
 
@@ -405,4 +431,4 @@ def test_ampere_fp8_error(dtype):
         with pytest.raises(TileUnsupportedFeatureError,
                            match="is not supported on sm_80"):
             ct.launch(torch.cuda.current_stream(), (1,), mma_kernel,
-                      (A, B, C, 16, 16, 16))
+                      (A, B, C, 16, 16, 16, False))